def firststep():
    """Aggregate bigram counts per year-month for speeches dated
    1792-06-10 through 1793-08-02, then persist the result as a pickle
    and a CSV.

    Reads raw_speeches.pickle from the working directory. Speech ids are
    assumed to start with an ISO date (YYYY-MM-DD…), so plain string
    comparison implements the date filter.
    """
    byyearmonth = {}
    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    for speechid in raw_speeches:
        fulldate = speechid[0:10]
        if (fulldate >= "1792-06-10") and (fulldate <= "1793-08-02"):
            speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)
            yearmonth = speechid[0:7]
            print(yearmonth)
            # Counter addition merges this speech's counts into its month bucket.
            if yearmonth in byyearmonth:
                byyearmonth[yearmonth] = byyearmonth[yearmonth] + speech_bigrams
            else:
                byyearmonth[yearmonth] = speech_bigrams
            # Release the per-speech counter to keep memory bounded.
            speech_bigrams = None
    print("here")
    with open("byyearmonth.pickle", "wb") as handle:
        pickle.dump(byyearmonth, handle, protocol=0)
    w = csv.writer(open("byyearmonth.csv", "w"))
    # BUG FIX: the original iterated over `byspeaker`, which is not defined
    # anywhere in this function (NameError at runtime); the aggregated data
    # lives in `byyearmonth`.
    for key, val in byyearmonth.items():
        w.writerow([key, val])
def aggregate_by_speaker(speakers_to_analyze, raw_speeches, speechid_to_speaker):
    """Concatenate each target speaker's speeches dated on/after
    1792-09-20 and pickle the speaker's ngram Counter to
    ../Speakers/<name>_ngrams.pickle, recording doc frequencies along the way.
    """
    speaker_names = set()  # kept from the original; never read
    num_speeches = 0
    # Accent-normalized names so they match the keys in speechid_to_speaker.
    speakers_to_consider = [remove_diacritic(s).decode('utf-8')
                            for s in speakers_to_analyze.index.values]
    for speaker_name in speakers_to_consider:
        print(speaker_name)
        speech = ""
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if date >= "1792-09-20" and speaker_name == speechid_to_speaker[identity]:
                num_speeches = num_speeches + 1
                # NOTE(review): this records doc frequencies for the text
                # accumulated *so far*, i.e. before the current speech is
                # appended — confirm this ordering is intentional.
                add_to_docfreq(dict(compute_ngrams(speech)))
                speech = speech + " " + raw_speeches[identity]
        speaker_ngrams = compute_ngrams(speech)
        pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
        with open(pickle_filename, 'wb') as handle:
            pickle.dump(speaker_ngrams, handle, protocol=0)
def create_tfidf_vectors(dataframe):
    """Attach an 'ngrams' column (bigram counts per concatenated speech)
    and a 'tfidf' column (tf-idf scores for those bigrams) to *dataframe*
    and return it.

    NOTE(review): relies on module-level `num_speeches` and `doc_freq` —
    presumably loaded elsewhere in this script; confirm before running
    this function standalone.
    """
    documents = dataframe['concat_speeches'].tolist()
    bigram_counts = [compute_ngrams(text, 2) for text in documents]
    dataframe['ngrams'] = pd.Series(bigram_counts).values
    tfidf_scores = [compute_tfidf(counts, num_speeches, doc_freq)
                    for counts in bigram_counts]
    dataframe['tfidf'] = pd.Series(tfidf_scores).values
    return dataframe
def calculate_chronology(raw_speeches, speechid_to_speaker, speaker_list, speakers_to_analyze, Girondins, Montagnards):
    """Build per-bigram chronology tables — one keyed by speech id, one by
    date — for speeches by the target speakers between 1792-09-20 and
    1793-06-02, then visualize and persist both (Excel + pickle)."""
    speaker_ngrams = {}                            # kept from original; never read
    speaker_distances = collections.defaultdict()  # kept from original; never read
    chronology = collections.defaultdict(dict)     # kept from original; never read
    speakers_to_consider = [remove_diacritic(s).decode('utf-8')
                            for s in speakers_to_analyze.index.values]
    speechid_rows = []
    date_rows = []
    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        speaker_name = speechid_to_speaker[identity]
        in_window = (date >= "1792-09-20") and (date <= "1793-06-02")
        if in_window and speaker_name in speakers_to_consider:
            party = speakers_to_analyze.loc[speaker_name, "Party"]
            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            # One record per bigram, in both keyings.
            for bigram in indv_speech_bigram:
                count = indv_speech_bigram[bigram]
                speechid_rows.append([str(bigram), speaker_name, identity, count, party])
                date_rows.append([str(bigram), speaker_name, date, count, party])
    chronology_speechid = pd.DataFrame(
        speechid_rows,
        columns=["Bigram", "Speaker Name", "Speechid", "Num occurrences", "Party"])
    chronology_date = pd.DataFrame(
        date_rows,
        columns=["Bigram", "Speaker Name", "Date", "Num occurrences", "Party"])
    make_visualizations(chronology_date)
    write_to_excel(chronology_speechid, "chronology_speechid.xlsx")
    write_to_excel(chronology_date, "chronology_date.xlsx")
    store_to_pickle(chronology_speechid, "chronology_speechid.pickle")
    store_to_pickle(chronology_date, "chronology_date.pickle")
def firststep():
    """Aggregate bigram counts per speaker — every speaker, not just the
    Girondin/Montagnard list — for speeches between 1792-09-20 and
    1793-06-02, then persist the result as a pickle and a CSV.

    Reads raw_speeches.pickle and speechid_to_speaker.pickle from the
    working directory; speech ids start with an ISO date, so string
    comparison implements the date filter.
    """
    byspeaker_allspeakers = {}
    speakers_to_consider = []
    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
    # Loaded for parity with the other aggregation steps; the resulting list
    # is not consulted below because every speaker is aggregated here.
    speakers_to_analyze = load_list("Girondins and Montagnards New Mod.xlsx")
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))
    for speechid in raw_speeches:
        fulldate = speechid[0:10]
        if (fulldate >= "1792-09-20") and (fulldate <= "1793-06-02"):
            speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)
            speaker = speechid_to_speaker[speechid]
            print(speaker)
            # Counter addition merges this speech's counts into the speaker bucket.
            if speaker in byspeaker_allspeakers:
                byspeaker_allspeakers[speaker] = byspeaker_allspeakers[speaker] + speech_bigrams
            else:
                byspeaker_allspeakers[speaker] = speech_bigrams
            # Release the per-speech counter to keep memory bounded.
            speech_bigrams = None
    with open("byspeaker_allspeakers.pickle", "wb") as handle:
        pickle.dump(byspeaker_allspeakers, handle, protocol=0)
    w = csv.writer(open("byspeaker_allspeakers.csv", "w"))
    # BUG FIX: the original iterated over `byspeaker`, a dict that stayed
    # empty in this function, so the CSV was always written empty; the
    # aggregated data lives in `byspeaker_allspeakers`.
    for key, val in byspeaker_allspeakers.items():
        w.writerow([key, val])
def aggregate_by_speaker():
    """Aggregate bigram counts per target speaker and word counts per
    speaker (all speakers) for speeches between 1792-09-20 and
    1793-06-02, then persist both aggregates."""
    byspeaker = {}
    speakerdict = {}  # kept from original; never read
    ngrams = {}       # kept from original; never read
    speakers_to_consider = []
    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
    speakers_to_analyze = load_list("Girondins and Montagnards New Mod Limit.xlsx")
    speaker_num_words = {}
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))
    for speechid in raw_speeches:
        fulldate = speechid[0:10]
        if (fulldate >= "1792-09-20") and (fulldate <= "1793-06-02"):
            num_words = len(raw_speeches[speechid].split())
            speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)
            speaker = speechid_to_speaker[speechid]
            # Word counts are tallied for *every* speaker in the window…
            if speaker in speaker_num_words:
                speaker_num_words[speaker] += num_words
            else:
                speaker_num_words[speaker] = num_words
            # …but bigram counts only for the speakers under analysis.
            if speaker in speakers_to_consider:
                if speaker in byspeaker:
                    byspeaker[speaker] = byspeaker[speaker] + speech_bigrams
                else:
                    byspeaker[speaker] = speech_bigrams
            speech_bigrams = None
    # NOTE(review): elsewhere in this file write_to_csv / store_to_pickle
    # are called with an explicit filename second argument — confirm these
    # one-argument calls match the helpers' signatures.
    write_to_csv(byspeaker)
    store_to_pickle(byspeaker)
    write_to_csv(speaker_num_words)
    store_to_pickle(speaker_num_words)
def aggregate_by_speaker(speakers_to_analyze, raw_speeches, speechid_to_speaker):
    """Accumulate each target speaker's ngram Counter over speeches dated
    on/after 1792-09-20, record which speech ids contain each bigram,
    count speeches per party, pickle each speaker's Counter, and write the
    bigram->speeches map to CSV."""
    speaker_names = set()  # kept from original; never read
    gir_num_speeches = 0
    mont_num_speeches = 0
    bigrams_speeches = collections.defaultdict()
    speakers_to_consider = [remove_diacritic(s).decode('utf-8')
                            for s in speakers_to_analyze.index.values]
    for speaker_name in speakers_to_consider:
        print(speaker_name)
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if date >= "1792-09-20" and speaker_name == speechid_to_speaker[identity]:
                indv_speech_ngram = compute_ngrams(raw_speeches[identity])
                # Track every speech id in which each bigram occurs.
                for bigram in indv_speech_ngram:
                    bigrams_speeches.setdefault(bigram, []).append(identity)
                # Per-party speech tally.
                if party == "Girondins":
                    gir_num_speeches += 1
                else:
                    mont_num_speeches += 1
                speech = speech + indv_speech_ngram
        pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
        with open(pickle_filename, 'wb') as handle:
            pickle.dump(speech, handle, protocol=0)
    with open('bigrams_to_speeches.csv', 'wb') as outfile:
        writer = csv.writer(outfile)
        for key, val in bigrams_speeches.items():
            writer.writerow([key, val])
    print(gir_num_speeches)
    print(mont_num_speeches)
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker):
    """For each target speaker, build a mapping
    bigram -> {speechid: occurrence count} over speeches between
    1792-09-20 and 1793-06-02, writing one pickle and one CSV per speaker."""
    speakers_to_consider = [remove_diacritic(s).decode('utf-8')
                            for s in speakers_to_analyze.index.values]
    speaker_bigram_frequencies = {}
    for speaker_name in speakers_to_consider:
        print(speaker_name)
        # Reset per speaker: the mapping is written out per speaker below.
        speaker_bigram_frequencies = {}
        party = speakers_to_analyze.loc[speaker_name, "Party"]  # kept; never read
        speech = Counter()  # kept from original; never read
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
                for bigram in indv_speech_bigram:
                    speaker_bigram_frequencies.setdefault(bigram, {})[identity] = indv_speech_bigram[bigram]
        filename_pickle = "" + speaker_name + "bigram_frequencies.pickle"
        with open(filename_pickle, 'wb') as handle:
            pickle.dump(speaker_bigram_frequencies, handle, protocol=0)
        filename_csv = "" + speaker_name + "bigram_frequencies.csv"
        w = csv.writer(open(filename_csv, "w"))
        for key, val in speaker_bigram_frequencies.items():
            w.writerow([key, val])
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker):
    """Split each target speaker's substantive speeches (>= 100 chars,
    dated 1792-09-20..1793-06-02) into train/test sets (3:1 per speaker),
    tally unigram/bigram document and total frequencies, and pickle
    everything for the classification script."""
    speaker_names = set()  # kept from original; never read
    # Total-frequency tallies across the train/test corpora.
    train_total_freq_unigram = {}
    test_total_freq_unigram = {}
    train_total_freq_bigram = {}
    test_total_freq_bigram = {}
    train_number_speeches = 0
    test_number_speeches = 0
    # Per-speech ngram Counters, keyed by speech id.
    train_speeches_bigram = collections.defaultdict(dict)
    test_speeches_bigram = collections.defaultdict(dict)
    train_speeches_unigram = collections.defaultdict(dict)
    test_speeches_unigram = collections.defaultdict(dict)
    bigrams_to_speeches = collections.defaultdict()  # kept from original; never read
    bigram_doc_freq = collections.defaultdict()
    unigram_doc_freq = collections.defaultdict()
    gir_num_speeches = 0  # kept from original; never read
    mont_num_speeches = 0  # kept from original; never read
    gir_docs = {}   # kept from original; never read
    mont_docs = {}  # kept from original; never read
    speakers_to_consider = []
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))
    for speaker_name in speakers_to_consider:
        print(speaker_name)
        party = speakers_to_analyze.loc[speaker_name, "Party"]  # kept; never read
        speech = Counter()  # kept from original; never read
        speech_num = 0
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
                # Only speeches with substance (>= 100 characters).
                if len(raw_speeches[identity]) >= 100:
                    indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
                    indv_speech_unigram = compute_ngrams(raw_speeches[identity], 1)
                    # Every 4th substantive speech per speaker goes to test.
                    if speech_num % 4 != 0:
                        train_number_speeches += 1
                        for bigram in indv_speech_bigram:
                            augment(bigram_doc_freq, bigram)
                            augment(train_total_freq_bigram, bigram)
                        for unigram in indv_speech_unigram:
                            augment(unigram_doc_freq, unigram)
                            augment(train_total_freq_unigram, unigram)
                        train_speeches_bigram[identity] = indv_speech_bigram
                        train_speeches_unigram[identity] = indv_speech_unigram
                    else:
                        # Test branch does not feed the doc-freq tallies.
                        test_number_speeches += 1
                        for bigram in indv_speech_bigram:
                            augment(test_total_freq_bigram, bigram)
                        for unigram in indv_speech_unigram:
                            augment(test_total_freq_unigram, unigram)
                        test_speeches_bigram[identity] = indv_speech_bigram
                        test_speeches_unigram[identity] = indv_speech_unigram
                    # NOTE(review): indentation reconstructed from a collapsed
                    # source line — this counts substantive speeches only; confirm.
                    speech_num += 1
    # Persist everything the classification script needs; large inputs are
    # released (rebound to None) as soon as they are stored.
    with open("speechid_to_speaker_store.pickle", 'wb') as handle:
        pickle.dump(speechid_to_speaker, handle, protocol=0)
    speechid_to_speaker = None
    with open("speakers_to_analyze_store.pickle", 'wb') as handle:
        pickle.dump(speakers_to_analyze, handle, protocol=0)
    speakers_to_analyze = None
    raw_speeches = None
    outputs = [
        (train_speeches_bigram, "train_speeches_bigram.pickle"),
        (train_speeches_unigram, "train_speeches_unigram.pickle"),
        (train_total_freq_bigram, "train_total_freq_bigram.pickle"),
        (train_total_freq_unigram, "train_total_freq_unigram.pickle"),
        (bigram_doc_freq, "bigram_doc_freq.pickle"),
        (unigram_doc_freq, "unigram_doc_freq.pickle"),
        (train_number_speeches, "train_number_speeches.pickle"),
        (test_speeches_bigram, "test_speeches_bigram.pickle"),
        (test_speeches_unigram, "test_speeches_unigram.pickle"),
        (test_total_freq_bigram, "test_total_freq_bigram.pickle"),
        (test_total_freq_unigram, "test_total_freq_unigram.pickle"),
    ]
    for obj, fname in outputs:
        with open(fname, 'wb') as handle:
            pickle.dump(obj, handle, protocol=0)
def calculate_chronology(raw_speeches, speechid_to_speaker, speaker_list, speakers_to_analyze, Girondins, Montagnards):
    """Build per-bigram chronology tables — one keyed by speech id, one by
    date — for speeches by the target speakers between 1792-09-20 and
    1793-06-02, visualize the date table, and pickle both."""
    speaker_ngrams = {}                            # kept from original; never read
    speaker_distances = collections.defaultdict()  # kept from original; never read
    chronology = collections.defaultdict(dict)     # kept from original; never read
    speakers_to_consider = [remove_diacritic(s).decode('utf-8')
                            for s in speakers_to_analyze.index.values]
    speechid_rows = []
    date_rows = []
    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        speaker_name = speechid_to_speaker[identity]
        in_window = (date >= "1792-09-20") and (date <= "1793-06-02")
        if in_window and speaker_name in speakers_to_consider:
            party = speakers_to_analyze.loc[speaker_name, "Party"]
            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            # One record per bigram, in both keyings.
            for bigram in indv_speech_bigram:
                count = indv_speech_bigram[bigram]
                speechid_rows.append([str(bigram), speaker_name, identity, count, party])
                date_rows.append([str(bigram), speaker_name, date, count, party])
    chronology_speechid = pd.DataFrame(
        speechid_rows,
        columns=["Bigram", "Speaker Name", "Speechid", "Num occurrences", "Party"])
    chronology_date = pd.DataFrame(
        date_rows,
        columns=["Bigram", "Speaker Name", "Date", "Num occurrences", "Party"])
    make_visualizations(chronology_date)
    with open("chronology_speechid.pickle", 'wb') as handle:
        pickle.dump(chronology_speechid, handle, protocol=0)
    with open("chronology_date.pickle", 'wb') as handle:
        pickle.dump(chronology_date, handle, protocol=0)
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker):
    """Split each target speaker's substantive speeches (>= 100 chars,
    dated 1792-09-20..1793-06-02) into train/test sets (3:1 per speaker),
    tally unigram/bigram document and total frequencies, and store
    everything via store_to_pickle for the classification script."""
    # Accent-normalized names so they match raw_speeches / speechid_to_speaker.
    speakers_to_consider = [remove_diacritic(s).decode('utf-8')
                            for s in speakers_to_analyze.index.values]
    # Total-frequency tallies across the train/test corpora.
    train_total_freq_unigram = {}
    test_total_freq_unigram = {}
    train_total_freq_bigram = {}
    test_total_freq_bigram = {}
    train_number_speeches = 0
    test_number_speeches = 0
    # Per-speech ngram Counters, keyed by speech id.
    train_speeches_bigram = collections.defaultdict(dict)
    test_speeches_bigram = collections.defaultdict(dict)
    train_speeches_unigram = collections.defaultdict(dict)
    test_speeches_unigram = collections.defaultdict(dict)
    bigrams_to_speeches = collections.defaultdict()  # kept from original; never read
    bigram_doc_freq = collections.defaultdict()
    unigram_doc_freq = collections.defaultdict()
    gir_num_speeches = 0  # kept from original; never read
    mont_num_speeches = 0  # kept from original; never read
    gir_docs = {}   # kept from original; never read
    mont_docs = {}  # kept from original; never read
    for speaker_name in speakers_to_consider:
        print(speaker_name)
        party = speakers_to_analyze.loc[speaker_name, "Party"]  # kept; never read
        speech = Counter()  # kept from original; never read
        # Number of this speaker's substantive speeches seen so far.
        speech_num = 0
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            # Only speeches in the window and from the speaker of interest.
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
                # Only speeches with substance (>= 100 characters).
                if len(raw_speeches[identity]) >= 100:
                    indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
                    indv_speech_unigram = compute_ngrams(raw_speeches[identity], 1)
                    # Every 4th substantive speech per speaker goes to test.
                    if speech_num % 4 != 0:
                        train_number_speeches += 1
                        for bigram in indv_speech_bigram:
                            augment(bigram_doc_freq, bigram)
                            augment(train_total_freq_bigram, bigram)
                        for unigram in indv_speech_unigram:
                            augment(unigram_doc_freq, unigram)
                            augment(train_total_freq_unigram, unigram)
                        train_speeches_bigram[identity] = indv_speech_bigram
                        train_speeches_unigram[identity] = indv_speech_unigram
                    else:
                        # Test branch does not feed the doc-freq tallies.
                        test_number_speeches += 1
                        for bigram in indv_speech_bigram:
                            augment(test_total_freq_bigram, bigram)
                        for unigram in indv_speech_unigram:
                            augment(test_total_freq_unigram, unigram)
                        test_speeches_bigram[identity] = indv_speech_bigram
                        test_speeches_unigram[identity] = indv_speech_unigram
                    # NOTE(review): indentation reconstructed from a collapsed
                    # source line — this counts substantive speeches only; confirm.
                    speech_num += 1
    # Persist everything needed for classification; large inputs are
    # released (rebound to None) to conserve memory.
    store_to_pickle(speakers_to_analyze, "speakers_to_analyze.pickle")
    speakers_to_analyze = None
    speechid_to_speaker = None
    raw_speeches = None
    for obj, fname in [
        (train_speeches_bigram, "train_speeches_bigram.pickle"),
        (train_speeches_unigram, "train_speeches_unigram.pickle"),
        (train_total_freq_bigram, "train_total_freq_bigram.pickle"),
        (train_total_freq_unigram, "train_total_freq_unigram.pickle"),
        (bigram_doc_freq, "bigram_doc_freq.pickle"),
        (unigram_doc_freq, "unigram_doc_freq.pickle"),
        (train_number_speeches, "train_number_speeches.pickle"),
        (test_speeches_bigram, "test_speeches_bigram.pickle"),
        (test_speeches_unigram, "test_speeches_unigram.pickle"),
        (test_total_freq_bigram, "test_total_freq_bigram.pickle"),
        (test_total_freq_unigram, "test_total_freq_unigram.pickle"),
    ]:
        store_to_pickle(obj, fname)
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards):
    """Aggregate bigram counts into the Girondins and Montagnards Counters
    for speeches between 1792-09-20 and 1793-06-02, track per-speaker
    speech/character tallies and bigram->speeches / bigram->speakers maps,
    compute count and tf-idf outputs, and persist everything."""
    speaker_num_speeches = {}
    speaker_char_count = {}
    # Accent-normalized names so they match raw_speeches / speechid_to_speaker.
    speakers_to_consider = [remove_diacritic(s).decode('utf-8')
                            for s in speakers_to_analyze.index.values]
    # bigram -> list of speech ids / set of speaker names containing it.
    bigrams_to_speeches = {}
    bigrams_to_speakers = {}
    # Number of documents each bigram appears in (for tf-idf).
    bigram_doc_freq = collections.defaultdict()
    gir_num_speeches = 0
    mont_num_speeches = 0
    gir_docs = {}
    mont_docs = {}
    for speaker_name in speakers_to_consider:
        print(speaker_name)
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()  # kept from original; its accumulation line is commented out
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
                # Per-speaker speech and character tallies (possible basis for
                # an analysis cutoff).
                augment(speaker_num_speeches, speaker_name)
                if speaker_name in speaker_char_count:
                    speaker_char_count[speaker_name] += len(raw_speeches[identity])
                else:
                    speaker_char_count[speaker_name] = len(raw_speeches[identity])
                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
                for bigram in indv_speech_bigram:
                    augment(bigram_doc_freq, bigram)
                    # Track which speeches and speakers contain each bigram.
                    bigrams_to_speeches.setdefault(bigram, []).append(identity)
                    bigrams_to_speakers.setdefault(bigram, set()).add(speaker_name)
                # Fold this speech into the speaker's party Counter.
                if party == "Girondins":
                    gir_num_speeches += 1
                    gir_docs = check_num_speakers(indv_speech_bigram, speaker_name, gir_docs)
                    # NOTE(review): Girondins/Montagnards are parameters, so a
                    # NameError should be impossible here; the fallback is kept
                    # verbatim from the original.
                    try:
                        Girondins = Girondins + indv_speech_bigram
                    except NameError:
                        Girondins = indv_speech_bigram
                else:
                    mont_num_speeches += 1
                    mont_docs = check_num_speakers(indv_speech_bigram, speaker_name, mont_docs)
                    try:
                        Montagnards = Montagnards + indv_speech_bigram
                    except NameError:
                        Montagnards = indv_speech_bigram
    # Raw party counts.
    store_to_pickle(Girondins, "Girondins.pickle")
    store_to_pickle(Montagnards, "Montagnards.pickle")
    # Aggregate per-bigram info across parties.
    bigram_aggregate_info(Girondins, Montagnards, bigrams_to_speakers, bigrams_to_speeches)
    num_speeches = 4479  # hard-coded corpus size used for tf-idf
    # Counts and tf-idf scores per party, exported for analysis in R.
    counts_and_tfidf(Girondins, Montagnards, gir_docs, mont_docs, num_speeches, bigram_doc_freq)
    # ---- everything below stores data to disk ----
    df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient="index")
    write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')
    df_bigrams_to_speakers = pd.DataFrame.from_dict(bigrams_to_speakers, orient="index")
    write_to_excel(df_bigrams_to_speakers, 'bigrams_to_speakers.xlsx')
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')
    store_to_pickle(bigrams_to_speakers, "bigrams_to_speakers.pickle")
    store_to_pickle(bigrams_to_speeches, "bigrams_to_speeches.pickle")
    store_to_pickle(gir_docs, "gir_docs.pickle")
    store_to_pickle(mont_docs, "mont_docs.pickle")
    store_to_pickle(speaker_num_speeches, "speaker_num_speeches.pickle")
    store_to_pickle(speaker_char_count, "speaker_char_count.pickle")
    store_to_pickle(bigram_doc_freq, "bigram_doc_freq.pickle")
    with open('gir_speeches.txt', 'w') as f:
        f.write('%d' % gir_num_speeches)
    with open('mont_speeches.txt', 'w') as f:
        f.write('%d' % mont_num_speeches)
    write_to_csv(speaker_num_speeches, "speaker_num_speeches.csv")
    write_to_csv(speaker_char_count, "speaker_char_count.csv")
    with open('num_speeches.txt', 'w') as f:
        f.write('%d' % num_speeches)
def aggregate(speakers_to_analyze_train, speakers_to_analyze_test, raw_speeches, speechid_to_speaker, Girondins, Montagnards):
    """Build train/test unigram+bigram tf-idf feature sets from separate
    train/test speaker lists, export the training matrix to Excel, and fit
    a logistic regression with 10-fold cross-validated accuracy."""
    speaker_names = set()  # kept from original; never read
    # Total-frequency tallies across the train/test corpora.
    train_total_freq_unigram = {}
    test_total_freq_unigram = {}
    train_total_freq_bigram = {}
    test_total_freq_bigram = {}
    train_number_speeches = 0
    test_number_speeches = 0
    # Per-speech ngram Counters, keyed by speech id.
    train_speeches_bigram = collections.defaultdict(dict)
    test_speeches_bigram = collections.defaultdict(dict)
    train_speeches_unigram = collections.defaultdict(dict)
    test_speeches_unigram = collections.defaultdict(dict)
    # Union of the train and test speaker lists, accent-normalized.
    speakers_to_consider = []
    for speaker in speakers_to_analyze_train.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))
    for speaker in speakers_to_analyze_test.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))
    for speaker_name in speakers_to_consider:
        print(speaker_name)
        if speaker_name in speakers_to_analyze_train.index.values:
            party = speakers_to_analyze_train.loc[speaker_name, "Party"]
        else:
            party = speakers_to_analyze_test.loc[speaker_name, "Party"]
        speech = Counter()  # kept from original; its accumulation line is commented out
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
                indv_speech_unigram = compute_ngrams(raw_speeches[identity], 1)
                # NOTE(review): bigram_doc_freq / unigram_doc_freq are never
                # initialized in this function — presumably module-level
                # globals; confirm, otherwise this raises NameError.
                if speaker_name in speakers_to_analyze_train.index.values:
                    train_number_speeches += 1
                    for bigram in indv_speech_bigram:
                        if bigram in bigram_doc_freq:
                            bigram_doc_freq[bigram] = bigram_doc_freq[bigram] + 1
                        else:
                            bigram_doc_freq[bigram] = 1
                        if bigram in train_total_freq_bigram:
                            train_total_freq_bigram[bigram] += 1
                        else:
                            train_total_freq_bigram[bigram] = 1
                    for unigram in indv_speech_unigram:
                        if unigram in unigram_doc_freq:
                            unigram_doc_freq[unigram] = unigram_doc_freq[unigram] + 1
                        else:
                            unigram_doc_freq[unigram] = 1
                        if unigram in train_total_freq_unigram:
                            train_total_freq_unigram[unigram] += 1
                        else:
                            train_total_freq_unigram[unigram] = 1
                    train_speeches_bigram[identity] = indv_speech_bigram
                    train_speeches_unigram[identity] = indv_speech_unigram
                else:
                    test_number_speeches += 1
                    for bigram in indv_speech_bigram:
                        if bigram in bigram_doc_freq:
                            bigram_doc_freq[bigram] = bigram_doc_freq[bigram] + 1
                        else:
                            bigram_doc_freq[bigram] = 1
                        if bigram in test_total_freq_bigram:
                            test_total_freq_bigram[bigram] += 1
                        else:
                            test_total_freq_bigram[bigram] = 1
                    for unigram in indv_speech_unigram:
                        if unigram in unigram_doc_freq:
                            unigram_doc_freq[unigram] = unigram_doc_freq[unigram] + 1
                        else:
                            unigram_doc_freq[unigram] = 1
                        if unigram in test_total_freq_unigram:
                            test_total_freq_unigram[unigram] += 1
                        else:
                            test_total_freq_unigram[unigram] = 1
                    test_speeches_bigram[identity] = indv_speech_bigram
                    test_speeches_unigram[identity] = indv_speech_unigram
    # Assemble the training matrix: tf-idf of bigrams (total freq >= 10)
    # merged with unigrams (total freq >= 55).
    # Labels: 0 = Girondins, 1 = Montagnards.
    classification = []
    training_set = []
    for speechid in train_speeches_bigram:
        speaker = speechid_to_speaker[speechid]
        if speakers_to_analyze_train.loc[speaker, "Party"] == "Girondins":
            classification.append(0)
        else:
            classification.append(1)
        bigram_input = {k: v for k, v in train_speeches_bigram[speechid].items()
                        if (train_total_freq_bigram[k] >= 10)}
        unigram_input = {k: v for k, v in train_speeches_unigram[speechid].items()
                         if (train_total_freq_unigram[k] >= 55)}
        bigram_scores = compute_tfidf(bigram_input, train_number_speeches, "bigram")
        unigram_scores = compute_tfidf(unigram_input, train_number_speeches, "unigram")
        merge_scores = bigram_scores.copy()
        merge_scores.update(unigram_scores)
        training_set.append(merge_scores)
    train = pd.DataFrame(training_set)
    train = train.fillna(0)
    print(train)
    # NOTE(review): this replaces the matrix assembled above with
    # data_clean's output, while `classification` used below still comes
    # from the loop above — confirm that is intended.
    train, train_classification = data_clean(speechid_to_speaker, speakers_to_analyze_train, train_speeches_bigram, train_speeches_unigram, train_total_freq_bigram, train_total_freq_unigram, train_number_speeches)
    writer = pd.ExcelWriter("training_set.xlsx")
    train.to_excel(writer, 'Sheet1')
    writer.save()
    # Fit on the full training matrix, then report 10-fold CV accuracy.
    logreg = LogisticRegression()
    logreg.fit(train.get_values(), classification)
    predicted = cross_validation.cross_val_predict(LogisticRegression(), train.get_values(), classification, cv=10)
    print(metrics.accuracy_score(classification, predicted))
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards, Plein): speaker_names = set() speaker_num_speeches = {} speaker_char_count = {} speakers_to_consider = [] bigrams_to_speeches = {} bigrams_to_speakers = {} bigram_doc_freq = collections.defaultdict() gir_num_speeches = 0 mont_num_speeches = 0 gir_docs = {} mont_docs = {} plein_docs = {} for speaker in speakers_to_analyze.index.values: speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8')) for speaker_name in speakers_to_consider: print speaker_name party = speakers_to_analyze.loc[speaker_name, "Party"] speech = Counter() for identity in raw_speeches: date = re.findall(date_regex, str(identity))[0] if (date >= "1792-09-20") and (date <= "1793-06-02") and ( speaker_name == speechid_to_speaker[identity]): # Keeps track of the number of speeches per speaker as well as the number of characters spoken by each speaker # To potentially establish a cutoff for analysis purposes augment(speaker_num_speeches, speaker_name) if speaker_name in speaker_char_count: speaker_char_count[speaker_name] += len( raw_speeches[identity]) else: speaker_char_count[speaker_name] = len( raw_speeches[identity]) indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2) for bigram in indv_speech_bigram: augment(bigram_doc_freq, bigram) # Maintains a list of speeches in which given bigrams are spoken in if bigram in bigrams_to_speeches: bigrams_to_speeches[bigram].append(identity) else: bigrams_to_speeches[bigram] = [] bigrams_to_speeches[bigram].append(identity) if bigram in bigrams_to_speakers: bigrams_to_speakers[bigram].add(speaker_name) else: bigrams_to_speakers[bigram] = set() bigrams_to_speakers[bigram].add(speaker_name) # Augments the relevant variables according to the party the speaker belongs to if party == "Girondins": gir_num_speeches += 1 gir_docs = check_num_speakers(indv_speech_bigram, speaker_name, gir_docs) try: Girondins = Girondins + indv_speech_bigram except 
NameError: Girondins = indv_speech_bigram else: mont_num_speeches += 1 mont_docs = check_num_speakers(indv_speech_bigram, speaker_name, mont_docs) try: Montagnards = Montagnards + indv_speech_bigram except NameError: Montagnards = indv_speech_bigram #speech = speech + indv_speech_bigram # # Stores the bigram Counter object for each individual speaker # """pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle" # with open(pickle_filename, 'wb') as handle: # pickle.dump(speech, handle, protocol = 0)""" # Stores the bigrams_to_speeches document in Excel df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient="index") write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx') pickle_filename = "bigrams_to_speakers.pickle" with open(pickle_filename, 'wb') as handle: pickle.dump(bigrams_to_speakers, handle, protocol=0) pickle_filename = "bigrams_to_speeches.pickle" with open(pickle_filename, 'wb') as handle: pickle.dump(bigrams_to_speeches, handle, protocol=0) pickle_filename = "bigrams_to_speeches.pickle" with open(pickle_filename, 'wb') as handle: pickle.dump(bigrams_to_speeches, handle, protocol=0) pickle_filename = "gir_docs.pickle" with open(pickle_filename, 'wb') as handle: pickle.dump(gir_docs, handle, protocol=0) pickle_filename = "mont_docs.pickle" with open(pickle_filename, 'wb') as handle: pickle.dump(mont_docs, handle, protocol=0) # bigrams_to_speakers = pickle.load(open("bigrams_to_speakers.pickle", "rb")) # bigrams_to_speeches = pickle.load(open("bigrams_to_speeches.pickle", "rb")) # gir_docs = pickle.load(open("gir_docs.pickle", "rb")) # mont_docs = pickle.load(open("mont_docs.pickle", "rb")) # Girondins = pickle.load(open("Girondins_withlimit.pickle", "rb")) # Montagnards = pickle.load(open("Montagnards_withlimit.pickle", "rb")) bigram_num_speakers = [] bigram_num_speeches = [] bigram_total_freq = [] bg_speeches = {} bigrams = [] speeches = [] speakers = [] for bigram in bigrams_to_speeches: if (Girondins[bigram] >= 
10) or (Montagnards[bigram] >= 10): bigram_num_speakers.append(len(bigrams_to_speakers[bigram])) bigram_num_speeches.append(len(bigrams_to_speeches[bigram])) bigram_total_freq.append(Girondins[bigram] + Montagnards[bigram]) bigrams.append(str(bigram)) speeches.append(str(bigrams_to_speeches[bigram])) speakers.append(str(bigrams_to_speakers[bigram])) bg_num_speakers = pd.DataFrame(bigram_num_speakers, columns=['Num Speakers']) bg_num_speeches = pd.DataFrame(bigram_num_speeches, columns=['Num Speeches']) bg_total_freq = pd.DataFrame(bigram_total_freq, columns=['Total count']) bgs = pd.DataFrame(bigrams, columns=["Bigram"]) speech = pd.DataFrame(speeches, columns=["Speechids"]) speaker = pd.DataFrame(speakers, columns=["Speakers"]) bigram_info = pd.DataFrame() bigram_info = pd.concat([ bgs, speech, speaker, bg_num_speeches, bg_num_speakers, bg_total_freq ], axis=1) writer = pd.ExcelWriter("bigram_info.xlsx") bigram_info.to_excel(writer, 'Sheet1') writer.save() w = csv.writer(open("bigrams_to_speeches_noplein.csv", "w")) for key, val in bigrams_to_speeches.items(): w.writerow([key, val]) bigrams_to_speakers_noplein_sorted = sorted(bigrams_to_speakers.items(), key=lambda x: len(x[1]), reverse=True) w = csv.writer(open("bigrams_to_speakers_noplein_sorted.csv", "w")) for item in bigrams_to_speakers_noplein_sorted: w.writerow([item[0], item[1]]) # Computes the tf_idf scores for each bigram and for both the Girondins and Montaganards vectors # num_speeches = 4479 # bigram_doc_freq = pickle.load(open("bigram_doc_freq_noplein_withlimit.pickle", 'rb')) with open('gir_speeches_noplein_withlimit.txt', 'w') as f: f.write('%d' % gir_num_speeches) with open('mont_speeches_noplein_withlimit.txt', 'w') as f: f.write('%d' % mont_num_speeches) print num_speeches with open('speaker_num_speeches_withlimit.pickle', 'wb') as handle: pickle.dump(speaker_num_speeches, handle, protocol=0) with open('speaker_char_count_withlimit.pickle', 'wb') as handle: pickle.dump(speaker_num_speeches, 
handle, protocol=0) w = csv.writer(open("speaker_num_speeches_withlimit.csv", "w")) for key, val in speaker_num_speeches.items(): w.writerow([key, val]) w = csv.writer(open("speaker_char_count_withlimit.csv", "w")) for key, val in speaker_char_count.items(): w.writerow([key, val]) # Write the number of speeches and doc_frequency to memory for use in further analysis with open('num_speeches_noplein_withlimit.txt', 'w') as f: f.write('%d' % num_speeches) df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index") write_to_excel(df_doc_freq, 'doc_freq.xlsx') with open("bigram_doc_freq_noplein_withlimit.pickle", 'wb') as handle: pickle.dump(bigram_doc_freq, handle, protocol=0) # # Girondins = {k:v for k,v in Girondins.items() if (v >= 10)} #and (len(gir_docs[k]) > 1)} # # Montagnards = {k:v for k,v in Montagnards.items() if (v >= 10)} #and (len(mont_docs[k]) > 1)} # with open("Girondins_withlimit.pickle", 'wb') as handle: # pickle.dump(Girondins, handle, protocol = 0) # with open("Montagnards_withlimit.pickle", 'wb') as handle: # pickle.dump(Montagnards, handle, protocol = 0) # gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq) # mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq) # """with open("gir_tfidf.pickle", 'wb') as handle: # pickle.dump(gir_tfidf, handle, protocol = 0) # with open("mont_tfidf.pickle", 'wb') as handle: # pickle.dump(mont_tfidf, handle, protocol = 0)""" # # Computes the distance between the tf_idf vectors # #compute_distance(gir_tfidf, mont_tfidf) # # Stores the tf_idf vectors # df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient = "index") # #df_gir_tfidf.columns = ['Bigrams', 'tfidf'] # write_to_excel(df_gir_tfidf, 'gir_tfidf_withlimit.xlsx') # df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient = "index") # #df_mont_tfidf.columns = ['Bigrams', 'tfidf'] # write_to_excel(df_mont_tfidf, 'mont_tfidf_withlimit.xlsx') # df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf]) # 
df_tfidf_combined = df_tfidf_combined.transpose() # df_tfidf_combined.columns = ["Girondins", "Montagnards"] # write_to_excel(df_tfidf_combined, 'combined_tfidf_withlimit.xlsx') # Constrains the analysis of Girondins and Montagnards frequencies if the frequency more 3 and optionally if in a certain number of speeches # print gir_docs Girondins = {k: v for k, v in Girondins.items() if (v >= 10)} #and (len(gir_docs[k]) > 1)} df_girondins = pd.DataFrame.from_dict(Girondins, orient="index") write_to_excel(df_girondins, "Girondins_counts_withlimit.xlsx") Montagnards = {k: v for k, v in Montagnards.items() if (v >= 10)} #and (len(mont_docs[k]) > 1)} df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index") write_to_excel(df_montagnards, "Montagnards_counts_withlimit.xlsx") gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq) mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq) # # Normalizes the vectors and computes the distance between them # #normalized = normalize_dicts(Girondins, Montagnards) # #compute_distance(normalized[0], normalized[1]) # Stores the Girondins and Montagnards frequency vectors in the same document df_combined = pd.DataFrame([Girondins, Montagnards]) df_combined = df_combined.transpose() df_combined.columns = ["Girondins", "Montagnards"] write_to_excel(df_combined, 'combined_frequency_withlimit.xlsx') df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf]) df_tfidf_combined = df_tfidf_combined.transpose() df_tfidf_combined.columns = ["Girondins", "Montagnards"] write_to_excel(df_tfidf_combined, 'combined_tfidf_withlimit.xlsx')
def firststep():
    """Build a per-speech DataFrame and dispatch grouped aggregations.

    Loads the pickled raw speeches, computes per-speech bigram Counters,
    assembles a DataFrame with one row per speech (text, year-month, full
    date, speaker, speechid), writes it to raw_data.xlsx, then hands the
    DataFrame plus the bigram map to the groupby_* aggregation helpers.
    No return value; everything is emitted via file side effects.
    """
    year_month = []
    full_date = []
    speaker = []
    ngrams = {}
    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    dataframe = pd.DataFrame.from_dict(raw_speeches, orient="index")
    dataframe.columns = ['Speeches']
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
    # FIX: use a context manager -- the old code shadowed the `file` builtin
    # and never closed the handle
    with open('num_speeches.txt', 'r') as f:
        num_speeches = int(f.read())
    # NOTE(review): num_speeches and doc_freq are loaded but not used below;
    # the loads are kept so missing-input errors still surface here
    doc_freq = pickle.load(open("bigram_doc_freq.pickle", "rb"))

    for speechid in raw_speeches:
        speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)
        ngrams[speechid] = speech_bigrams
        # speechid is assumed to start with YYYY-MM-DD -- TODO confirm
        year_month.append(speechid[0:7])
        full_date.append(speechid[0:10])
        speaker.append(speechid_to_speaker[speechid])

    dataframe['Year-Month'] = pd.Series(year_month).values
    dataframe['Full Date'] = pd.Series(full_date).values
    dataframe['Speaker'] = pd.Series(speaker).values
    dataframe['Speechid'] = dataframe.index
    write_to_excel(dataframe, "raw_data.xlsx")

    # Aggregate the per-speech bigrams along each grouping dimension
    groupby_yearmonth(dataframe, ngrams)
    groupby_date(dataframe, ngrams)
    groupby_speaker(dataframe, ngrams)
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards, Plein): speaker_names = set() speaker_num_speeches = {} speaker_char_count = {} speakers_to_consider = [] bigrams_to_speeches = collections.defaultdict() bigram_doc_freq = collections.defaultdict() gir_num_speeches = 0 mont_num_speeches = 0 plein_num_speeches = 0 gir_docs = {} mont_docs = {} plein_docs = {} for speaker in speakers_to_analyze.index.values: speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8')) for identity in raw_speeches: date = re.findall(date_regex, str(identity))[0] if (date >= "1792-09-20") and (date <= "1793-06-02"): # Keeps track of the number of speeches per speaker as well as the number of characters spoken by each speaker # To potentially establish a cutoff for analysis purposes speaker_name = speechid_to_speaker[identity] party = "" if speaker_name in speakers_to_consider: party = speakers_to_analyze.loc[speaker_name, "Party"] else: party = "Plein" augment(speaker_num_speeches, speaker_name) if speaker_name in speaker_char_count: speaker_char_count[speaker_name] += len(raw_speeches[identity]) else: speaker_char_count[speaker_name] = len(raw_speeches[identity]) indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2) for bigram in indv_speech_bigram: augment(bigram_doc_freq, bigram) # Maintains a list of speeches in which given bigrams are spoken in if bigram in bigrams_to_speeches: bigrams_to_speeches[bigram].append(identity) else: bigrams_to_speeches[bigram] = [] bigrams_to_speeches[bigram].append(identity) # Augments the relevant variables according to the party the speaker belongs to if party == "Girondins": gir_num_speeches += 1 gir_docs = check_num_speakers(indv_speech_bigram, speaker_name, gir_docs) try: Girondins = Girondins + indv_speech_bigram except NameError: Girondins = indv_speech_bigram elif party == "Montagnards": mont_num_speeches += 1 mont_docs = check_num_speakers(indv_speech_bigram, speaker_name, 
mont_docs) try: Montagnards = Montagnards + indv_speech_bigram except NameError: Montagnards = indv_speech_bigram # Creates a Plein category that is neither Girondins or Montagnards to better understand speakers that are not distinctly one # or the other else: plein_num_speeches += 1 plein_docs = check_num_speakers(indv_speech_bigram, speaker_name, plein_docs) try: Plein = Plein + indv_speech_bigram except NameError: Plein = indv_speech_bigram #speech = speech + indv_speech_bigram # Stores the bigram Counter object for each individual speaker """pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle" with open(pickle_filename, 'wb') as handle: pickle.dump(speech, handle, protocol = 0)""" """# Stores the bigrams_to_speeches document in Excel df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient = "index") write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')""" # Computes the tf_idf scores for each bigram and for both the Girondins and Montaganards vectors num_speeches = gir_num_speeches + mont_num_speeches + plein_num_speeches print num_speeches with open('speaker_num_speeches_withplein.pickle', 'wb') as handle: pickle.dump(speaker_num_speeches, handle, protocol=0) with open('speaker_char_count_withplein.pickle', 'wb') as handle: pickle.dump(speaker_num_speeches, handle, protocol=0) w = csv.writer(open("speaker_num_speeches_withplein.csv", "w")) for key, val in speaker_num_speeches.items(): w.writerow([key, val]) w = csv.writer(open("speaker_char_count_withplein.csv", "w")) for key, val in speaker_char_count.items(): w.writerow([key, val]) # Write the number of speeches and doc_frequency to memory for use in further analysis with open('num_speeches_withplein.txt', 'w') as f: f.write('%d' % num_speeches) df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index") write_to_excel(df_doc_freq, 'doc_freq.xlsx') with open("bigram_doc_freq_withplein.pickle", 'wb') as handle: pickle.dump(bigram_doc_freq, handle, 
protocol=0) with open("Girondins_withplein.pickle", 'wb') as handle: pickle.dump(Girondins, handle, protocol=0) with open("Montagnards_withplein.pickle", 'wb') as handle: pickle.dump(Montagnards, handle, protocol=0) with open("Plein.pickle", 'wb') as handle: pickle.dump(Plein, handle, protocol=0) gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq) mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq) plein_tfidf = compute_tfidf(Plein, num_speeches, bigram_doc_freq) """with open("gir_tfidf.pickle", 'wb') as handle: pickle.dump(gir_tfidf, handle, protocol = 0) with open("mont_tfidf.pickle", 'wb') as handle: pickle.dump(mont_tfidf, handle, protocol = 0)""" # Computes the distance between the tf_idf vectors #compute_distance(gir_tfidf, mont_tfidf) # Stores the tf_idf vectors df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient="index") #df_gir_tfidf.columns = ['Bigrams', 'tfidf'] write_to_excel(df_gir_tfidf, 'gir_tfidf_withplein.xlsx') df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient="index") #df_mont_tfidf.columns = ['Bigrams', 'tfidf'] write_to_excel(df_mont_tfidf, 'mont_tfidf_withplein.xlsx') df_plein_tfidf = pd.DataFrame.from_dict(plein_tfidf, orient="index") #df_mont_tfidf.columns = ['Bigrams', 'tfidf'] write_to_excel(df_plein_tfidf, 'plein_tfidf.xlsx') df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf]) df_tfidf_combined = df_tfidf_combined.transpose() df_tfidf_combined.columns = ["Girondins", "Montagnards"] write_to_excel(df_tfidf_combined, 'combined_tfidf_withplein.xlsx') # Constrains the analysis of Girondins and Montagnards frequencies if the frequency more 3 and optionally if in a certain number of speeches Girondins = {k: v for k, v in Girondins.items() if (v >= 3)} #and (len(gir_docs[k]) > 1)} df_girondins = pd.DataFrame.from_dict(Girondins, orient="index") write_to_excel(df_girondins, "Girondins_counts_withplein.xlsx") Montagnards = {k: v for k, v in Montagnards.items() if (v >= 3)} #and 
(len(mont_docs[k]) > 1)} df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index") write_to_excel(df_montagnards, "Montagnards_counts_withplein.xlsx") # Normalizes the vectors and computes the distance between them #normalized = normalize_dicts(Girondins, Montagnards) #compute_distance(normalized[0], normalized[1]) # Stores the Girondins and Montagnards frequency vectors in the same document df_combined = pd.DataFrame([Girondins, Montagnards]) df_combined = df_combined.transpose() df_combined.columns = ["Girondins", "Montagnards"] write_to_excel(df_combined, 'combined_frequency.xlsx')
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards): speaker_names = set() speakers_to_consider = [] gir_num_speeches = 0 mont_num_speeches = 0 bigrams_speeches = collections.defaultdict() gir_doc_freq = collections.defaultdict() mont_doc_freq = collections.defaultdict() for speaker in speakers_to_analyze.index.values: speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8')) for speaker_name in speakers_to_consider: print speaker_name party = speakers_to_analyze.loc[speaker_name, "Party"] speech = Counter() for identity in raw_speeches: date = re.findall(date_regex, str(identity))[0] if (date >= "1792-09-20") and (date <= "1793-06-02") and ( speaker_name == speechid_to_speaker[identity]): indv_speech_ngram = compute_ngrams(raw_speeches[identity], 2) for bigram in indv_speech_ngram: if bigram in bigrams_speeches: bigrams_speeches[bigram].append(identity) else: bigrams_speeches[bigram] = [] bigrams_speeches[bigram].append(identity) if bigram in doc_freq: doc_freq[bigram] = doc_freq[bigram] + 1 else: doc_freq[bigram] = 1 if party == "Girondins": gir_num_speeches += 1 gir_doc_freq = check_num_speakers(indv_speech_ngram, speaker_name, gir_doc_freq) try: Girondins = Girondins + indv_speech_ngram except NameError: Girondins = indv_speech_ngram else: mont_num_speeches += 1 mont_doc_freq = check_num_speakers(indv_speech_ngram, speaker_name, mont_doc_freq) try: Montagnards = Montagnards + indv_speech_ngram except NameError: Montagnards = indv_speech_ngram speech = speech + indv_speech_ngram #speaker_ngrams = compute_ngrams(speech, 2) pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle" with open(pickle_filename, 'wb') as handle: pickle.dump(speech, handle, protocol=0) with open('bigrams_to_speeches.csv', 'wb') as outfile: writer = csv.writer(outfile) for key, val in bigrams_speeches.items(): writer.writerow([key, val]) # Creates the combined frequency document when not limiting more than three occurrences 
for purposes of use # when creating the frequency map print_to_excel(Girondins, Montagnards, 'combined_frequency_all.xlsx') Girondins = {k: v for k, v in Girondins.items() if (v >= 3)} #and (len(gir_doc_freq[k]) > 1)} print_to_csv(Girondins, "Girondins_counts.csv") Montagnards = {k: v for k, v in Montagnards.items() if (v >= 3)} #and (len(mont_doc_freq[k]) > 1)} print_to_csv(Montagnards, "Montagnards_counts.csv") print_to_excel(Girondins, Montagnards, 'combined_frequency.xlsx') num_speeches = gir_num_speeches + mont_num_speeches gir_tfidf = compute_tfidf(Girondins, num_speeches) mont_tfidf = compute_tfidf(Montagnards, num_speeches) #compute_distance(gir_tfidf, mont_tfidf) print_to_csv(gir_tfidf, 'gir_tfidf.csv') print_to_csv(mont_tfidf, 'mont_tfidf.csv') print_to_excel(gir_tfidf, mont_tfidf, 'combined_tfidf.xlsx') normalized = normalize_dicts(Girondins, Montagnards) compute_distance(normalized[0], normalized[1])
def build_vectors(raw_speeches, speechid_to_speaker, speaker_list, speakers_to_analyze, gir_tfidf, mont_tfidf, num_speeches, doc_freq): speaker_ngrams = {} speakers_to_consider = [] speaker_distances = collections.defaultdict() chronology = collections.defaultdict(dict) for speaker in speakers_to_analyze.index.values: speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8')) for identity in raw_speeches: date = re.findall(date_regex, str(identity))[0] speaker_name = speechid_to_speaker[identity] if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name in speakers_to_consider): indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2) if speaker_name in speaker_ngrams: speaker_ngrams[speaker_name] = speaker_ngrams[speaker_name] + indv_speech_bigram else: speaker_ngrams[speaker_name] = indv_speech_bigram """ if speaker_name in chronology: pairing = chronology[speaker_name] for bigram in indv_speech_bigram: if bigram in pairing: pairing[bigram].append([identity, indv_speech_bigram[bigram]]) else: pairing[bigram] = [identity, indv_speech_bigram[bigram]] else: chronology[speaker_name] = {} pairing = chronology[speaker_name] for bigram in indv_speech_bigram: pairing[bigram] = [] # stores the unique speechid alongside the number of times that bigram is said in that speech for each bigram pairing[bigram] = [identity, indv_speech_bigram[bigram]]""" ## Need tf-idf vectors for gir and mont ## Need the doc_freq for the previous calcuations ## compute tf-idf for individual speakers ## compute cosine distance based on those vectors (dot product over length of vectors) ## compute cosine similarity between the difference between the two group vectors (subtract from each other) ## A - B, if positive more like A, if negative more like B ## create tf vector for each speech and store that so can just add ## Separately store single idf vector ######### gir_dict = convert_keys_to_string(gir_tfidf) mont_dict = convert_keys_to_string(mont_tfidf) 
doc_freq_dict = convert_keys_to_string(doc_freq) gir_mont_diff = compute_difference(gir_dict, mont_dict) #gir_dict = gir_tfidf #print gir_dict #mont_dict = mont_tfidf for speaker in speaker_ngrams: speaker_dict = convert_keys_to_string(speaker_ngrams[speaker]) to_compare = compute_tfidf(speaker_dict, num_speeches, doc_freq_dict) gir_dist = cosine_similarity(gir_dict, to_compare) mont_dist = cosine_similarity(mont_dict, to_compare) # Need to actually compute the distance gir_mont_diff_dist = cosine_similarity(gir_mont_diff, to_compare) speaker_distances[speaker] = [gir_dist, mont_dist, gir_mont_diff_dist] """ #speaker_dict = {(str(k),v) for k,v in speaker_ngrams['Francois Chabot']} speaker_dict = convert_keys_to_string(speaker_ngrams['Francois Chabot']) to_compare = compute_tfidf(speaker_dict, num_speeches, doc_freq) gir_dist = cosine_similarity(gir_dict, to_compare) df = pd.DataFrame([to_compare, gir_dict]) df = df.transpose() write_to_excel(df, "Francois Chabot Test.xlsx")""" """for speaker in speaker_ngrams: #to_compare = {k:v for k,v in speaker_ngrams[speaker].items() if (v >= 3)} to_compare = speaker_ngrams[speaker] gir_dict = gir_tfidf mont_dict = mont_tfidf gir_normalized = normalize_dicts(to_compare, gir_dict) gir_dist = compute_distance(gir_normalized[0], gir_normalized[1]) to_compare = speaker_ngrams[speaker] mont_normalized = normalize_dicts(to_compare, mont_dict) mont_dist = compute_distance(mont_normalized[0], mont_normalized[1]) speaker_distances[speaker] = [gir_dist, mont_dist]""" pickle_filename_3 = "speaker_ngrams.pickle" with open(pickle_filename_3, 'wb') as handle: pickle.dump(speaker_ngrams, handle, protocol = 0) df = pd.DataFrame.from_dict(speaker_distances) df = df.transpose() df.columns = ["dist to Girondins", "dist to Montagnards", "dist to difference"] filename = "freq_dist_map.xlsx" writer = pd.ExcelWriter(filename) df.to_excel(writer, 'Sheet1') writer.save() pickle_filename = "freq_dist.pickle" with open(pickle_filename, 'wb') as handle: 
pickle.dump(speaker_distances, handle, protocol = 0) """df2 = pd.DataFrame.from_dict(chronology)