def make_visualizations(chronology_date):
    # Aggregates the number of occurrences of each bigram per date
    num_per_bigram_per_date = chronology_date.groupby(["Bigram", "Date"]).agg({"Num occurrences": "sum"})
    store_to_pickle(num_per_bigram_per_date, "num_per_bigram_per_date.pickle")
    # num_per_bigram_per_date = pickle.load(open("num_per_bigram_per_date.pickle", "rb"))

    grouped = chronology_date.groupby(["Bigram"])
    store_to_pickle(grouped, "grouped.pickle")
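# make_visualizations currently only aggregates and pickles the per-date counts; the plotting
# itself presumably happens elsewhere (e.g. in R or a notebook). As a hypothetical illustration
# of how the pickled aggregate could be plotted, a matplotlib sketch might look like the
# following. The function name and the matplotlib dependency are assumptions, not part of the
# project's actual pipeline.
import matplotlib.pyplot as plt

def plot_bigram_over_time_sketch(num_per_bigram_per_date, bigram):
    """Illustrative only: line plot of one bigram's daily occurrence counts."""
    # num_per_bigram_per_date is indexed by (Bigram, Date); select one bigram's series
    series = num_per_bigram_per_date.loc[bigram]["Num occurrences"]
    series.plot(kind="line")
    plt.title(str(bigram))
    plt.xlabel("Date")
    plt.ylabel("Num occurrences")
    plt.show()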
def calculate_chronology(raw_speeches, speechid_to_speaker, speaker_list, speakers_to_analyze, Girondins, Montagnards):
    speaker_ngrams = {}
    speakers_to_consider = []
    speaker_distances = collections.defaultdict()
    chronology = collections.defaultdict(dict)

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    row_entry_speechid = []
    row_entry_date = []
    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        speaker_name = speechid_to_speaker[identity]
        if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name in speakers_to_consider):
            party = speakers_to_analyze.loc[speaker_name, "Party"]
            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            # Store relevant information for each bigram
            for bigram in indv_speech_bigram:
                row_entry_speechid.append([str(bigram), speaker_name, identity, indv_speech_bigram[bigram], party])
                row_entry_date.append([str(bigram), speaker_name, date, indv_speech_bigram[bigram], party])

    chronology_speechid = pd.DataFrame(row_entry_speechid, columns=["Bigram", "Speaker Name", "Speechid", "Num occurrences", "Party"])
    chronology_date = pd.DataFrame(row_entry_date, columns=["Bigram", "Speaker Name", "Date", "Num occurrences", "Party"])

    # w = csv.writer(open("chronology.csv", "w"))
    # for key, val in chronology.items():
    #     if (Girondins[key] >= 10) or (Montagnards[key] >= 10):
    #         w.writerow([key, val])

    make_visualizations(chronology_date)

    write_to_excel(chronology_speechid, "chronology_speechid.xlsx")
    write_to_excel(chronology_date, "chronology_date.xlsx")

    store_to_pickle(chronology_speechid, "chronology_speechid.pickle")
    store_to_pickle(chronology_date, "chronology_date.pickle")
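# calculate_chronology (and the aggregation functions below) rely on a compute_ngrams helper
# defined elsewhere in the project, which is assumed to return a Counter mapping n-gram tuples
# to their counts within one speech. A minimal sketch of such a helper, under that assumption
# (the tokenization details here are illustrative, not the project's actual code):
from collections import Counter

def compute_ngrams_sketch(speech_text, n):
    """Illustrative only: count n-grams in a whitespace-tokenized speech."""
    tokens = speech_text.lower().split()
    # Slide a window of length n over the token list and count each tuple
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return Counter(ngrams)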
def parseFiles(raw_speeches, multiple_speakers):
    # Assumes all xml files are stored in an AP_ARTFL_vols folder in the same directory as the python file
    files = os.listdir("AP_ARTFL_vols/")
    dates = set()
    num_sessions = 0
    num_morethan1_session = 0
    for filename in files:
        if filename.endswith(".xml"):
            print(filename)
            filename = open('AP_ARTFL_vols/' + filename, "r")
            # Extracts volume number to keep track of for names_not_caught and speakers_using_find
            volno = re.findall(vol_regex, str(filename))[0]
            contents = filename.read()
            soup = BeautifulSoup(contents, 'lxml')
            pages = re.findall(page_regex, contents)

            # Find all the sessions in the xml
            sessions = soup.find_all(['div3'], {"type": ["other"]})
            sessions_other = soup.find_all(['div2'], {"type": ["session"]})
            sessions = sessions + sessions_other
            # sessions = soup.find_all(['div2', 'div3'], {"type": ["session", "other"]})

            for session in sessions:
                date = extractDate(session)
                # Restricts to valid dates we want to look at
                if (date >= "1789-05-05") and (date <= "1795-01-04") and (date != "error"):
                    # dates is a set keeping track of dates already looked at;
                    # accounts for multiple sessions per day
                    num_sessions += 1
                    if date in dates:
                        num_morethan1_session += 1
                        date = date + "_soir"
                        if date in dates:
                            date = date + "2"
                            findSpeeches(raw_speeches, multiple_speakers, session, date, volno)
                        else:
                            findSpeeches(raw_speeches, multiple_speakers, session, date, volno)
                            dates.add(date)
                    else:
                        findSpeeches(raw_speeches, multiple_speakers, session, date, volno)
                        dates.add(date)
            filename.close()

    store_to_pickle(num_sessions, "num_sessions.pickle")
    store_to_pickle(num_morethan1_session, "num_morethan1_session.pickle")
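# parseFiles and the functions below assume module-level regexes (date_regex, vol_regex,
# page_regex) defined elsewhere in the project. Given that speech ids begin with an ISO-style
# date, that volume numbers are pulled from the opened file's name, and that pages are marked
# with <pb n="..."> tags, plausible definitions might look like the sketches below. These are
# assumptions for illustration, not the project's actual patterns.
date_regex_sketch = r'([0-9]{4}-[0-9]{2}-[0-9]{2})'   # e.g. "1792-09-20"
vol_regex_sketch = r'AP_ARTFL_vols/[^0-9]*([0-9]+)'   # volume number from the file path
page_regex_sketch = r'<pb n="([0-9]+)"'               # page-break tags in the XML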
def aggregate_by_speaker():
    byspeaker = {}
    speakerdict = {}
    ngrams = {}
    speakers_to_consider = []

    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
    speakers_to_analyze = load_list("Girondins and Montagnards New Mod Limit.xlsx")
    speaker_num_words = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speechid in raw_speeches:
        fulldate = speechid[0:10]
        if (fulldate >= "1792-09-20") and (fulldate <= "1793-06-02"):
            num_words = len(raw_speeches[speechid].split())
            speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)

            speaker = speechid_to_speaker[speechid]
            if speaker in speaker_num_words:
                speaker_num_words[speaker] += num_words
            else:
                speaker_num_words[speaker] = num_words

            if speaker in speakers_to_consider:
                if speaker in byspeaker:
                    byspeaker[speaker] = byspeaker[speaker] + speech_bigrams
                else:
                    byspeaker[speaker] = speech_bigrams
            speech_bigrams = None

    # Output filenames are assumed; they mirror the variable names
    write_to_csv(byspeaker, "byspeaker.csv")
    store_to_pickle(byspeaker, "byspeaker.pickle")

    write_to_csv(speaker_num_words, "speaker_num_words.csv")
    store_to_pickle(speaker_num_words, "speaker_num_words.pickle")
def counts_and_tfidf(Girondins, Montagnards, gir_docs, mont_docs, num_speeches, bigram_doc_freq):
    # Computes the tfidf scores within each group
    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

    store_to_pickle(gir_tfidf, "gir_tfidf.pickle")
    store_to_pickle(mont_tfidf, "mont_tfidf.pickle")

    # Stores the tf_idf vectors in Excel
    df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient="index")
    write_to_excel(df_gir_tfidf, 'gir_tfidf.xlsx')
    df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient="index")
    write_to_excel(df_mont_tfidf, 'mont_tfidf.xlsx')

    # Combines the tfidf vectors of both parties into one file
    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf.xlsx')

    # Limits based on v (the number of times the bigram appears) and on gir_docs/mont_docs
    # (the number of speakers in each group that use the bigram)
    # Can change the name of these dataframes to illuminate what the restrictions are
    Girondins_restricted = {k: v for k, v in Girondins.items() if (v >= 10)}  # and (len(gir_docs[k]) > 1)}
    Montagnards_restricted = {k: v for k, v in Montagnards.items() if (v >= 10)}  # and (len(mont_docs[k]) > 1)}

    store_to_pickle(Girondins_restricted, "Girondins_restricted.pickle")
    store_to_pickle(Montagnards_restricted, "Montagnards_restricted.pickle")

    # Recomputes tf-idf on the restricted counts so the *_restricted outputs below reflect the cutoff
    gir_tfidf = compute_tfidf(Girondins_restricted, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards_restricted, num_speeches, bigram_doc_freq)

    # Stores the Girondins and Montagnards frequency vectors and tfidfs in the same document according to restrictions
    df_combined = pd.DataFrame([Girondins_restricted, Montagnards_restricted])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency_restricted.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_restricted.xlsx')
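# counts_and_tfidf assumes a compute_tfidf helper that turns raw bigram counts into tf-idf
# scores using the total number of speeches and the per-bigram document frequencies. A minimal
# sketch of that computation (standard tf * log(N / df); the project's actual weighting may
# differ) might be:
import math

def compute_tfidf_sketch(counts, num_speeches, doc_freq):
    """Illustrative only: tf-idf score for each bigram in a counts dictionary."""
    tfidf = {}
    for bigram, count in counts.items():
        # Fall back to a document frequency of 1 for unseen bigrams to avoid division by zero
        df = doc_freq.get(bigram, 1)
        tfidf[bigram] = count * math.log(float(num_speeches) / df)
    return tfidf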
        year = session[0:4]
        if year in num_speeches_per_year:
            num_speeches_per_year[year] = num_speeches_per_year[year] + speeches_per_session[session]
        else:
            num_speeches_per_year[year] = speeches_per_session[session]
        if year in count_sessions:
            count_sessions[year] = count_sessions[year] + 1
        else:
            count_sessions[year] = 1

    avg_num_speeches_per_session_per_year = {}
    for year in num_speeches_per_year:
        avg_num_speeches_per_session_per_year[year] = num_speeches_per_year[year] / (1.0 * count_sessions[year])

    store_to_pickle(num_speeches_per_year, "num_speeches_per_year.pickle")
    store_to_pickle(count_sessions, "count_sessions.pickle")

    write_to_csv(num_speeches_per_year, "num_speeches_per_year.csv")
    write_to_csv(avg_num_speeches_per_session_per_year, "avg_num_speeches_per_session_per_year.csv")
    write_to_csv(count_sessions, "count_sessions.csv")

    # num speakers per year
    speakers_per_year = {}
    for speechid in speechid_to_speaker:
        year = speechid[0:4]
        speaker = speechid_to_speaker[speechid]
        if year in speakers_per_year:
            speakers_per_year[year].add(speaker)
        else:
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards):
    speaker_num_speeches = {}
    speaker_char_count = {}
    # List to keep track of the speakers we care about
    speakers_to_consider = []

    # Reformats speakers_to_analyze by removing accents in order to match speakers to those in raw_speeches
    # and speechid_to_speaker
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    # Matches bigrams to the list of speakers and speeches that have that bigram
    bigrams_to_speeches = {}
    bigrams_to_speakers = {}
    # Maintains the number of documents a given bigram is spoken in for use with tf-idf
    bigram_doc_freq = collections.defaultdict()

    gir_num_speeches = 0
    mont_num_speeches = 0
    gir_docs = {}
    mont_docs = {}

    for speaker_name in speakers_to_consider:
        print(speaker_name)
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
                # Keeps track of the number of speeches per speaker as well as the number of characters
                # spoken by each speaker, to potentially establish a cutoff for analysis purposes
                augment(speaker_num_speeches, speaker_name)
                if speaker_name in speaker_char_count:
                    speaker_char_count[speaker_name] += len(raw_speeches[identity])
                else:
                    speaker_char_count[speaker_name] = len(raw_speeches[identity])

                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)

                for bigram in indv_speech_bigram:
                    augment(bigram_doc_freq, bigram)

                    # Maintains a list of speeches in which given bigrams are spoken in
                    if bigram in bigrams_to_speeches:
                        bigrams_to_speeches[bigram].append(identity)
                    else:
                        bigrams_to_speeches[bigram] = []
                        bigrams_to_speeches[bigram].append(identity)
                    if bigram in bigrams_to_speakers:
                        bigrams_to_speakers[bigram].add(speaker_name)
                    else:
                        bigrams_to_speakers[bigram] = set()
                        bigrams_to_speakers[bigram].add(speaker_name)

                # Augments the relevant variables according to the party the speaker belongs to
                if party == "Girondins":
                    gir_num_speeches += 1
                    gir_docs = check_num_speakers(indv_speech_bigram, speaker_name, gir_docs)
                    try:
                        Girondins = Girondins + indv_speech_bigram
                    except NameError:
                        Girondins = indv_speech_bigram
                else:
                    mont_num_speeches += 1
                    mont_docs = check_num_speakers(indv_speech_bigram, speaker_name, mont_docs)
                    try:
                        Montagnards = Montagnards + indv_speech_bigram
                    except NameError:
                        Montagnards = indv_speech_bigram

                ### Maintains a Counter of all the bigrams and their counts for a given speaker
                # speech = speech + indv_speech_bigram

        ### Stores the bigram Counter object for each individual speaker
        # pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
        # with open(pickle_filename, 'wb') as handle:
        #     pickle.dump(speech, handle, protocol=0)

    # Store raw counts
    store_to_pickle(Girondins, "Girondins.pickle")
    store_to_pickle(Montagnards, "Montagnards.pickle")

    # Store in memory aggregate information about each bigram
    bigram_aggregate_info(Girondins, Montagnards, bigrams_to_speakers, bigrams_to_speeches)

    ### If data has already been stored to memory, the lines below can be used
    # bigrams_to_speakers = pickle.load(open("bigrams_to_speakers.pickle", "rb"))
    # bigrams_to_speeches = pickle.load(open("bigrams_to_speeches.pickle", "rb"))
    # gir_docs = pickle.load(open("gir_docs.pickle", "rb"))
    # mont_docs = pickle.load(open("mont_docs.pickle", "rb"))
    # Girondins = pickle.load(open("Girondins_withlimit.pickle", "rb"))
    # Montagnards = pickle.load(open("Montagnards_withlimit.pickle", "rb"))
    # bigram_doc_freq = pickle.load(open("bigram_doc_freq.pickle", 'rb'))

    num_speeches = 4479

    # Computes counts and tfidf scores for each party and outputs for further analysis in R
    counts_and_tfidf(Girondins, Montagnards, gir_docs, mont_docs, num_speeches, bigram_doc_freq)

    """
    EVERYTHING BELOW IS STORING DATA TO MEMORY
    """

    # Stores the bigrams_to_speeches document in Excel
    df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient="index")
    write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')

    df_bigrams_to_speakers = pd.DataFrame.from_dict(bigrams_to_speakers, orient="index")
    write_to_excel(df_bigrams_to_speakers, 'bigrams_to_speakers.xlsx')

    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')

    # Stores files in memory
    store_to_pickle(bigrams_to_speakers, "bigrams_to_speakers.pickle")
    store_to_pickle(bigrams_to_speeches, "bigrams_to_speeches.pickle")
    store_to_pickle(gir_docs, "gir_docs.pickle")
    store_to_pickle(mont_docs, "mont_docs.pickle")
    store_to_pickle(speaker_num_speeches, "speaker_num_speeches.pickle")
    store_to_pickle(speaker_char_count, "speaker_char_count.pickle")
    store_to_pickle(bigram_doc_freq, "bigram_doc_freq.pickle")

    with open('gir_speeches.txt', 'w') as f:
        f.write('%d' % gir_num_speeches)
    with open('mont_speeches.txt', 'w') as f:
        f.write('%d' % mont_num_speeches)

    write_to_csv(speaker_num_speeches, "speaker_num_speeches.csv")
    write_to_csv(speaker_char_count, "speaker_char_count.csv")

    with open('num_speeches.txt', 'w') as f:
        f.write('%d' % num_speeches)
    # txtfile = open("names_not_caught.txt", 'w')
    # for name in sorted(names_not_caught):
    #     txtfile.write(name)
    # txtfile.close()

    # file = open('speakers_using_find.txt', 'w')
    # for item in sorted(speakers_using_find):
    #     file.write(item)
    # file.close()

    file = open('speakers.txt', 'w')
    for item in sorted(speakers):
        file.write(item + "\n")
    file.close()

    store_to_pickle(speechid_to_speaker, "speechid_to_speaker.pickle")
    store_to_pickle(raw_speeches, "raw_speeches.pickle")
    store_to_pickle(multiple_speakers, "multiple_speakers.pickle")
    store_to_pickle(speaker_num_total_speeches, "speaker_num_total_speeches.pickle")
    store_to_pickle(speaker_num_total_chars, "speaker_num_total_chars.pickle")
    store_to_pickle(speakers, "speakers.pickle")
    store_to_pickle(speeches_per_day, "speeches_per_session.pickle")
    store_to_pickle(speakers_per_session, "speakers_per_session.pickle")

    write_to_csv(speechid_to_speaker, "speechid_to_speaker.csv")
    write_to_csv(raw_speeches, "raw_speeches.csv")
    write_to_csv(speaker_num_total_speeches, "speaker_num_total_speeches.csv")
    write_to_csv(speaker_num_total_chars, "speaker_num_total_chars.csv")
    write_to_csv(speeches_per_day, "speeches_per_day.csv")
def checkErrors(enc_words, french_stopwords):
    files = os.listdir("AP_ARTFL_vols/")
    errors_per_vol = {}
    errors_per_page = {}
    word_freq_wrong = {}
    for filename in files:
        if filename.endswith(".xml"):
            filename = open('AP_ARTFL_vols/' + filename, "r")
            volno = re.findall(vol_regex, str(filename))[0]
            print(volno)
            contents = filename.read()
            soup = BeautifulSoup(contents, 'lxml')

            num_errors = 0
            num_words_vol = 0
            word_freq = {}

            # Iterate through contents and find all page tags
            pb_tags = []
            last_index = 0
            while True:
                loc = contents.find("<pb n=", last_index)
                if loc == -1:
                    break
                pb_tags.append(loc)
                last_index = loc + 1

            # Iterates through all page tags and looks through the contents on each page,
            # checking each word against the words contained in the Encyclopedie
            for i in range(0, len(pb_tags) - 1):
                contents_substr = contents[pb_tags[i]:pb_tags[i + 1]]
                page_num = BeautifulSoup(contents_substr, 'lxml').find_all('pb')
                pb_soup = BeautifulSoup(contents_substr, 'lxml')
                pageno = volno + "_pg" + page_num[0].get("n")
                error_per_page = 0
                num_words_pg = 0

                text = unicode(contents_substr, "ascii", errors="ignore")
                text = remove_diacritic(text).decode('utf-8')
                paragraph = remove_stopwords(text, french_stopwords)
                paragraph = (paragraph.replace("\n", " ").replace(")", "").replace("*", "")
                             .replace(":", "").replace("-", "").replace("_", "").replace("(", "")
                             .replace("& ", "").replace("; ", "").replace(".", "")
                             .replace(",", "").replace("?", "").replace("!", ""))
                paragraph = re.sub(r'([0-9]{1,4})', ' ', paragraph)
                words = paragraph.split(" ")
                num_words_vol += len(words)
                num_words_pg += len(words)

                for word in words:
                    if word not in enc_words:
                        if word in word_freq:
                            word_freq[word] += 1
                        else:
                            word_freq[word] = 1
                        error_per_page += 1
                        num_errors += 1

                errors_per_page[pageno] = [error_per_page, num_words_pg]

            word_freq_wrong[volno] = sorted(word_freq.items(), key=lambda kv: kv[1])
            errors_per_vol[volno] = [num_errors, num_words_vol]

    # Save and output errors per volume
    store_to_pickle(errors_per_vol, "errors_per_vol.pickle")
    w = csv.writer(open("errors_per_vol.csv", "w"))
    for key, val in errors_per_vol.items():
        if isinstance(key, str):
            key = unicode(key, "ascii", errors="ignore")
        w.writerow([key, val[0], val[1]])

    # Save and output errors per page
    store_to_pickle(errors_per_page, "errors_per_page.pickle")
    w = csv.writer(open("errors_per_page.csv", "w"))
    for key, val in errors_per_page.items():
        if isinstance(key, str):
            key = unicode(key, "ascii", errors="ignore")
        w.writerow([key.encode("utf-8", errors="ignore"), val[0], val[1]])

    # Save and output frequency of errors per word per volume
    store_to_pickle(word_freq_wrong, "word_freq_errors.pickle")
    w = csv.writer(open("word_freq_errors.csv", "w"))
    for key, val in word_freq_wrong.items():
        w.writerow([key, val])
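# checkErrors (and the parsing code above) calls a remove_diacritic helper before comparing
# words against the Encyclopedie word list. A common way to implement that normalization,
# which this sketch assumes, is to NFKD-decompose the text and drop the combining marks;
# the _sketch name is hypothetical and the project's real helper may differ in detail.
import unicodedata

def remove_diacritic_sketch(text):
    """Illustrative only: strip accents so u'depute' with accents compares equal to 'depute'."""
    # Returns a byte string in Python 2, matching the callers' subsequent .decode('utf-8')
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')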
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker):
    # List to keep track of the speakers we care about
    speakers_to_consider = []

    # Reformats speakers_to_analyze by removing accents in order to match speakers to those in raw_speeches
    # and speechid_to_speaker
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    # Initialize various data structures for export to the classification script
    train_total_freq_unigram = {}
    test_total_freq_unigram = {}
    train_total_freq_bigram = {}
    test_total_freq_bigram = {}
    train_number_speeches = 0
    test_number_speeches = 0

    # Keeps track of which speeches contain the given bigram
    train_speeches_bigram = collections.defaultdict(dict)
    test_speeches_bigram = collections.defaultdict(dict)
    train_speeches_unigram = collections.defaultdict(dict)
    test_speeches_unigram = collections.defaultdict(dict)

    bigrams_to_speeches = collections.defaultdict()
    bigram_doc_freq = collections.defaultdict()
    unigram_doc_freq = collections.defaultdict()

    gir_num_speeches = 0
    mont_num_speeches = 0
    gir_docs = {}
    mont_docs = {}

    for speaker_name in speakers_to_consider:
        print(speaker_name)
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()
        # Variable to keep track of a given speaker's number of speeches
        speech_num = 0
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            # Only look at speeches within the date frame and that are from the speaker of interest
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
                # Only looking at speeches with substance, so greater than 100 characters
                if len(raw_speeches[identity]) >= 100:
                    indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
                    indv_speech_unigram = compute_ngrams(raw_speeches[identity], 1)

                    # Splitting the data into training and test data with 1/4 of each speaker's data in the test set
                    if speech_num % 4 != 0:
                        train_number_speeches += 1
                        for bigram in indv_speech_bigram:
                            augment(bigram_doc_freq, bigram)
                            augment(train_total_freq_bigram, bigram)
                        for unigram in indv_speech_unigram:
                            augment(unigram_doc_freq, unigram)
                            augment(train_total_freq_unigram, unigram)
                        train_speeches_bigram[identity] = indv_speech_bigram
                        train_speeches_unigram[identity] = indv_speech_unigram
                    else:
                        test_number_speeches += 1
                        for bigram in indv_speech_bigram:
                            augment(test_total_freq_bigram, bigram)
                        for unigram in indv_speech_unigram:
                            augment(test_total_freq_unigram, unigram)
                        test_speeches_bigram[identity] = indv_speech_bigram
                        test_speeches_unigram[identity] = indv_speech_unigram
                    speech_num += 1

    # Write all relevant data objects and values to memory to use when running classification
    store_to_pickle(speakers_to_analyze, "speakers_to_analyze.pickle")

    # Set these dataframes to None to conserve memory
    speakers_to_analyze = None
    speechid_to_speaker = None
    raw_speeches = None

    store_to_pickle(train_speeches_bigram, "train_speeches_bigram.pickle")
    store_to_pickle(train_speeches_unigram, "train_speeches_unigram.pickle")
    store_to_pickle(train_total_freq_bigram, "train_total_freq_bigram.pickle")
    store_to_pickle(train_total_freq_unigram, "train_total_freq_unigram.pickle")

    store_to_pickle(bigram_doc_freq, "bigram_doc_freq.pickle")
    store_to_pickle(unigram_doc_freq, "unigram_doc_freq.pickle")
    store_to_pickle(train_number_speeches, "train_number_speeches.pickle")

    store_to_pickle(test_speeches_bigram, "test_speeches_bigram.pickle")
    store_to_pickle(test_speeches_unigram, "test_speeches_unigram.pickle")
    store_to_pickle(test_total_freq_bigram, "test_total_freq_bigram.pickle")
    store_to_pickle(test_total_freq_unigram, "test_total_freq_unigram.pickle")
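# Both aggregate() variants above use an augment helper to bump a per-key count (e.g. document
# frequencies and total n-gram frequencies). A minimal sketch of that helper, assuming it simply
# increments a dictionary entry and creates missing keys on first use, is given below; the
# _sketch name is hypothetical.
def augment_sketch(freq_dict, key):
    """Illustrative only: increment freq_dict[key], initializing missing keys to 1."""
    if key in freq_dict:
        freq_dict[key] += 1
    else:
        freq_dict[key] = 1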