def findSpeeches(raw_speeches, multiple_speakers, daily_soup, date, volno):
	id_base = date.replace("/","_")
	number_of_speeches = 0
	presidents = [">le President", "Le President", "Mle President", "President", "le' President", "Le Preesident", "Le Preseident", "Le Presidant", "Le Presideait", "le Presiden", "le President", "Le president", "le president", "Le President,", "Le Presideut", "Le Presidtent", "le Presient", "le Presldent", "le'President"]
	for talk in daily_soup.find_all('sp'):
		# Tries to extract the speaker name and edits it for easier pairing with the Excel file
		try:
			speaker = talk.find('speaker').get_text()
			speaker = remove_diacritic(speaker).decode('utf-8')
			speaker = speaker.replace("M.","").replace("MM ", "").replace("MM. ","").replace("M ", "").replace("de ","").replace("M. ","").replace("M, ","").replace("M- ","").replace("M; ","").replace("M* ","").replace(".","").replace(":","").replace("-", " ")
			if speaker.endswith(","):
				speaker = speaker[:-1]
			if speaker.endswith(", "):
				speaker = speaker[:-1]
			if speaker.startswith(' M. '):
				speaker = speaker[3:]
			if speaker.startswith(' '):
				speaker = speaker[1:]
			if speaker.endswith(' '):
				speaker = speaker[:-1]
		except AttributeError:
			speaker = ""

		speaker = speaker.lower()

		# Builds a unique id for this speech, then strips any footnotes out of it
		speech_id = id_base + "_" + str(number_of_speeches + 1)
		while talk.find("note"):
			ftnotes = talk.note.extract()
			ftnotes = remove_diacritic(ftnotes.get_text()).decode('utf-8')
			ftnotes = ftnotes.replace("\n","").replace("\r","").replace("\t","").replace("  "," ")
			# footnotes is assumed to be a module-level list collecting
			# [text, speaker, speech_id, volume] rows
			footnotes.append([ftnotes, speaker, speech_id, volno])
		number_of_speeches += 1
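Every example in this listing calls a remove_diacritic helper that the source never shows. A minimal sketch, assuming a standard unicodedata-based normalization (the original implementation may differ):

# -*- coding: utf-8 -*-
import unicodedata

def remove_diacritic(text):
    # NFKD decomposition separates base letters from their combining
    # accents; encoding to ASCII with errors='ignore' then drops the
    # accents, so u"députe" becomes the byte string "depute".
    if not isinstance(text, unicode):
        text = text.decode('utf-8')
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')

Callers consistently do remove_diacritic(...).decode('utf-8') on the result, which matches this sketch returning a plain byte string.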
def parseFiles():
    wrong_dates = set()
    files = os.listdir("Docs/")
    for filename in files:
        if filename.endswith(".xml"):
            print(filename)
            # Pull the volume number from the filename string before it is
            # rebound to the open file object
            volno = re.findall(vol_regex, filename)[0]
            filename = open('Docs/' + filename, "r")
            contents = filename.read()
            soup = BeautifulSoup(contents, 'lxml')
            # A search for date tags that contain a valid value
            dates = soup.find_all('date')
            for date in dates:
                if date.attrs:
                    coded_date = date['value']
                    year, month, day = re.findall(date_regex, coded_date)[0]
                    child = date.findChildren()
                    if child:
                        child[0].extract()
                    text_date = date.get_text()
                    text_date = re.sub(r'([ ]{2,})', ' ', text_date)
                    text_date = remove_diacritic(text_date).decode('utf-8')
                    text_date = text_date.lower().replace('\n', '')
                    # Various checks performed to see if the textual date matches the encoded date or is valid at all
                    try:
                        text_day, text_month, text_year = re.findall(
                            text_regex, text_date)[0]
                        text_month = text_month.replace(' (sic)', '').replace(
                            '\n', '').replace('\r', '').replace(' ', '')
                        text_month = remove_diacritic(text_month).decode(
                            'utf-8')
                    except IndexError:
                        # No recognizable textual date; log it and move on
                        wrong_dates.add(coded_date + "; " +
                                        str(date.contents) + "; " +
                                        str(volno) + "\n")
                        continue
                    try:
                        month_num = month_to_num[text_month]
                    except KeyError:
                        # Month name not in the lookup table
                        wrong_dates.add(coded_date + "; " +
                                        str(date.contents) + "; " +
                                        str(volno) + "\n")
                        continue
                    if month_num != str(month):
                        wrong_dates.add(coded_date + "; " +
                                        str(date.contents) + "; " +
                                        str(volno) + "\n")
            filename.close()

    # Write the wrong dates to a file
    file = open('wrong_dates.txt', 'w')
    for item in sorted(wrong_dates):
        file.write(item)
    file.close()
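parseFiles also assumes module-level regexes and a month lookup defined elsewhere. A hypothetical sketch of their shape, inferred from how they are used above (the real patterns may differ):

import re

vol_regex = r'volume([0-9]+)'                      # hypothetical: volume number from the filename
date_regex = r'([0-9]{4})-([0-9]{2})-([0-9]{2})'   # encoded yyyy-mm-dd value
text_regex = r'([0-9]{1,2}) ([a-z() ]+) ([0-9]{4})'  # e.g. "21 septembre 1792"

# French month names (accents already stripped) mapped to zero-padded
# numbers, so month_num can be compared against str(month) above
month_to_num = {
    'janvier': '01', 'fevrier': '02', 'mars': '03', 'avril': '04',
    'mai': '05', 'juin': '06', 'juillet': '07', 'aout': '08',
    'septembre': '09', 'octobre': '10', 'novembre': '11', 'decembre': '12'
}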
Example #3
def read_names(name_file):
	# pd_list = pd.read_excel("an_names.xls")
	pd_list = pd.read_excel(name_file)
	pd_list = pd_list.set_index('Last Name')
	speakers = pd_list.index.tolist()
	for ind, speaker in enumerate(speakers):
		# enumerate avoids list.index(), which returns the first match and
		# would misbehave if two deputies share a last name
		speakers[ind] = remove_diacritic(speaker).decode('utf-8').lower()
	pd_list.index = speakers
	full_names = []
	for full_name in pd_list["Full Name"]:
		full_names.append(remove_diacritic(full_name).decode('utf-8').lower())
	pd_list["Full Name"] = full_names
	return pd_list
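A quick usage sketch for read_names; the sheet name comes from the commented-out line above, and the lookup key is hypothetical:

pd_list = read_names("an_names.xls")
print pd_list.loc["vergniaud", "Full Name"]  # normalized last name -> full name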
Example #4
def parseEnc():
    # Assumes all .tei files are stored in an Encyclopedie folder in the same directory as the python file
    files = os.listdir("Encyclopedie/")
    words = set()
    for filename in files:
        if filename.endswith(".tei"):
            print(filename)
            filename = open('Encyclopedie/' + filename, "r")
            contents = filename.read()
            soup = BeautifulSoup(contents, 'lxml')

            paragraphs = soup.find_all('p')
            for para in paragraphs:
                # Extract every italics, small-caps, and note tag, not just
                # the first occurrence of each
                while para.find("i"):
                    para.i.extract()
                while para.find("sc"):
                    para.sc.extract()
                while para.find("note"):
                    para.note.extract()
                para = para.get_text()
                para = para.replace("\n", " ").replace("& ", "").replace("; ", "").replace(".", "").replace(",", "").replace("?", "").replace("!", "").replace("  ", " ")
                # Lowercase before stripping accents so the vocabulary is
                # uniformly lowercase and accent-free
                para = para.lower()
                paragraph = remove_diacritic(para).decode('utf-8')
                paragraph = paragraph.split(" ")
                words = words.union(paragraph)
    return words
Example #5
def clean(just_speech):
    stopwords_from_file = open('FrenchStopwords.txt', 'r')
    lines = stopwords_from_file.readlines()
    stopwords_from_file.close()
    french_stopwords = []
    for line in lines:
        word = line.split(',')
        #remove returns and new lines at the end of stop words so the parser catches matches
        #also remove accents so the entire analysis is done without accents
        word_to_append = remove_diacritic(
            unicode(word[0].replace("\n", "").replace("\r", ""), 'utf-8'))
        french_stopwords.append(word_to_append)

    # Replace punctuation and special characters with spaces before tokenizing
    for char in ["%", "\\", "^", "=", "]", "\"", "``", "-", "[", "{", "$",
                 "~", "}", "&", ">", "#", "/", "\\`", "'", "*", "`", ";",
                 "?", ",", ":", ".", "(", ")"]:
        just_speech = just_speech.replace(char, " ")
    clean_text = remove_stopwords(just_speech.lower(), french_stopwords)
    clean_text = clean_text.replace("marat", " ").replace("accusation", " ")
    return clean_text
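clean and compute_ngrams both depend on a remove_stopwords helper that is not part of this listing. A minimal sketch consistent with the call sites (lower-cased, accent-free text in, filtered text out):

def remove_stopwords(text, stopwords):
    # Drop any token that appears in the stopword list (sketch; the
    # original helper may differ)
    return " ".join(word for word in text.split() if word not in stopwords)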
Example #6
def parseFile():
    votes = {}
    justifications = []
    votes_model2 = {}
    file = open('marat.xml', "r")
    contents = file.read()
    contents = re.sub(
        r'(<p>(?:DÉPARTEMENT|DEPARTEMENT|DÉPARTEMENE)[\s\S]{1,35}<\/p>)', '',
        contents)
    soup = BeautifulSoup(contents, 'lxml')
    # Look at all speaker tags in the XML
    for talk in soup.find_all('sp'):
        speaker = talk.find('speaker').get_text()
        speaker = remove_diacritic(speaker).decode('utf-8')
        speaker = speaker.replace(".", "")

        # Find all the text by looking at paragraph tags
        speech = talk.find_all('p')
        text = ""
        full_speech = ""
        for section in speech:
            text = text + section.get_text()
        full_speech = remove_diacritic(text).decode('utf-8')
        full_speech = full_speech.replace('\n', '').replace('\t', '').replace('\r', '')
        full_speech = re.sub(r'([ ]{2,})', ' ', full_speech)

        # Both of the following if statements support topic modeling, each
        # feeding a different modeling approach; justifications must exceed
        # 30 characters so trivial responses are excluded
        if len(full_speech) > 30:
            justifications.append(full_speech)

        votes[speaker] = full_speech

        if len(full_speech) > 30:
            votes_model2[speaker] = full_speech

    # Two topic model functions
    runTopicModel(justifications)
    #topicModel(votes_model2)

    df = pd.DataFrame.from_dict(votes, orient='index')
    writer = pd.ExcelWriter('Marat_Justifications.xlsx')
    df.to_excel(writer)
    writer.save()
    file.close()
Example #7
def read_names_file(name_file):
    pd_list = pd.read_excel(name_file)
    pd_list = pd_list.set_index('Full Name')
    speakers = pd_list.index.tolist()
    for ind, speaker in enumerate(speakers):
        # enumerate avoids list.index(), which would mishandle duplicate names
        speakers[ind] = remove_diacritic(speaker).decode('utf-8').lower()
    pd_list.index = speakers
    return pd_list
Example #8
def checkErrors(enc_words, french_stopwords):
    files = os.listdir("AP_ARTFL_vols/")
    errors_per_vol = {}
    errors_per_page = {}

    for filename in files:
        if filename.endswith(".xml"):
            # Pull the volume number from the filename string before it is
            # rebound to the open file object
            volno = re.findall(vol_regex, filename)[0]
            filename = open('AP_ARTFL_vols/' + filename, "r")
            contents = filename.read()
            soup = BeautifulSoup(contents, 'lxml')

            num_errors = 0

            pages = re.findall(
                r'<pb n="[\s0-9]+" facs="[\s\S]{0,300}" \/> [\s\S]{0,10000} <pb',
                contents)
            for page in pages:
                page_soup = BeautifulSoup(page, 'lxml')
                page_num = page_soup.find_all('pb')
                pageno = volno + "_pg" + page_num[0].get("n")
                error_per_page = 0
                # Restrict the search to this page's paragraphs; querying the
                # whole-volume soup here would recount every page each time
                paragraphs = page_soup.find_all('p')
                for para in paragraphs:
                    if para.find("note"):
                        para.note.extract()
                    para = para.get_text().lower()
                    para = remove_diacritic(para).decode('utf-8')
                    para = para.replace("'", " ")
                    paragraph = remove_stopwords(para, french_stopwords)
                    # para = para.replace("s'","").replace("l'","").replace("d'","")
                    paragraph = paragraph.replace("\n", " ")
                    for token in [")", "*", ":", "-", "_", "(", "& ", "; ",
                                  ".", ",", "?", "!"]:
                        paragraph = paragraph.replace(token, "")
                    paragraph = re.sub(r'([0-9]{1,4})', ' ', paragraph)
                    words = paragraph.split(" ")
                    for word in words:
                        if word not in enc_words:
                            print word
                            error_per_page += 1
                            num_errors += 1
                errors_per_page[pageno] = error_per_page

            errors_per_vol[volno] = num_errors
    with open("errors_per_vol.pickle", 'wb') as handle:
        pickle.dump(errors_per_vol, handle, protocol=0)
    w = csv.writer(open("errors_per_vol.csv", "w"))
    for key, val in errors_per_vol.items():
        w.writerow([key, val])

    with open("errors_per_page.pickle", 'wb') as handle:
        pickle.dump(errors_per_page, handle, protocol=0)
    w = csv.writer(open("errors_per_page.csv", "w"))
    for key, val in errors_per_page.items():
        w.writerow([key, val])
Example #9
def calculate_chronology(raw_speeches, speechid_to_speaker, speaker_list,
                         speakers_to_analyze, Girondins, Montagnards):
    speaker_ngrams = {}
    speakers_to_consider = []
    speaker_distances = collections.defaultdict()
    chronology = collections.defaultdict(dict)

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    row_entry_speechid = []
    row_entry_date = []
    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        speaker_name = speechid_to_speaker[identity]
        if (date >= "1792-09-20") and (date <= "1793-06-02") and (
                speaker_name in speakers_to_consider):
            party = speakers_to_analyze.loc[speaker_name, "Party"]
            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            # Store relevant information for each bigram
            for bigram in indv_speech_bigram:
                row_entry_speechid.append([
                    str(bigram), speaker_name, identity,
                    indv_speech_bigram[bigram], party
                ])
                row_entry_date.append([
                    str(bigram), speaker_name, date,
                    indv_speech_bigram[bigram], party
                ])

    chronology_speechid = pd.DataFrame(row_entry_speechid,
                                       columns=[
                                           "Bigram", "Speaker Name",
                                           "Speechid", "Num occurrences",
                                           "Party"
                                       ])
    chronology_date = pd.DataFrame(
        row_entry_date,
        columns=["Bigram", "Speaker Name", "Date", "Num occurrences", "Party"])

    # w = csv.writer(open("chronology.csv", "w"))
    # for key, val in chronology.items():
    # 	if (Girondins[key] >= 10) or (Montagnards[key] >= 10):
    # 		w.writerow([key,val])
    make_visualizations(chronology_date)

    write_to_excel(chronology_speechid, "chronology_speechid.xlsx")
    write_to_excel(chronology_date, "chronology_date.xlsx")

    store_to_pickle(chronology_speechid, "chronology_speechid.pickle")
    store_to_pickle(chronology_date, "chronology_date.pickle")
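write_to_excel and store_to_pickle are thin wrappers that the listing never defines. Sketches that mirror the inline pandas and pickle code used in the other examples:

import pickle
import pandas as pd

def write_to_excel(dataframe, filename):
    # One-sheet workbook, matching the inline pd.ExcelWriter pattern above
    writer = pd.ExcelWriter(filename)
    dataframe.to_excel(writer, 'Sheet1')
    writer.save()

def store_to_pickle(obj, filename):
    # protocol=0 matches the ASCII pickles written throughout these examples
    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle, protocol=0)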
Example #10
def firststep():

    byspeaker = {}
    speakerdict = {}

    byspeaker_allspeakers = {}
    speakerdict_allspeakers = {}

    ngrams = {}

    speakers_to_consider = []

    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    # dataframe = pd.DataFrame.from_dict(raw_speeches, orient = "index")
    # dataframe.columns = ['Speeches']
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
    # file = open('num_speeches.txt', 'r')
    # num_speeches = int(file.read())
    # doc_freq = pickle.load(open("bigram_doc_freq.pickle", "rb"))
    speakers_to_analyze = load_list("Girondins and Montagnards New Mod.xlsx")

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speechid in raw_speeches:
        fulldate = speechid[0:10]
        if (fulldate >= "1792-09-20") and (fulldate <= "1793-06-02"):
            speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)

            speaker = speechid_to_speaker[speechid]

            print speaker

            if speaker in byspeaker_allspeakers:
                byspeaker_allspeakers[
                    speaker] = byspeaker_allspeakers[speaker] + speech_bigrams
            else:
                byspeaker_allspeakers[speaker] = speech_bigrams
            speech_bigrams = None

    with open("byspeaker_allspeakers.pickle", "wb") as handle:
        pickle.dump(byspeaker_allspeakers, handle, protocol=0)

    w = csv.writer(open("byspeaker_allspeakers.csv", "w"))
    for key, val in byspeaker.items():
        w.writerow([key, val])
    """byspeaker_allspeakers = pd.DataFrame.from_dict(byspeaker_allspeakers, orient = "index")
Example #11
def compute_ngrams(speech, order):
	stopwords_from_file = open('FrenchStopwords.txt', 'r')
	lines = stopwords_from_file.readlines()
	stopwords_from_file.close()
	french_stopwords = []
	for line in lines:
		word = line.split(',')
		#remove returns and new lines at the end of stop words so the parser catches matches
		#also remove accents so the entire analysis is done without accents
		word_to_append = remove_diacritic(unicode(word[0].replace("\n","").replace("\r",""), 'utf-8'))
		french_stopwords.append(word_to_append)

	speech = speech.replace("%"," ").replace("\\"," ").replace("^", " ").replace("=", " ").replace("]"," ").replace("\""," ").replace("``", " ").replace("-"," ").replace("[", " ").replace("{"," ").replace("$", " ").replace("~"," ").replace("-"," ").replace("}", " ").replace("&"," ").replace(">"," ").replace("#"," ").replace("/"," ").replace("\`"," ").replace("'"," ").replace("*", " ").replace("`", " ").replace(";"," ").replace("?"," ").replace(",", " ").replace(":"," ").replace("."," ").replace("("," ").replace(")"," ")
	clean_text = remove_stopwords(speech.lower(), french_stopwords)
	clean_text = clean_text.replace("mm secretaire", " ").replace("assemble nationale", " ").replace("monsieur president", " ").replace("convention nationale", " ").replace("archives parliamentaire", " ").replace("republique francaise", " ").replace("ordre jour", " ").replace("corps legislatif", " ")
	n_grams = make_ngrams(clean_text, order)
	speech_ngrams = Counter(n_grams)
	return speech_ngrams
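make_ngrams is the one remaining helper compute_ngrams relies on. A sketch that produces tuple keys, consistent with the str(bigram) conversions applied elsewhere before writing to dataframes:

def make_ngrams(text, order):
    # Slide a window of length `order` across the token list, yielding
    # tuples such as ('ordre', 'jour') for bigrams (sketch only)
    tokens = text.split()
    return [tuple(tokens[i:i + order]) for i in range(len(tokens) - order + 1)]

Counter(make_ngrams(clean_text, order)) then gives the per-speech frequency table that compute_ngrams returns.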
Example #12
def aggregate_by_speaker():

    byspeaker = {}
    speakerdict = {}

    ngrams = {}

    speakers_to_consider = []

    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
    speakers_to_analyze = load_list(
        "Girondins and Montagnards New Mod Limit.xlsx")
    speaker_num_words = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speechid in raw_speeches:
        fulldate = speechid[0:10]
        if (fulldate >= "1792-09-20") and (fulldate <= "1793-06-02"):
            num_words = len(raw_speeches[speechid].split())
            speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)

            speaker = speechid_to_speaker[speechid]

            if speaker in speaker_num_words:
                speaker_num_words[speaker] += num_words
            else:
                speaker_num_words[speaker] = num_words

            if speaker in speakers_to_consider:
                if speaker in byspeaker:
                    byspeaker[speaker] = byspeaker[speaker] + speech_bigrams
                else:
                    byspeaker[speaker] = speech_bigrams
            speech_bigrams = None

    # write_to_csv and store_to_pickle are assumed to take an output name as
    # their second argument, as in the other examples; these filenames are
    # assumptions, not taken from the original source
    write_to_csv(byspeaker, "byspeaker.csv")
    store_to_pickle(byspeaker, "byspeaker.pickle")

    write_to_csv(speaker_num_words, "speaker_num_words.csv")
    store_to_pickle(speaker_num_words, "speaker_num_words.pickle")
Example #13
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker):
    speakers_to_consider = []

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speaker_name in speakers_to_consider:
        print speaker_name
        speaker_bigram_frequencies = {}
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (
                    speaker_name == speechid_to_speaker[identity]):

                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)

                for bigram in indv_speech_bigram:
                    if bigram in speaker_bigram_frequencies:
                        #speechid_frequencies = speaker_bigram_frequencies[bigram]
                        #speechid_frequencies[speechid] = indv_speech_bigram[bigram]
                        speaker_bigram_frequencies[bigram][
                            identity] = indv_speech_bigram[bigram]
                    else:
                        speaker_bigram_frequencies[bigram] = {}
                        speaker_bigram_frequencies[bigram][
                            identity] = indv_speech_bigram[bigram]
        filename_pickle = "" + speaker_name + "bigram_frequencies.pickle"
        with open(filename_pickle, 'wb') as handle:
            pickle.dump(speaker_bigram_frequencies, handle, protocol=0)
        filename_csv = "" + speaker_name + "bigram_frequencies.csv"
        w = csv.writer(open(filename_csv, "w"))
        for key, val in speaker_bigram_frequencies.items():
            w.writerow([key, val])
Example #14
def calculate_chronology(raw_speeches, speechid_to_speaker, speaker_list,
                         speakers_to_analyze, Girondins, Montagnards):
    speaker_ngrams = {}
    speakers_to_consider = []
    speaker_distances = collections.defaultdict()
    chronology = collections.defaultdict(dict)
    # chronology_date = pd.DataFrame(columns = ["Bigram", "Speaker Name", "Date", "Num occurrences"])
    # chronology_speechid = pd.DataFrame(columns = ["Bigram", "Speaker Name", "Speechid", "Num occurrences"])

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    row_entry_speechid = []
    row_entry_date = []
    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        speaker_name = speechid_to_speaker[identity]
        # print speaker_name
        if (date >= "1792-09-20") and (date <= "1793-06-02") and (
                speaker_name in speakers_to_consider):
            party = speakers_to_analyze.loc[speaker_name, "Party"]
            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            for bigram in indv_speech_bigram:
                row_entry_speechid.append([
                    str(bigram), speaker_name, identity,
                    indv_speech_bigram[bigram], party
                ])
                # chronology_speechid = chronology_speechid.append(pd.Series(row_entry_speechid), ignore_index = True)
                row_entry_date.append([
                    str(bigram), speaker_name, date,
                    indv_speech_bigram[bigram], party
                ])
                # chronology_date = chronology_date.append(pd.Series(row_entry_date), ignore_index = True)
                # if bigram in chronology:
                # 	chronology[bigram].append([speaker_name, identity, indv_speech_bigram[bigram]])
                # else:
                # 	chronology[bigram] = []
                # 	chronology[bigram].append([speaker_name, identity, indv_speech_bigram[bigram]])

    chronology_speechid = pd.DataFrame(row_entry_speechid,
                                       columns=[
                                           "Bigram", "Speaker Name",
                                           "Speechid", "Num occurrences",
                                           "Party"
                                       ])
    chronology_date = pd.DataFrame(
        row_entry_date,
        columns=["Bigram", "Speaker Name", "Date", "Num occurrences", "Party"])

    # Create ngram column, speaker name, date, number of occurrences
    # Create two dataframes, one with date and one with speechid
    # Include volume number
    # Do groupby and aggregation methods

    # w = csv.writer(open("chronology.csv", "w"))
    # for key, val in chronology.items():
    # 	if (Girondins[key] >= 10) or (Montagnards[key] >= 10):
    # 		w.writerow([key,val])
    make_visualizations(chronology_date)

    # write_to = pd.ExcelWriter("chronology_speechid.xlsx")
    # chronology_speechid.to_excel(write_to, 'Sheet1')
    # write_to.save()

    # filename = pd.ExcelWriter("chronology_date.xlsx")
    # chronology_date.to_excel(write_to, 'Sheet1')
    # filename.save()

    pickle_filename_2 = "chronology_speechid.pickle"
    with open(pickle_filename_2, 'wb') as handle:
        pickle.dump(chronology_speechid, handle, protocol=0)

    pickle_filename = "chronology_date.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(chronology_date, handle, protocol=0)
Example #15
def track_murmures_applaudissements(raw_speeches, speechid_to_speaker):
    speakers_to_analyze = load_list(
        "Girondins and Montagnards New Mod Limit.xlsx")
    speakers_to_consider = []
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))
    murmures = []
    applaudissements = []
    Girondins_murmures = 0
    Montagnards_murmures = 0
    Girondins_applaudissements = 0
    Montagnards_applaudissements = 0
    murmures_by_date = {}
    applaudissements_by_date = {}
    total_murmures = 0
    total_applaudissements = 0
    murmures_speakers = {}
    applaudissements_speakers = {}
    for speechid, speech in raw_speeches.items():
        speaker_name = speechid_to_speaker[speechid]
        if speaker_name in speakers_to_consider:
            # speechids start with the yyyy-mm-dd date, which is 10 characters
            date = speechid[0:10]
            party = speakers_to_analyze.loc[speaker_name, "Party"]
            if "murmure" in speech:
                total_murmures += 1
                murmures.append(speechid)
                if party == "Girondins":
                    Girondins_murmures += 1
                else:
                    Montagnards_murmures += 1
                if date in murmures_by_date:
                    murmures_by_date[date] += 1
                else:
                    murmures_by_date[date] = 1
                if speaker_name in murmures_speakers:
                    murmures_speakers[speaker_name] += 1
                else:
                    murmures_speakers[speaker_name] = 1
            if "applaudissement" in speech:
                total_applaudissements += 1
                applaudissements.append(speechid)
                if party == "Girondins":
                    Girondins_applaudissements += 1
                else:
                    Montagnards_applaudissements += 1
                if date in applaudissements_by_date:
                    applaudissements_by_date[date] += 1
                else:
                    applaudissements_by_date[date] = 1
                if speaker_name in applaudissements_speakers:
                    applaudissements_speakers[speaker_name] += 1
                else:
                    applaudissements_speakers[speaker_name] = 1
        else:
            if "murmure" in speech:
                total_murmures += 1
            if "applaudissement" in speech:
                total_applaudissements += 1

    with open('gir_murmures.txt', 'w') as f:
        f.write('%d' % Girondins_murmures)
    with open('mont_murmures.txt', 'w') as f:
        f.write('%d' % Montagnards_murmures)
    print Montagnards_murmures + Girondins_murmures

    with open('total_murmures.txt', 'w') as f:
        f.write('%d' % total_murmures)
    with open('total_applaudissements.txt', 'w') as f:
        f.write('%d' % total_applaudissements)

    with open('gir_applaudissements.txt', 'w') as f:
        f.write('%d' % Girondins_applaudissements)
    with open('mont_applaudissements.txt', 'w') as f:
        f.write('%d' % Montagnards_applaudissements)
    print Montagnards_applaudissements + Girondins_applaudissements

    with open('murmures_by_date.pickle', 'wb') as handle:
        pickle.dump(murmures_by_date, handle, protocol=0)

    with open('applaudissements_by_date.pickle', 'wb') as handle:
        pickle.dump(applaudissements_by_date, handle, protocol=0)

    w = csv.writer(open("murmures_by_date.csv", "w"))
    for key, val in murmures_by_date.items():
        w.writerow([key, val])

    w = csv.writer(open("applaudissements_by_date.csv", "w"))
    for key, val in applaudissements_by_date.items():
        w.writerow([key, val])

    w = csv.writer(open("murmures_speakers.csv", "w"))
    for key, val in murmures_speakers.items():
        w.writerow([key, val])

    w = csv.writer(open("applaudissements_speakers.csv", "w"))
    for key, val in applaudissements_speakers.items():
        w.writerow([key, val])
Example #16
    for key, val in errors_per_page.items():
        if isinstance(key, str):
            key = unicode(key, "ascii", errors="ignore")
        w.writerow([key.encode("utf-8", errors="ignore"), val[0], val[1]])

    # Save and output frequency of errors per word per volume
    store_to_pickle(word_freq_wrong, "word_freq_errors.pickle")
    w = csv.writer(open("word_freq_errors.csv", "w"))
    for key, val in word_freq_wrong.items():
        w.writerow([key, val])


if __name__ == '__main__':
    # words = parseEnc()
    # pickle_filename = "enc_words.pickle"
    # with open(pickle_filename, 'wb') as handle:
    # 	pickle.dump(words, handle, protocol = 0)
    enc_words = pickle.load(open("enc_words.pickle", "rb"))
    stopwords_from_file = open('FrenchStopwords.txt', 'r')
    lines = stopwords_from_file.readlines()
    french_stopwords = []
    for line in lines:
        word = line.split(',')
        #remove returns and new lines at the end of stop words so the parser catches matches
        #also remove accents so the entire analysis is done without accents
        word_to_append = remove_diacritic(
            unicode(word[0].replace("\n", "").replace("\r", ""), 'utf-8'))
        french_stopwords.append(word_to_append)
    checkErrors(enc_words, french_stopwords)
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker):
	# List of the speakers we care about
	speakers_to_consider = []
	# Reformats speakers_to_analyze by removing accents in order to match speakers to those in raw_speeches
	# and speechid_to_speaker
	for speaker in speakers_to_analyze.index.values:
		speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

	# Initialize various data frames for export to the classification script
	train_total_freq_unigram = {}
	test_total_freq_unigram = {}
	train_total_freq_bigram = {}
	test_total_freq_bigram = {}
	train_number_speeches = 0
	test_number_speeches = 0
	
	# Keeps track of which speeches contain the given bigram
	train_speeches_bigram = collections.defaultdict(dict)
	test_speeches_bigram = collections.defaultdict(dict)
	train_speeches_unigram = collections.defaultdict(dict)
	test_speeches_unigram = collections.defaultdict(dict)

	bigrams_to_speeches = collections.defaultdict()
	bigram_doc_freq = collections.defaultdict()
	unigram_doc_freq = collections.defaultdict()

	gir_num_speeches = 0
	mont_num_speeches = 0
	gir_docs = {}
	mont_docs = {}

	for speaker_name in speakers_to_consider:
		print speaker_name
		party = speakers_to_analyze.loc[speaker_name, "Party"]
		speech = Counter()
		# Variable to keep track of a given speaker's number of speeches
		speech_num = 0
		for identity in raw_speeches:
			date = re.findall(date_regex, str(identity))[0]
			# Only look at speeches within the date frame and that are from the speaker of interest
			if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
				# Only looking at speeches with substance, so greater than 100 characters
				if len(raw_speeches[identity]) >= 100:
					indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
					indv_speech_unigram = compute_ngrams(raw_speeches[identity], 1)
					# Splitting the data into training and test data with 1/4 of each speaker's data in the test set
					if speech_num%4 != 0:
						train_number_speeches += 1
						for bigram in indv_speech_bigram:
							augment(bigram_doc_freq, bigram)
							augment(train_total_freq_bigram, bigram)
						for unigram in indv_speech_unigram:
							augment(unigram_doc_freq, unigram)
							augment(train_total_freq_unigram, unigram)
						train_speeches_bigram[identity] = indv_speech_bigram
						train_speeches_unigram[identity] = indv_speech_unigram
					else:
						test_number_speeches += 1
						for bigram in indv_speech_bigram:
							augment(test_total_freq_bigram, bigram)
						for unigram in indv_speech_unigram:
							augment(test_total_freq_unigram, unigram)
						test_speeches_bigram[identity] = indv_speech_bigram
						test_speeches_unigram[identity] = indv_speech_unigram

					speech_num += 1
		
	# Write all relevant data objects and values to memory to use when running classification
	store_to_pickle(speakers_to_analyze, "speakers_to_analyze.pickle")
	
	# Set these dataframes to None to conserve memory
	speakers_to_analyze = None
	speechid_to_speaker = None
	raw_speeches = None

	store_to_pickle(train_speeches_bigram, "train_speeches_bigram.pickle")
	store_to_pickle(train_speeches_unigram, "train_speeches_unigram.pickle")
	store_to_pickle(train_total_freq_bigram, "train_total_freq_bigram.pickle")
	store_to_pickle(train_total_freq_unigram, "train_total_freq_unigram.pickle")

	store_to_pickle(bigram_doc_freq, "bigram_doc_freq.pickle")
	store_to_pickle(unigram_doc_freq, "unigram_doc_freq.pickle")
	store_to_pickle(train_number_speeches, "train_number_speeches.pickle")

	store_to_pickle(test_speeches_bigram, "test_speeches_bigram.pickle")
	store_to_pickle(test_speeches_unigram, "test_speeches_unigram.pickle")
	store_to_pickle(test_total_freq_bigram, "test_total_freq_bigram.pickle")
	store_to_pickle(test_total_freq_unigram, "test_total_freq_unigram.pickle")
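The augment helper used throughout the aggregation functions is not shown in this listing; a minimal sketch consistent with its call sites:

def augment(counts, key):
    # Increment a frequency stored in a plain dict, creating the key on
    # first sight (sketch; the original helper may differ)
    if key in counts:
        counts[key] += 1
    else:
        counts[key] = 1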
def read_names(name_file):
	pd_list = pd.read_excel(name_file)
	pd_list = pd_list.set_index('Last Name')
	speakers = pd_list.index.tolist()
	for ind, speaker in enumerate(speakers):
		speakers[ind] = remove_diacritic(speaker).decode('utf-8').lower()
	pd_list.index = speakers
	full_names = []
	for full_name in pd_list["Full Name"]:
		full_names.append(remove_diacritic(full_name).decode('utf-8').lower())
	pd_list["Full Name"] = full_names
	speakers_to_remove = []
	speakers_to_keep = []

	# Need to look if dates of speaker are within the timeframe of the Girondins/Montagnards
	for j, speaker in enumerate(pd_list.index.values):
		valid_date = False
		depute_de = pd_list["Depute de"].iloc[j]
		if depute_de == 1792.0 or depute_de == 1793.0:
			valid_date = True
		depute_a = pd_list["Depute a"].iloc[j]
		if depute_a == 1792.0 or depute_a == 1793.0:
			valid_date = True
		if (depute_de <= 1792.0 and depute_a >= 1792.0) or (depute_de <= 1793.0 and depute_a >= 1793.0):
			valid_date = True
		# Empty cells come through as NaN, which is truthy, so guard with pd.notnull
		depute_de2 = pd_list["Depute puis de 2"].iloc[j]
		if pd.notnull(depute_de2):
			if depute_de2 == 1792.0 or depute_de2 == 1793.0:
				valid_date = True
		depute_a2 = pd_list["Depute a 2"].iloc[j]
		if pd.notnull(depute_a2):
			if depute_a2 == 1792.0 or depute_a2 == 1793.0:
				valid_date = True
		if pd.notnull(depute_de2) and pd.notnull(depute_a2):
			if (depute_de2 <= 1792.0 and depute_a2 >= 1792.0) or (depute_de2 <= 1793.0 and depute_a2 >= 1793.0):
				valid_date = True
		depute_de3 = pd_list["Depute puis de 3"].iloc[j]
		if pd.notnull(depute_de3):
			if depute_de3 == 1792.0 or depute_de3 == 1793.0:
				valid_date = True
		depute_a3 = pd_list["Depute a 3"].iloc[j]
		if pd.notnull(depute_a3):
			if depute_a3 == 1792.0 or depute_a3 == 1793.0:
				valid_date = True
		if pd.notnull(depute_de3) and pd.notnull(depute_a3):
			if (depute_de3 <= 1792.0 and depute_a3 >= 1792.0) or (depute_de3 <= 1793.0 and depute_a3 >= 1793.0):
				valid_date = True


		if valid_date:
			speakers_to_keep.append(j)
		else:
			speakers_to_remove.append(j)


	pd_list = pd_list.iloc[speakers_to_keep]
	pickle_filename = "dated_names.pickle"
	with open(pickle_filename, 'wb') as handle:
		pickle.dump(pd_list, handle, protocol = 0)
	return pd_list
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker,
              Girondins, Montagnards, Plein):
    speaker_names = set()
    speaker_num_speeches = {}
    speaker_char_count = {}
    speakers_to_consider = []

    bigrams_to_speeches = {}
    bigrams_to_speakers = {}
    bigram_doc_freq = collections.defaultdict()

    gir_num_speeches = 0
    mont_num_speeches = 0
    gir_docs = {}
    mont_docs = {}
    plein_docs = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speaker_name in speakers_to_consider:
        print speaker_name
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (
                    speaker_name == speechid_to_speaker[identity]):
                # Keeps track of the number of speeches per speaker as well as the number of characters spoken by each speaker
                # To potentially establish a cutoff for analysis purposes
                augment(speaker_num_speeches, speaker_name)
                if speaker_name in speaker_char_count:
                    speaker_char_count[speaker_name] += len(
                        raw_speeches[identity])
                else:
                    speaker_char_count[speaker_name] = len(
                        raw_speeches[identity])

                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)

                for bigram in indv_speech_bigram:
                    augment(bigram_doc_freq, bigram)

                    # Maintains a list of speeches in which given bigrams are spoken in
                    if bigram in bigrams_to_speeches:
                        bigrams_to_speeches[bigram].append(identity)
                    else:
                        bigrams_to_speeches[bigram] = []
                        bigrams_to_speeches[bigram].append(identity)
                    if bigram in bigrams_to_speakers:
                        bigrams_to_speakers[bigram].add(speaker_name)
                    else:
                        bigrams_to_speakers[bigram] = set()
                        bigrams_to_speakers[bigram].add(speaker_name)

                # Augments the relevant variables according to the party the speaker belongs to
                if party == "Girondins":
                    gir_num_speeches += 1
                    gir_docs = check_num_speakers(indv_speech_bigram,
                                                  speaker_name, gir_docs)
                    try:
                        Girondins = Girondins + indv_speech_bigram
                    except NameError:
                        Girondins = indv_speech_bigram
                else:
                    mont_num_speeches += 1
                    mont_docs = check_num_speakers(indv_speech_bigram,
                                                   speaker_name, mont_docs)
                    try:
                        Montagnards = Montagnards + indv_speech_bigram
                    except NameError:
                        Montagnards = indv_speech_bigram

                #speech = speech + indv_speech_bigram

    # Stores the bigram Counter object for each individual speaker
    # pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
    # with open(pickle_filename, 'wb') as handle:
    # 	pickle.dump(speech, handle, protocol = 0)

    # Stores the bigrams_to_speeches document in Excel
    df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches,
                                                    orient="index")
    write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')
    pickle_filename = "bigrams_to_speakers.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(bigrams_to_speakers, handle, protocol=0)

    pickle_filename = "bigrams_to_speeches.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(bigrams_to_speeches, handle, protocol=0)

    pickle_filename = "bigrams_to_speeches.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(bigrams_to_speeches, handle, protocol=0)

    pickle_filename = "gir_docs.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(gir_docs, handle, protocol=0)

    pickle_filename = "mont_docs.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(mont_docs, handle, protocol=0)

    # bigrams_to_speakers = pickle.load(open("bigrams_to_speakers.pickle", "rb"))
    # bigrams_to_speeches = pickle.load(open("bigrams_to_speeches.pickle", "rb"))

    # gir_docs = pickle.load(open("gir_docs.pickle", "rb"))
    # mont_docs = pickle.load(open("mont_docs.pickle", "rb"))

    # Girondins = pickle.load(open("Girondins_withlimit.pickle", "rb"))
    # Montagnards = pickle.load(open("Montagnards_withlimit.pickle", "rb"))

    bigram_num_speakers = []
    bigram_num_speeches = []
    bigram_total_freq = []
    bg_speeches = {}
    bigrams = []
    speeches = []
    speakers = []
    for bigram in bigrams_to_speeches:
        if (Girondins[bigram] >= 10) or (Montagnards[bigram] >= 10):
            bigram_num_speakers.append(len(bigrams_to_speakers[bigram]))
            bigram_num_speeches.append(len(bigrams_to_speeches[bigram]))
            bigram_total_freq.append(Girondins[bigram] + Montagnards[bigram])
            bigrams.append(str(bigram))
            speeches.append(str(bigrams_to_speeches[bigram]))
            speakers.append(str(bigrams_to_speakers[bigram]))

    bg_num_speakers = pd.DataFrame(bigram_num_speakers,
                                   columns=['Num Speakers'])
    bg_num_speeches = pd.DataFrame(bigram_num_speeches,
                                   columns=['Num Speeches'])
    bg_total_freq = pd.DataFrame(bigram_total_freq, columns=['Total count'])
    bgs = pd.DataFrame(bigrams, columns=["Bigram"])
    speech = pd.DataFrame(speeches, columns=["Speechids"])
    speaker = pd.DataFrame(speakers, columns=["Speakers"])

    bigram_info = pd.DataFrame()
    bigram_info = pd.concat([
        bgs, speech, speaker, bg_num_speeches, bg_num_speakers, bg_total_freq
    ],
                            axis=1)
    writer = pd.ExcelWriter("bigram_info.xlsx")
    bigram_info.to_excel(writer, 'Sheet1')
    writer.save()

    w = csv.writer(open("bigrams_to_speeches_noplein.csv", "w"))
    for key, val in bigrams_to_speeches.items():
        w.writerow([key, val])

    bigrams_to_speakers_noplein_sorted = sorted(bigrams_to_speakers.items(),
                                                key=lambda x: len(x[1]),
                                                reverse=True)
    w = csv.writer(open("bigrams_to_speakers_noplein_sorted.csv", "w"))
    for item in bigrams_to_speakers_noplein_sorted:
        w.writerow([item[0], item[1]])

    # Computes the tf_idf scores for each bigram and for both the Girondins and Montaganards vectors
    # num_speeches = 4479
    # bigram_doc_freq = pickle.load(open("bigram_doc_freq_noplein_withlimit.pickle", 'rb'))

    with open('gir_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % gir_num_speeches)
    with open('mont_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % mont_num_speeches)
    # Total number of party speeches, used below for the tf-idf computation
    num_speeches = gir_num_speeches + mont_num_speeches
    print num_speeches

    with open('speaker_num_speeches_withlimit.pickle', 'wb') as handle:
        pickle.dump(speaker_num_speeches, handle, protocol=0)

    with open('speaker_char_count_withlimit.pickle', 'wb') as handle:
        pickle.dump(speaker_char_count, handle, protocol=0)

    w = csv.writer(open("speaker_num_speeches_withlimit.csv", "w"))
    for key, val in speaker_num_speeches.items():
        w.writerow([key, val])

    w = csv.writer(open("speaker_char_count_withlimit.csv", "w"))
    for key, val in speaker_char_count.items():
        w.writerow([key, val])

    # Write the number of speeches and doc_frequency to memory for use in further analysis
    with open('num_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % num_speeches)
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')

    with open("bigram_doc_freq_noplein_withlimit.pickle", 'wb') as handle:
        pickle.dump(bigram_doc_freq, handle, protocol=0)

    # # Girondins = {k:v for k,v in Girondins.items() if (v >= 10)} #and (len(gir_docs[k]) > 1)}
    # # Montagnards = {k:v for k,v in Montagnards.items() if (v >= 10)} #and (len(mont_docs[k]) > 1)}

    # with open("Girondins_withlimit.pickle", 'wb') as handle:
    # 	pickle.dump(Girondins, handle, protocol = 0)
    # with open("Montagnards_withlimit.pickle", 'wb') as handle:
    # 	pickle.dump(Montagnards, handle, protocol = 0)
    # gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    # mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

    # """with open("gir_tfidf.pickle", 'wb') as handle:
    # 	pickle.dump(gir_tfidf, handle, protocol = 0)
    # with open("mont_tfidf.pickle", 'wb') as handle:
    # 	pickle.dump(mont_tfidf, handle, protocol = 0)"""

    # # Computes the distance between the tf_idf vectors
    # #compute_distance(gir_tfidf, mont_tfidf)

    # # Stores the tf_idf vectors
    # df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient = "index")
    # #df_gir_tfidf.columns = ['Bigrams', 'tfidf']
    # write_to_excel(df_gir_tfidf, 'gir_tfidf_withlimit.xlsx')
    # df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient = "index")
    # #df_mont_tfidf.columns = ['Bigrams', 'tfidf']
    # write_to_excel(df_mont_tfidf, 'mont_tfidf_withlimit.xlsx')

    # df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    # df_tfidf_combined = df_tfidf_combined.transpose()
    # df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    # write_to_excel(df_tfidf_combined, 'combined_tfidf_withlimit.xlsx')

    # Constrains the analysis to bigrams used at least 10 times, and optionally to those used by more than one speaker
    # print gir_docs
    Girondins = {k: v for k, v in Girondins.items() if v >= 10}  # and (len(gir_docs[k]) > 1)
    df_girondins = pd.DataFrame.from_dict(Girondins, orient="index")
    write_to_excel(df_girondins, "Girondins_counts_withlimit.xlsx")

    Montagnards = {k: v for k, v in Montagnards.items() if v >= 10}  # and (len(mont_docs[k]) > 1)
    df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index")
    write_to_excel(df_montagnards, "Montagnards_counts_withlimit.xlsx")

    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

    # # Normalizes the vectors and computes the distance between them
    # #normalized = normalize_dicts(Girondins, Montagnards)
    # #compute_distance(normalized[0], normalized[1])

    # Stores the Girondins and Montagnards frequency vectors in the same document
    df_combined = pd.DataFrame([Girondins, Montagnards])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency_withlimit.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_withlimit.xlsx')
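Two more undefined helpers are assumed by the aggregate functions: compute_tfidf and check_num_speakers. Plausible sketches inferred from the call sites, not taken from the original source:

import math

def compute_tfidf(term_counts, num_speeches, doc_freq):
    # Weight each ngram count by a (smoothed) inverse document frequency
    tfidf = {}
    for term, count in term_counts.items():
        df = doc_freq.get(term, 0) + 1  # +1 guards against division by zero
        tfidf[term] = count * math.log(float(num_speeches) / df)
    return tfidf

def check_num_speakers(speech_ngrams, speaker, docs):
    # Record which speakers used each ngram so counts can later be
    # restricted to ngrams spoken by more than one person
    for ngram in speech_ngrams:
        if ngram in docs:
            docs[ngram].add(speaker)
        else:
            docs[ngram] = set([speaker])
    return docs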
Example #20
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker):
    speaker_names = set()
    speakers_to_consider = []

    # Initialize various data frames for export to the classification script
    train_total_freq_unigram = {}
    test_total_freq_unigram = {}
    train_total_freq_bigram = {}
    test_total_freq_bigram = {}
    train_number_speeches = 0
    test_number_speeches = 0
    # Keeps track of which speeches contain the given bigram
    train_speeches_bigram = collections.defaultdict(dict)
    test_speeches_bigram = collections.defaultdict(dict)
    train_speeches_unigram = collections.defaultdict(dict)
    test_speeches_unigram = collections.defaultdict(dict)

    bigrams_to_speeches = collections.defaultdict()
    bigram_doc_freq = collections.defaultdict()
    unigram_doc_freq = collections.defaultdict()

    gir_num_speeches = 0
    mont_num_speeches = 0
    gir_docs = {}
    mont_docs = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speaker_name in speakers_to_consider:
        print speaker_name
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()
        speech_num = 0
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (
                    speaker_name == speechid_to_speaker[identity]):
                # Only looking at speeches with substance, so greater than 100 characters
                if len(raw_speeches[identity]) >= 100:
                    indv_speech_bigram = compute_ngrams(
                        raw_speeches[identity], 2)
                    indv_speech_unigram = compute_ngrams(
                        raw_speeches[identity], 1)
                    # Splitting the data into training and test data with 1/4 of each speaker's data in the test set
                    if speech_num % 4 != 0:
                        train_number_speeches += 1
                        for bigram in indv_speech_bigram:
                            augment(bigram_doc_freq, bigram)
                            augment(train_total_freq_bigram, bigram)
                        for unigram in indv_speech_unigram:
                            augment(unigram_doc_freq, unigram)
                            augment(train_total_freq_unigram, unigram)
                        train_speeches_bigram[identity] = indv_speech_bigram
                        train_speeches_unigram[identity] = indv_speech_unigram
                    else:
                        test_number_speeches += 1
                        for bigram in indv_speech_bigram:
                            augment(test_total_freq_bigram, bigram)
                        for unigram in indv_speech_unigram:
                            augment(test_total_freq_unigram, unigram)
                        test_speeches_bigram[identity] = indv_speech_bigram
                        test_speeches_unigram[identity] = indv_speech_unigram

                    speech_num += 1

    # Write all relevant data objects and values to memory to use when running classification
    with open("speechid_to_speaker_store.pickle", 'wb') as handle:
        pickle.dump(speechid_to_speaker, handle, protocol=0)
    speechid_to_speaker = None
    with open("speakers_to_analyze_store.pickle", 'wb') as handle:
        pickle.dump(speakers_to_analyze, handle, protocol=0)
    speakers_to_analyze = None
    raw_speeches = None

    with open("train_speeches_bigram.pickle", 'wb') as handle:
        pickle.dump(train_speeches_bigram, handle, protocol=0)
    with open("train_speeches_unigram.pickle", 'wb') as handle:
        pickle.dump(train_speeches_unigram, handle, protocol=0)
    with open("train_total_freq_bigram.pickle", 'wb') as handle:
        pickle.dump(train_total_freq_bigram, handle, protocol=0)
    with open("train_total_freq_unigram.pickle", 'wb') as handle:
        pickle.dump(train_total_freq_unigram, handle, protocol=0)

    with open("bigram_doc_freq.pickle", 'wb') as handle:
        pickle.dump(bigram_doc_freq, handle, protocol=0)
    with open("unigram_doc_freq.pickle", 'wb') as handle:
        pickle.dump(unigram_doc_freq, handle, protocol=0)
    with open("train_number_speeches.pickle", 'wb') as handle:
        pickle.dump(train_number_speeches, handle, protocol=0)

    with open("test_speeches_bigram.pickle", 'wb') as handle:
        pickle.dump(test_speeches_bigram, handle, protocol=0)
    with open("test_speeches_unigram.pickle", 'wb') as handle:
        pickle.dump(test_speeches_unigram, handle, protocol=0)
    with open("test_total_freq_bigram.pickle", 'wb') as handle:
        pickle.dump(test_total_freq_bigram, handle, protocol=0)
    with open("test_total_freq_unigram.pickle", 'wb') as handle:
        pickle.dump(test_total_freq_unigram, handle, protocol=0)
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker,
              Girondins, Montagnards, Plein):
    speaker_names = set()
    speaker_num_speeches = {}
    speaker_char_count = {}
    speakers_to_consider = []

    bigrams_to_speeches = collections.defaultdict()
    bigram_doc_freq = collections.defaultdict()

    gir_num_speeches = 0
    mont_num_speeches = 0
    plein_num_speeches = 0
    gir_docs = {}
    mont_docs = {}
    plein_docs = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        if (date >= "1792-09-20") and (date <= "1793-06-02"):
            # Keeps track of the number of speeches per speaker as well as the number of characters spoken by each speaker
            # To potentially establish a cutoff for analysis purposes
            speaker_name = speechid_to_speaker[identity]
            party = ""
            if speaker_name in speakers_to_consider:
                party = speakers_to_analyze.loc[speaker_name, "Party"]
            else:
                party = "Plein"
            augment(speaker_num_speeches, speaker_name)
            if speaker_name in speaker_char_count:
                speaker_char_count[speaker_name] += len(raw_speeches[identity])
            else:
                speaker_char_count[speaker_name] = len(raw_speeches[identity])
            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            for bigram in indv_speech_bigram:
                augment(bigram_doc_freq, bigram)

                # Maintains a list of speeches in which given bigrams are spoken in
                if bigram in bigrams_to_speeches:
                    bigrams_to_speeches[bigram].append(identity)
                else:
                    bigrams_to_speeches[bigram] = []
                    bigrams_to_speeches[bigram].append(identity)

            # Augments the relevant variables according to the party the speaker belongs to
            if party == "Girondins":
                gir_num_speeches += 1
                gir_docs = check_num_speakers(indv_speech_bigram, speaker_name,
                                              gir_docs)
                try:
                    Girondins = Girondins + indv_speech_bigram
                except NameError:
                    Girondins = indv_speech_bigram
            elif party == "Montagnards":
                mont_num_speeches += 1
                mont_docs = check_num_speakers(indv_speech_bigram,
                                               speaker_name, mont_docs)
                try:
                    Montagnards = Montagnards + indv_speech_bigram
                except NameError:
                    Montagnards = indv_speech_bigram
            # Creates a Plein category that is neither Girondins or Montagnards to better understand speakers that are not distinctly one
            # or the other
            else:
                plein_num_speeches += 1
                plein_docs = check_num_speakers(indv_speech_bigram,
                                                speaker_name, plein_docs)
                try:
                    Plein = Plein + indv_speech_bigram
                except NameError:
                    Plein = indv_speech_bigram

                #speech = speech + indv_speech_bigram

        # Stores the bigram Counter object for each individual speaker
        # pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
        # with open(pickle_filename, 'wb') as handle:
        # 	pickle.dump(speech, handle, protocol = 0)

    # Stores the bigrams_to_speeches document in Excel
    # df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient = "index")
    # write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')

    # Computes the tf_idf scores for each bigram and for both the Girondins and Montaganards vectors
    num_speeches = gir_num_speeches + mont_num_speeches + plein_num_speeches
    print num_speeches

    with open('speaker_num_speeches_withplein.pickle', 'wb') as handle:
        pickle.dump(speaker_num_speeches, handle, protocol=0)

    with open('speaker_char_count_withplein.pickle', 'wb') as handle:
        pickle.dump(speaker_char_count, handle, protocol=0)

    w = csv.writer(open("speaker_num_speeches_withplein.csv", "w"))
    for key, val in speaker_num_speeches.items():
        w.writerow([key, val])

    w = csv.writer(open("speaker_char_count_withplein.csv", "w"))
    for key, val in speaker_char_count.items():
        w.writerow([key, val])

    # Write the number of speeches and doc_frequency to memory for use in further analysis
    with open('num_speeches_withplein.txt', 'w') as f:
        f.write('%d' % num_speeches)
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')

    with open("bigram_doc_freq_withplein.pickle", 'wb') as handle:
        pickle.dump(bigram_doc_freq, handle, protocol=0)

    with open("Girondins_withplein.pickle", 'wb') as handle:
        pickle.dump(Girondins, handle, protocol=0)
    with open("Montagnards_withplein.pickle", 'wb') as handle:
        pickle.dump(Montagnards, handle, protocol=0)
    with open("Plein.pickle", 'wb') as handle:
        pickle.dump(Plein, handle, protocol=0)
    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)
    plein_tfidf = compute_tfidf(Plein, num_speeches, bigram_doc_freq)
    """with open("gir_tfidf.pickle", 'wb') as handle:
		pickle.dump(gir_tfidf, handle, protocol = 0)
	with open("mont_tfidf.pickle", 'wb') as handle:
		pickle.dump(mont_tfidf, handle, protocol = 0)"""

    # Computes the distance between the tf_idf vectors
    #compute_distance(gir_tfidf, mont_tfidf)

    # Stores the tf_idf vectors
    df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient="index")
    #df_gir_tfidf.columns = ['Bigrams', 'tfidf']
    write_to_excel(df_gir_tfidf, 'gir_tfidf_withplein.xlsx')
    df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient="index")
    #df_mont_tfidf.columns = ['Bigrams', 'tfidf']
    write_to_excel(df_mont_tfidf, 'mont_tfidf_withplein.xlsx')
    df_plein_tfidf = pd.DataFrame.from_dict(plein_tfidf, orient="index")
    #df_plein_tfidf.columns = ['Bigrams', 'tfidf']
    write_to_excel(df_plein_tfidf, 'plein_tfidf.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_withplein.xlsx')

    # Restricts the Girondins and Montagnards bigram counts to those with a frequency
    # of at least 3, and optionally to those spoken in more than one speech
    Girondins = {k: v
                 for k, v in Girondins.items()
                 if (v >= 3)}  #and (len(gir_docs[k]) > 1)}
    df_girondins = pd.DataFrame.from_dict(Girondins, orient="index")
    write_to_excel(df_girondins, "Girondins_counts_withplein.xlsx")

    Montagnards = {k: v
                   for k, v in Montagnards.items()
                   if (v >= 3)}  #and (len(mont_docs[k]) > 1)}
    df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index")
    write_to_excel(df_montagnards, "Montagnards_counts_withplein.xlsx")

    # Normalizes the vectors and computes the distance between them
    #normalized = normalize_dicts(Girondins, Montagnards)
    #compute_distance(normalized[0], normalized[1])

    # Stores the Girondins and Montagnards frequency vectors in the same document
    df_combined = pd.DataFrame([Girondins, Montagnards])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency.xlsx')
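
# Hedged sketches of two helpers used above; both are defined elsewhere in
# this codebase, so these only illustrate the assumed behavior. augment is
# assumed to increment a per-key count, creating the entry on first sight;
# compute_tfidf is assumed to scale each bigram count by
# log(total speeches / speeches containing the bigram), so that bigrams
# concentrated in few speeches outweigh ubiquitous ones.
import math

def augment_sketch(counts, key):
    # Increments, treating a missing key as zero
    counts[key] = counts.get(key, 0) + 1

def tfidf_sketch(counts, num_speeches, doc_freq):
    tfidf = {}
    for bigram, count in counts.items():
        # Defaults to 1 to avoid dividing by zero for unseen bigrams
        df = doc_freq.get(bigram, 1)
        tfidf[bigram] = count * math.log(float(num_speeches) / df)
    return tfidf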
def build_vectors(raw_speeches, speechid_to_speaker, speaker_list, speakers_to_analyze, gir_tfidf, mont_tfidf, num_speeches, doc_freq):
	speaker_ngrams = {}
	speakers_to_consider = []
	speaker_distances = collections.defaultdict()
	chronology = collections.defaultdict(dict)

	for speaker in speakers_to_analyze.index.values:
		speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

	for identity in raw_speeches:
		date = re.findall(date_regex, str(identity))[0]
		speaker_name = speechid_to_speaker[identity]
		if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name in speakers_to_consider):
			indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
			if speaker_name in speaker_ngrams:
				speaker_ngrams[speaker_name] = speaker_ngrams[speaker_name] + indv_speech_bigram
			else:
				speaker_ngrams[speaker_name] = indv_speech_bigram
		"""
		if speaker_name in chronology:
			pairing = chronology[speaker_name]
			for bigram in indv_speech_bigram:
				if bigram in pairing:
					pairing[bigram].append([identity, indv_speech_bigram[bigram]])
				else:
					pairing[bigram] = [identity, indv_speech_bigram[bigram]]
		else:
			chronology[speaker_name] = {}
			pairing = chronology[speaker_name]
			for bigram in indv_speech_bigram:
				pairing[bigram] = []
				# stores the unique speechid alongside the number of times that bigram is said in that speech for each bigram
				pairing[bigram] = [identity, indv_speech_bigram[bigram]]"""

	
	## Need tf-idf vectors for gir and mont
	## Need the doc_freq from the previous calculations
	## Compute tf-idf for individual speakers
	## Compute cosine similarity between those vectors (dot product over the product of the vector norms)
	## Compute cosine similarity against the difference between the two group vectors (A - B, subtracted elementwise):
	## if positive, the speaker is more like A; if negative, more like B
	## (see the sketches after this function)

	## Create a tf vector for each speech and store it, so speech vectors can simply be added
	## Separately store a single idf vector

	#########

	gir_dict = convert_keys_to_string(gir_tfidf)
	mont_dict = convert_keys_to_string(mont_tfidf)
	doc_freq_dict = convert_keys_to_string(doc_freq)
	gir_mont_diff = compute_difference(gir_dict, mont_dict)
	#gir_dict = gir_tfidf
	#print gir_dict
	#mont_dict = mont_tfidf
	for speaker in speaker_ngrams:
		speaker_dict = convert_keys_to_string(speaker_ngrams[speaker])
		to_compare = compute_tfidf(speaker_dict, num_speeches, doc_freq_dict)
		gir_dist = cosine_similarity(gir_dict, to_compare)
		mont_dist = cosine_similarity(mont_dict, to_compare)
		# Need to actually compute the distance
		gir_mont_diff_dist = cosine_similarity(gir_mont_diff, to_compare)
		speaker_distances[speaker] = [gir_dist, mont_dist, gir_mont_diff_dist]

	"""
	#speaker_dict = {(str(k),v) for k,v in speaker_ngrams['Francois Chabot']}
	speaker_dict = convert_keys_to_string(speaker_ngrams['Francois Chabot'])
	to_compare = compute_tfidf(speaker_dict, num_speeches, doc_freq)
	gir_dist = cosine_similarity(gir_dict, to_compare)
	df = pd.DataFrame([to_compare, gir_dict])
	df = df.transpose()
	write_to_excel(df, "Francois Chabot Test.xlsx")"""

	
	"""for speaker in speaker_ngrams:
		#to_compare = {k:v for k,v in speaker_ngrams[speaker].items() if (v >= 3)}
		to_compare = speaker_ngrams[speaker]
		gir_dict = gir_tfidf
		mont_dict = mont_tfidf
		gir_normalized = normalize_dicts(to_compare, gir_dict)
		gir_dist = 	compute_distance(gir_normalized[0], gir_normalized[1])
		to_compare = speaker_ngrams[speaker]
		mont_normalized = normalize_dicts(to_compare, mont_dict)
		mont_dist = compute_distance(mont_normalized[0], mont_normalized[1])
		speaker_distances[speaker] = [gir_dist, mont_dist]"""
	pickle_filename_3 = "speaker_ngrams.pickle"
	with open(pickle_filename_3, 'wb') as handle:
		pickle.dump(speaker_ngrams, handle, protocol = 0)

	df = pd.DataFrame.from_dict(speaker_distances)
	df = df.transpose()
	df.columns = ["dist to Girondins", "dist to Montagnards", "dist to difference"]
	filename = "freq_dist_map.xlsx"
	writer = pd.ExcelWriter(filename)
	df.to_excel(writer, 'Sheet1')
	writer.save()

	pickle_filename = "freq_dist.pickle"
	with open(pickle_filename, 'wb') as handle:
		pickle.dump(speaker_distances, handle, protocol = 0)

	"""df2 = pd.DataFrame.from_dict(chronology)
def checkErrors(enc_words, french_stopwords):
    files = os.listdir("AP_ARTFL_vols/")
    errors_per_vol = {}
    errors_per_page = {}
    word_freq_wrong = {}

    for filename in files:
        if filename.endswith(".xml"):
            filename = open('AP_ARTFL_vols/' + filename, "r")
            volno = re.findall(vol_regex, str(filename))[0]
            print(volno)
            contents = filename.read()
            soup = BeautifulSoup(contents, 'lxml')

            num_errors = 0
            num_words_vol = 0
            word_freq = {}

            # Iterate through contents and find all page tags
            pb_tags = []
            last_index = 0
            while True:
                loc = contents.find("<pb n=", last_index)
                if loc == -1:
                    break
                pb_tags.append(loc)
                last_index = loc + 1

            # Iterates through all page tags and looks through the contents on each page,
            # checking each word against the words contained in the Encyclopedie
            for i in range(0, len(pb_tags) - 1):
                contents_substr = contents[pb_tags[i]:pb_tags[i + 1]]
                page_num = BeautifulSoup(contents_substr,
                                         'lxml').find_all('pb')
                pb_soup = BeautifulSoup(contents_substr, 'lxml')

                pageno = volno + "_pg" + page_num[0].get("n")
                error_per_page = 0
                num_words_pg = 0

                text = unicode(contents_substr, "ascii", errors="ignore")
                text = remove_diacritic(text).decode('utf-8')
                paragraph = remove_stopwords(text, french_stopwords)
                paragraph = paragraph.replace("\n", " ").replace(
                    ")", "").replace("*", "").replace(":", "").replace(
                        "-", "").replace("_", "").replace("(", "").replace(
                            "& ",
                            "").replace("; ", "").replace(".", "").replace(
                                ",", "").replace("?", "").replace("!", "")
                paragraph = re.sub(r'([0-9]{1,4})', ' ', paragraph)
                words = paragraph.split(" ")
                num_words_vol += len(words)
                num_words_pg += len(words)
                for word in words:
                    if word not in enc_words:
                        if word in word_freq:
                            word_freq[word] += 1
                        else:
                            word_freq[word] = 1
                        error_per_page += 1
                        num_errors += 1

                errors_per_page[pageno] = [error_per_page, num_words_pg]

            word_freq_wrong[volno] = sorted(word_freq.items(),
                                            key=lambda kv: kv[1])
            errors_per_vol[volno] = [num_errors, num_words_vol]

    # Save and output errors per volume
    store_to_pickle(errors_per_vol, "errors_per_vol.pickle")
    w = csv.writer(open("errors_per_vol.csv", "w"))
    for key, val in errors_per_vol.items():
        if isinstance(key, str):
            key = unicode(key, "ascii", errors="ignore")
        w.writerow([key, val[0], val[1]])

    # Save and output errors per page
    store_to_pickle(errors_per_page, "errors_per_page.pickle")
    w = csv.writer(open("errors_per_page.csv", "w"))
    for key, val in errors_per_page.items():
        if isinstance(key, str):
            key = unicode(key, "ascii", errors="ignore")
        w.writerow([key.encode("utf-8", errors="ignore"), val[0], val[1]])

    # Save and output frequency of errors per word per volume
    store_to_pickle(word_freq_wrong, "word_freq_errors.pickle")
    w = csv.writer(open("word_freq_errors.csv", "w"))
    for key, val in word_freq_wrong.items():
        w.writerow([key, val])
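
# A minimal sketch of the remove_stopwords helper used in checkErrors above
# (the real implementation lives elsewhere in this codebase and may differ):
# drops every token found in the French stopword list before the error check.
def remove_stopwords_sketch(text, stopwords):
    return " ".join(w for w in text.split(" ") if w not in stopwords)
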
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards):
	speaker_num_speeches = {}
	speaker_char_count = {}
	
	# List to keep track of the speakers we care about
	speakers_to_consider = []
	# Reformats speakers_to_analyze by removing accents in order to match speakers to those in raw_speeches
	# and speechid_to_speaker
	for speaker in speakers_to_analyze.index.values:
		speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

	# Matches bigrams to the list of speakers and speeches that have that bigram
	bigrams_to_speeches = {}
	bigrams_to_speakers = {}

	# Maintains the number of documents each bigram appears in, for use with tf-idf
	bigram_doc_freq = collections.defaultdict()

	gir_num_speeches = 0
	mont_num_speeches = 0
	gir_docs = {}
	mont_docs = {}

	for speaker_name in speakers_to_consider:
		print(speaker_name)
		party = speakers_to_analyze.loc[speaker_name, "Party"]
		speech = Counter()
		for identity in raw_speeches:
			date = re.findall(date_regex, str(identity))[0]
			if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
				# Keeps track of the number of speeches per speaker as well as the number of characters spoken by each speaker
				# To potentially establish a cutoff for analysis purposes
				augment(speaker_num_speeches, speaker_name)
				if speaker_name in speaker_char_count:
					speaker_char_count[speaker_name] += len(raw_speeches[identity])
				else:
					speaker_char_count[speaker_name] = len(raw_speeches[identity])

				indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)

				for bigram in indv_speech_bigram:
					augment(bigram_doc_freq, bigram)

					# Maintains a list of the speeches in which each bigram appears
					if bigram in bigrams_to_speeches:
						bigrams_to_speeches[bigram].append(identity)
					else:
						bigrams_to_speeches[bigram] = [identity]
					# Maintains the set of speakers who use each bigram
					if bigram in bigrams_to_speakers:
						bigrams_to_speakers[bigram].add(speaker_name)
					else:
						bigrams_to_speakers[bigram] = set([speaker_name])

				# Augments the relevant variables according to the party the speaker belongs to
				if party == "Girondins":
					gir_num_speeches += 1
					gir_docs = check_num_speakers(indv_speech_bigram, speaker_name, gir_docs)
					try:
						Girondins = Girondins + indv_speech_bigram
					except NameError:
						Girondins = indv_speech_bigram
				else:
					mont_num_speeches += 1
					mont_docs = check_num_speakers(indv_speech_bigram, speaker_name, mont_docs)
					try:
						Montagnards = Montagnards + indv_speech_bigram
					except NameError:
						Montagnards = indv_speech_bigram
			
				### Maintains a Counter of all the bigrams and their counts for a given speaker
				# speech = speech + indv_speech_bigram

	### Stores the bigram Counter object for each individual speaker
		# pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
		# with open(pickle_filename, 'wb') as handle:
		# 	pickle.dump(speech, handle, protocol = 0)

	# Store raw counts
	store_to_pickle(Girondins,"Girondins.pickle")
	store_to_pickle(Montagnards, "Montagnards.pickle")

	# Store in memory aggregate information about each bigram
	bigram_aggregate_info(Girondins, Montagnards, bigrams_to_speakers, bigrams_to_speeches)


	### If data has already been stored to memory, the lines below can be used
	# bigrams_to_speakers = pickle.load(open("bigrams_to_speakers.pickle", "rb"))
	# bigrams_to_speeches = pickle.load(open("bigrams_to_speeches.pickle", "rb"))

	# gir_docs = pickle.load(open("gir_docs.pickle", "rb"))
	# mont_docs = pickle.load(open("mont_docs.pickle", "rb"))

	# Girondins = pickle.load(open("Girondins_withlimit.pickle", "rb"))
	# Montagnards = pickle.load(open("Montagnards_withlimit.pickle", "rb"))

	# bigram_doc_freq = pickle.load(open("bigram_doc_freq.pickle", 'rb'))

	# Hardcoded total number of speeches for this period (precomputed elsewhere)
	num_speeches = 4479

	# Computes counts and tfidf scores for each party and outputs for further analysis in R
	counts_and_tfidf(Girondins, Montagnards, gir_docs, mont_docs, num_speeches, bigram_doc_freq)



	""" EVERYTHING BELOW IS STORING DATA TO MEMORY """
	
	# Stores the bigrams_to_speeches document in Excel
	df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient = "index")
	write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')
	df_bigrams_to_speakers = pd.DataFrame.from_dict(bigrams_to_speakers, orient = "index")
	write_to_excel(df_bigrams_to_speakers, 'bigrams_to_speakers.xlsx')
	df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient = "index")
	write_to_excel(df_doc_freq, 'doc_freq.xlsx')
	
	# Stores files in memory
	store_to_pickle(bigrams_to_speakers, "bigrams_to_speakers.pickle")
	store_to_pickle(bigrams_to_speeches, "bigrams_to_speeches.pickle")
	store_to_pickle(gir_docs, "gir_docs.pickle")
	store_to_pickle(mont_docs, "mont_docs.pickle")
	store_to_pickle(speaker_num_speeches, "speaker_num_speeches.pickle")
	store_to_pickle(speaker_char_count, "speaker_char_count.pickle")
	store_to_pickle(bigram_doc_freq, "bigram_doc_freq.pickle")

	with open('gir_speeches.txt', 'w') as f:
		f.write('%d' % gir_num_speeches)
	with open('mont_speeches.txt', 'w') as f:
		f.write('%d' % mont_num_speeches)

	write_to_csv(speaker_num_speeches, "speaker_num_speeches.csv")
	write_to_csv(speaker_char_count, "speaker_char_count.csv")

	with open('num_speeches.txt', 'w') as f:
		f.write('%d' % num_speeches)
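
# A hedged sketch of what compute_ngrams is assumed to do in aggregate above
# (the real helper is defined elsewhere and may tokenize differently): split
# the speech into words and count adjacent n-word tuples in a Counter, so two
# speeches' bigram profiles can simply be added together.
from collections import Counter

def compute_ngrams_sketch(speech, n):
    words = speech.split(" ")
    ngrams = zip(*[words[i:] for i in range(n)])
    return Counter(ngrams)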
def findSpeeches(raw_speeches, daily_soup, date, volno):
    id_base = date.replace("/", "_")
    number_of_speeches = 0
    for talk in daily_soup.find_all('sp'):
        # Tries to extract the speaker name and edits it for easier pairing with the Excel file
        try:
            speaker = talk.find('speaker').get_text()
            speaker = remove_diacritic(speaker).decode('utf-8')
            speaker = speaker.replace(".", "").replace(":", "").replace(
                "MM ", "").replace("MM. ", "").replace("M ", "").replace(
                    "de ", "").replace("M. ", "").replace("M, ", "").replace(
                        "M- ", "").replace("M; ", "").replace("M* ", "")
            if speaker.endswith(","):
                speaker = speaker[:-1]
            if speaker.endswith(", "):
                speaker = speaker[:-1]
            if speaker.startswith(' M. '):
                speaker = speaker[3:]
            if speaker.startswith(' '):
                speaker = speaker[1:]
            if speaker.endswith(' '):
                speaker = speaker[:-1]
        except AttributeError:
            speaker = ""

        # Piece together full speech if in multiple paragraph tags
        speech = talk.find_all('p')
        text = ""
        full_speech = ""
        for section in speech:
            text = text + section.get_text()
        full_speech = remove_diacritic(text).decode('utf-8')
        full_speech = full_speech.replace("\n",
                                          " ").replace("--",
                                                       " ").replace("!", " ")
        full_speech = re.sub(r'([ ]{2,})', ' ', full_speech)
        full_speech = re.sub(r'([0-9]{1,4})', ' ', full_speech)
        # Speaker name is set to the full speaker name extracted from the Excel file
        speaker_name = ""

        # Only look at speeches not from the president
        if speaker != "Le President":
            if speaker in speaker_list.index.values:
                for j, name in enumerate(speaker_list.index.values):
                    if speaker == name:
                        speaker_name = speaker_list["FullName"].iloc[j]
            else:
                for i, name in enumerate(speaker_list['LastName']):
                    # Ensures we are not looking at a list of multiple speakers
                    if (speaker.find(",") == -1) and (speaker.find(" et ") == -1):
                        # Checks whether the speaker name is embedded in any name from the Excel file
                        if speaker.find(name) != -1:
                            speaker_name = speaker_list["FullName"].iloc[i]
                            # Adds to the speakers_using_find set for a manual check that no names are mischaracterized
                            speakers_using_find.add(
                                speaker + " : " + remove_diacritic(
                                    speaker_name).decode('utf-8') + "; " +
                                str(volno) + "; " + str(date) + "\n")
        # Creates the unique speech id
        if speaker_name != "":
            speaker_name = remove_diacritic(speaker_name).decode('utf-8')
            number_of_speeches = number_of_speeches + 1
            if (speaker_name in speaker_num_total_speeches):
                speaker_num_total_speeches[
                    speaker_name] = speaker_num_total_speeches[speaker_name] + 1
            else:
                speaker_num_total_speeches[speaker_name] = 1
            if (speaker_name in speaker_num_total_chars):
                speaker_num_total_chars[
                    speaker_name] = speaker_num_total_chars[
                        speaker_name] + len(full_speech)
            else:
                speaker_num_total_chars[speaker_name] = len(full_speech)
            if id_base in speakers_per_session:
                speakers_per_session[id_base].add(speaker_name)
            else:
                speakers_per_session[id_base] = set()
                speakers_per_session[id_base].add(speaker_name)
            speakers.add(speaker_name)
            speech_id = "" + id_base + "_" + str(number_of_speeches)
            speechid_to_speaker[speech_id] = speaker_name
            raw_speeches[speech_id] = full_speech
        else:
            names_not_caught.add(speaker + "; " + str(volno) + "; " +
                                 str(date) + "\n")

    speeches_per_day[id_base] = number_of_speeches
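
# Worked example of the speech-id scheme used above: a session dated
# "1792/09/20" yields id_base "1792_09_20", and its third speech gets the
# unique id "1792_09_20_3", which keys both speechid_to_speaker and
# raw_speeches.
date_example = "1792/09/20"
id_base_example = date_example.replace("/", "_")
speech_id_example = id_base_example + "_" + str(3)
assert speech_id_example == "1792_09_20_3"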
def findSpeeches(raw_speeches, multiple_speakers, daily_soup, date, volno):
    id_base = date.replace("/", "_")
    number_of_speeches = 0
    presidents = [
        ">le President", "Le President", "Mle President", "President",
        "le' President", "Le Preesident", "Le Preseident", "Le Presidant",
        "Le Presideait", "le Presiden", "le President", "Le president",
        "le president", "Le President,", "Le Presideut", "Le Presidtent",
        "le Presient", "le Presldent", "le'President"
    ]
    for talk in daily_soup.find_all('sp'):
        # Tries to extract the speaker name and edits it for easier pairing with the Excel file
        try:
            speaker = talk.find('speaker').get_text()
            speaker = remove_diacritic(speaker).decode('utf-8')
            speaker = speaker.replace(".", "").replace(":", "").replace(
                "MM ", "").replace("MM. ", "").replace("M ", "").replace(
                    "de ", "").replace("M. ", "").replace("M, ", "").replace(
                        "M- ", "").replace("M; ", "").replace("M* ", "")
            if speaker.endswith(","):
                speaker = speaker[:-1]
            if speaker.endswith(", "):
                speaker = speaker[:-1]
            if speaker.startswith(' M. '):
                speaker = speaker[3:]
            if speaker.startswith(' '):
                speaker = speaker[1:]
            if speaker.endswith(' '):
                speaker = speaker[:-1]
        except AttributeError:
            speaker = ""

        while talk.find("note"):
            ftnotes = talk.note.extract()

        # Piece together full speech if in multiple paragraph tags
        speech = talk.find_all('p')
        text = ""
        full_speech = ""
        for section in speech:
            text = text + " " + section.get_text()
        full_speech = remove_diacritic(text).decode('utf-8')
        # Strips footnote markers like "(1)" together with up to 100 characters that follow
        full_speech = re.sub(r'\([0-9]{1,3}\)[\w\W]{1,100}', ' ', full_speech)
        full_speech = full_speech.replace("\n",
                                          " ").replace("--",
                                                       " ").replace("!", " ")
        full_speech = re.sub(r'([ ]{2,})', ' ', full_speech)
        full_speech = re.sub(r'([0-9]{1,4})', ' ', full_speech)
        # Speaker name is set to the full speaker name extracted from the Excel file
        speaker_name = ""

        #####
        # THIS IS THE INITIAL ATTEMPT AT SPEAKER DISAMBIGUATION
        # Only look at speeches not from the president
        if speaker not in presidents:
            if speaker in speaker_list.index.values:
                for j, name in enumerate(speaker_list.index.values):
                    if speaker == name:
                        speaker_name = speaker_list["FullName"].iloc[j]
            else:
                for i, name in enumerate(speaker_list['LastName']):
                    # Ensures we are not looking at a list of multiple speakers
                    if (speaker.find(",") == -1) and (speaker.find(" et ") != -1):
                        # Only store multiple speakers when the speech is at least 100 characters long
                        speaker_name = "multi"
                        if len(full_speech) >= 100:
                            multiple_speakers[speaker] = [
                                full_speech,
                                str(volno), str(date)
                            ]
                    else:
                        # Checks whether the speaker name is embedded in any name from the Excel file
                        if speaker.find(name) != -1:
                            speaker_name = speaker_list["FullName"].iloc[i]
                            # Adds to the speakers_using_find set for a manual check that no names are mischaracterized
                            speakers_using_find.add(
                                speaker + " : " + remove_diacritic(
                                    speaker_name).decode('utf-8') + "; " +
                                str(volno) + "; " + str(date) + "\n")
        else:
            speaker_name = "president"
        # Creates the unique speech id
        if (speaker_name is not "") and (speaker_name is not "multi") and (
                speaker_name is not "president"):
            speaker_name = remove_diacritic(speaker_name).decode('utf-8')
            number_of_speeches = number_of_speeches + 1
            if (speaker_name in speaker_num_total_speeches):
                speaker_num_total_speeches[
                    speaker_name] = speaker_num_total_speeches[speaker_name] + 1
            else:
                speaker_num_total_speeches[speaker_name] = 1
            if (speaker_name in speaker_num_total_chars):
                speaker_num_total_chars[
                    speaker_name] = speaker_num_total_chars[
                        speaker_name] + len(full_speech)
            else:
                speaker_num_total_chars[speaker_name] = len(full_speech)
            if id_base in speakers_per_session:
                speakers_per_session[id_base].add(speaker_name)
            else:
                speakers_per_session[id_base] = set()
                speakers_per_session[id_base].add(speaker_name)
            speakers.add(speaker_name)
            speech_id = "" + id_base + "_" + str(number_of_speeches)
            speechid_to_speaker[speech_id] = speaker_name
            raw_speeches[speech_id] = full_speech
        else:
            if (speaker_name is not "multi") and (speaker_name
                                                  is not "president"):
                names_not_caught.add(speaker + "; " + str(volno) + "; " +
                                     str(date) + "\n")

    speeches_per_day[id_base] = number_of_speeches
def findSpeeches(raw_speeches, multiple_speakers, daily_soup, date, volno):
    id_base = date.replace("/", "_")
    number_of_speeches = 0
    presidents = [
        ">le President", "Le President", "Mle President", "President",
        "le' President", "Le Preesident", "Le Preseident", "Le Presidant",
        "Le Presideait", "le Presiden", "le President", "Le president",
        "le president", "Le President,", "Le Presideut", "Le Presidtent",
        "le Presient", "le Presldent", "le'President"
    ]
    full_speaker_names = pickle.load(open("dated_names.pickle", "rb"))
    for talk in daily_soup.find_all('sp'):
        # Tries to extract the speaker name and edits it for easier pairing with the Excel file
        try:
            speaker = talk.find('speaker').get_text()
            speaker = remove_diacritic(speaker).decode('utf-8')
            speaker = speaker.replace("M.", "").replace("MM ", "").replace(
                "MM. ", "").replace("M ", "").replace("de ", "").replace(
                    "M. ", "").replace("M, ", "").replace("M- ", "").replace(
                        "M; ", "").replace("M* ", "").replace(".", "").replace(
                            ":", "").replace("-", " ")
            if speaker.endswith(","):
                speaker = speaker[:-1]
            if speaker.endswith(", "):
                speaker = speaker[:-1]
            if speaker.startswith(' M. '):
                speaker = speaker[3:]
            if speaker.startswith(' '):
                speaker = speaker[1:]
            if speaker.endswith(' '):
                speaker = speaker[:-1]
        except AttributeError:
            speaker = ""

        speaker = speaker.lower()

        # Removes the footnotes
        while talk.find("note"):
            ftnotes = talk.note.extract()

        # Piece together full speech if in multiple paragraph tags
        speech = talk.find_all('p')
        text = ""
        full_speech = ""
        parano = 0
        speaker_note = ""
        for section in speech:
            # Finds information in parentheses, which generally contains the department name
            if parano == 0:
                para = section.get_text()
                if len(para) > 1:
                    if para[0] == "(" or para[1] == "(":
                        speaker_notes = re.findall(r'\([\s\S]{0,300}\)', para)
                        if speaker_notes:
                            speaker_note = speaker_notes[0]
                        else:
                            speaker_note = ""
            text = text + " " + section.get_text()
            parano += 1
        full_speech = remove_diacritic(text).decode('utf-8')
        full_speech = re.sub(r'\([0-9]{1,3}\)[\w\W]{1,100}', ' ', full_speech)
        full_speech = full_speech.replace("\n",
                                          " ").replace("--",
                                                       " ").replace("!", " ")
        full_speech = re.sub(r'([ ]{2,})', ' ', full_speech)
        full_speech = re.sub(r'([0-9]{1,4})', ' ', full_speech)

        # Conduct name_disambiguation
        full_speaker_names = read_names("APnames.xlsx")
        # full_speaker_names = pickle.load(open("dated_names.pickle", "rb"))
        if (speaker.find(",") != -1) and (speaker.find(" et ") != -1):
            #only store multiple speakers when length of speech greater than 100
            speaker_name = "multi"
            if len(full_speech) >= 100:
                multiple_speakers[speaker] = [
                    full_speech, str(volno),
                    str(date)
                ]
        elif (speaker.find(" et ") != -1):
            speaker_name = "multi"
            if len(full_speech) >= 100:
                multiple_speakers[speaker] = [
                    full_speech, str(volno),
                    str(date)
                ]
        else:
            # Checks that we have not already tried to disambiguate this speaker
            if speaker not in speakers_seen:
                matches = compute_speaker_Levenshtein_distance(
                    speaker, full_speaker_names)
                speaker_dists.append(
                    [speaker, matches, volno, date, speaker_note])
                for full_speaker in matches:
                    speaker_dists_split.append([
                        speaker, full_speaker[0], full_speaker[1], volno, date,
                        speaker_note
                    ])
        speakers_seen.add(speaker)
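
# A hedged sketch of the edit-distance matching assumed in
# compute_speaker_Levenshtein_distance above (the real function is defined
# elsewhere and may normalize or rank differently): classic dynamic-programming
# Levenshtein distance between a raw speaker string and a candidate name,
# where candidates would then be ranked by ascending distance.
def levenshtein_sketch(a, b):
    previous = list(range(len(b) + 1))
    for i, char_a in enumerate(a, 1):
        current = [i]
        for j, char_b in enumerate(b, 1):
            cost = 0 if char_a == char_b else 1
            current.append(min(previous[j] + 1,        # deletion
                               current[j - 1] + 1,     # insertion
                               previous[j - 1] + cost  # substitution
                               ))
        previous = current
    return previous[-1]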
def findSpeeches(raw_speeches, multiple_speakers, daily_soup, date, volno):
    id_base = date.replace("/", "_")
    number_of_speeches = 0
    presidents = [
        ">le President", "Le President", "Mle President", "President",
        "le' President", "Le Preesident", "Le Preseident", "Le Presidant",
        "Le Presideait", "le Presiden", "le President", "Le president",
        "le president", "Le President,", "Le Presideut", "Le Presidtent",
        "le Presient", "le Presldent", "le'President"
    ]
    for talk in daily_soup.find_all('sp'):
        # Tries to extract the speaker name and edits it for easier pairing with the Excel file
        try:
            speaker = talk.find('speaker').get_text()
            speaker = remove_diacritic(speaker).decode('utf-8')
            speaker = speaker.replace(".", "").replace(":", "").replace(
                "-", " ").replace("MM ", "").replace("MM. ", "").replace(
                    "M ", "").replace("de ", "").replace("M. ", "").replace(
                        "M, ", "").replace("M- ",
                                           "").replace("M; ",
                                                       "").replace("M* ", "")
            if speaker.endswith(","):
                speaker = speaker[:-1]
            if speaker.endswith(", "):
                speaker = speaker[:-1]
            if speaker.startswith(' M. '):
                speaker = speaker[3:]
            if speaker.startswith(' '):
                speaker = speaker[1:]
            if speaker.endswith(' '):
                speaker = speaker[:-1]
        except AttributeError:
            speaker = ""

        speaker = speaker.lower()

        # Removes the footnotes
        if talk.find("note"):
            ftnotes = talk.note.extract()
            ftnotes = remove_diacritic(ftnotes.get_text()).decode('utf-8')
            ftnotes = ftnotes.replace("\n", "").replace("\r", "").replace(
                "\t", "").replace("  ", " ")
            speech_id = "" + id_base + "_" + str(number_of_speeches + 1)
            footnotes.append([ftnotes, speaker, speech_id, volno])

        if (speaker.find(",") != -1) and (speaker.find(" et ") != -1):
            #only store multiple speakers when length of speech greater than 100
            speaker_name = "multi"
            if len(full_speech) >= 100:
                multiple_speakers[speaker] = [
                    full_speech, str(volno),
                    str(date)
                ]
        elif (speaker.find(" et ") != -1):
            speaker_name = "multi"
            if len(full_speech) >= 100:
                multiple_speakers[speaker] = [
                    full_speech, str(volno),
                    str(date)
                ]
        else:

            if speaker not in speakers_seen:
                matches = compute_speaker_Levenshtein_distance(speaker)
                speaker_dists.append([speaker, matches, volno, date])
                for full_speaker in matches:
                    speaker_dists_split.append([
                        speaker, full_speaker[0], full_speaker[1], volno, date
                    ])
        speakers_seen.add(speaker)

        # if speaker not in speaker_dists:
        # 	speaker_dists[speaker] = compute_speaker_Levenshtein_distance(speaker)

        # speakers_not_matched = []

        # if speaker not in speaker_dists:
        # 	speaker_distances = compute_speaker_Levenshtein_distance(speaker)
        # 	# Need to look at only the top two and if less than or equal to 1 distance keep it, otherwise say not found
        # 	if speaker_distances[0][1] <= 1:
        # 		speaker = speaker_distances[0][0]
        # 	else:
        # 		speaker_dists[speaker] = speaker_distances

        # Speaker name is set to the full speaker name extracted from the Excel file
        speaker_name = ""

        # Only look at speeches not from the president
        if speaker not in presidents:
            if speaker in speaker_list.index.values:
                for j, name in enumerate(speaker_list.index.values):
                    if speaker == name:
                        speaker_name = speaker_list["FullName"].iloc[j]
            else:
                for i, name in enumerate(speaker_list['LastName']):
                    # Ensures we are not looking at a list of multiple speakers
                    if (speaker.find(",") != -1) and (speaker.find(" et ") != -1):
                        # Only store multiple speakers when the speech is at least 100 characters long
                        speaker_name = "multi"
                        if len(full_speech) >= 100:
                            multiple_speakers[speaker] = [
                                full_speech,
                                str(volno), str(date)
                            ]
                    else:
                        # Checks whether the speaker name is embedded in any name from the Excel file
                        if speaker.find(name) != -1:
                            speaker_name = speaker_list["FullName"].iloc[i]
                            # Adds to the speakers_using_find set for a manual check that no names are mischaracterized
                            speakers_using_find.add(
                                speaker + " : " + remove_diacritic(
                                    speaker_name).decode('utf-8') + "; " +
                                str(volno) + "; " + str(date) + "\n")
        else:
            speaker_name = "president"
        # Creates the unique speech id
        if (speaker_name is not "") and (speaker_name is not "multi") and (
                speaker_name is not "president"):
            speaker_name = remove_diacritic(speaker_name).decode('utf-8')
            number_of_speeches = number_of_speeches + 1
            if (speaker_name in speaker_num_total_speeches):
                speaker_num_total_speeches[
                    speaker_name] = speaker_num_total_speeches[speaker_name] + 1
            else:
                speaker_num_total_speeches[speaker_name] = 1
            if (speaker_name in speaker_num_total_chars):
                speaker_num_total_chars[
                    speaker_name] = speaker_num_total_chars[
                        speaker_name] + len(full_speech)
            else:
                speaker_num_total_chars[speaker_name] = len(full_speech)
            if id_base in speakers_per_session:
                speakers_per_session[id_base].add(speaker_name)
            else:
                speakers_per_session[id_base] = set()
                speakers_per_session[id_base].add(speaker_name)
            speakers.add(speaker_name)
            speech_id = "" + id_base + "_" + str(number_of_speeches)
            speechid_to_speaker[speech_id] = speaker_name
            raw_speeches[speech_id] = full_speech
        else:
            if (speaker_name is not "multi") and (speaker_name
                                                  is not "president"):
                names_not_caught.add(speaker + "; " + str(volno) + "; " +
                                     str(date) + "\n")

    speeches_per_day[id_base] = number_of_speeches