Code example #1
import csv
import pickle


def firststep():

    byspeaker = {}
    speakerdict = {}

    byspeaker_allspeakers = {}
    speakerdict_allspeakers = {}

    ngrams = {}

    speakers_to_consider = []

    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    # dataframe = pd.DataFrame.from_dict(raw_speeches, orient = "index")
    # dataframe.columns = ['Speeches']
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
    # file = open('num_speeches.txt', 'r')
    # num_speeches = int(file.read())
    # doc_freq = pickle.load(open("bigram_doc_freq.pickle", "rb"))
    speakers_to_analyze = load_list("Girondins and Montagnards New Mod.xlsx")

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speechid in raw_speeches:
        fulldate = speechid[0:10]
        if (fulldate >= "1792-09-20") and (fulldate <= "1793-06-02"):
            speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)

            speaker = speechid_to_speaker[speechid]

            print(speaker)

            if speaker in byspeaker_allspeakers:
                byspeaker_allspeakers[
                    speaker] = byspeaker_allspeakers[speaker] + speech_bigrams
            else:
                byspeaker_allspeakers[speaker] = speech_bigrams
            speech_bigrams = None

    with open("byspeaker_allspeakers.pickle", "wb") as handle:
        pickle.dump(byspeaker_allspeakers, handle, protocol=0)

    w = csv.writer(open("byspeaker_allspeakers.csv", "w"))
    for key, val in byspeaker_allspeakers.items():
        w.writerow([key, val])
    # byspeaker_allspeakers = pd.DataFrame.from_dict(byspeaker_allspeakers, orient="index")
Code example #2
def aggregate_by_speaker():

    byspeaker = {}
    speakerdict = {}

    ngrams = {}

    speakers_to_consider = []

    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
    speakers_to_analyze = load_list(
        "Girondins and Montagnards New Mod Limit.xlsx")
    speaker_num_words = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speechid in raw_speeches:
        fulldate = speechid[0:10]
        if (fulldate >= "1792-09-20") and (fulldate <= "1793-06-02"):
            num_words = len(raw_speeches[speechid].split())
            speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)

            speaker = speechid_to_speaker[speechid]

            if speaker in speaker_num_words:
                speaker_num_words[speaker] += num_words
            else:
                speaker_num_words[speaker] = num_words

            if speaker in speakers_to_consider:
                if speaker in byspeaker:
                    byspeaker[speaker] = byspeaker[speaker] + speech_bigrams
                else:
                    byspeaker[speaker] = speech_bigrams
            speech_bigrams = None

    write_to_csv(byspeaker)
    store_to_pickle(byspeaker)

    write_to_csv(speaker_num_words)
    store_to_pickle(speaker_num_words)
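Examples #1 and #2 call several project helpers that are not included in these excerpts: load_list reads the speaker spreadsheet into a pandas DataFrame, remove_diacritic evidently returns bytes (its result is decoded with .decode('utf-8')), compute_ngrams appears to return a collections.Counter (per-speaker results are merged with +), and write_to_csv / store_to_pickle handle output. The following is a minimal sketch of compute_ngrams and the two output helpers under those assumptions; the tokenization and the output file names are guesses, not the project's actual implementation.

import csv
import pickle
from collections import Counter


def compute_ngrams(speech, n):
    # Sketch only: lowercase, split on whitespace, and count n-grams as
    # space-joined strings. The real project presumably does more cleaning.
    tokens = speech.lower().split()
    return Counter(" ".join(tokens[i:i + n])
                   for i in range(len(tokens) - n + 1))


def store_to_pickle(data, filename="output.pickle"):
    # Hypothetical file name; uses the same pickle protocol as the examples.
    with open(filename, "wb") as handle:
        pickle.dump(data, handle, protocol=0)


def write_to_csv(data, filename="output.csv"):
    # Hypothetical file name; one row per dictionary entry, mirroring the
    # inline csv.writer loops in example #1.
    with open(filename, "w") as handle:
        w = csv.writer(handle)
        for key, val in data.items():
            w.writerow([key, val])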
Code example #3
# Excerpt: this snippet begins inside the aggregate() function, partway through
# its loops over speaker_name and speech identity; the enclosing definition and
# loop headers are not shown.
                    speaker_name == speechid_to_speaker[identity]):

                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)

                for bigram in indv_speech_bigram:
                    if bigram in speaker_bigram_frequencies:
                        #speechid_frequencies = speaker_bigram_frequencies[bigram]
                        #speechid_frequencies[speechid] = indv_speech_bigram[bigram]
                        speaker_bigram_frequencies[bigram][
                            identity] = indv_speech_bigram[bigram]
                    else:
                        speaker_bigram_frequencies[bigram] = {}
                        speaker_bigram_frequencies[bigram][
                            identity] = indv_speech_bigram[bigram]
        filename_pickle = "" + speaker_name + "bigram_frequencies.pickle"
        with open(filename_pickle, 'wb') as handle:
            pickle.dump(speaker_bigram_frequencies, handle, protocol=0)
        filename_csv = "" + speaker_name + "bigram_frequencies.csv"
        w = csv.writer(open(filename_csv, "w"))
        for key, val in speaker_bigram_frequencies.items():
            w.writerow([key, val])


if __name__ == '__main__':
    import sys
    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
    speakers_to_analyze = load_list(
        "Girondins and Montagnards New Mod Limit.xlsx")
    aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker)
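Each pickle written above maps a bigram to a {speechid: count} dictionary for one speaker. A short sketch of reading one of these files back and listing that speaker's most frequent bigrams; the speaker name below is only a hypothetical example.

import pickle

# Hypothetical speaker; the code above writes "<speaker_name>bigram_frequencies.pickle".
with open("Robespierre" + "bigram_frequencies.pickle", "rb") as handle:
    bigram_frequencies = pickle.load(handle)

# Total count of each bigram across all of this speaker's speeches.
totals = {bigram: sum(per_speech.values())
          for bigram, per_speech in bigram_frequencies.items()}
for bigram, count in sorted(totals.items(), key=lambda kv: kv[1], reverse=True)[:20]:
    print(bigram, count)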
Code example #4
def track_murmures_applaudissements(raw_speeches, speechid_to_speaker):
    speakers_to_analyze = load_list(
        "Girondins and Montagnards New Mod Limit.xlsx")
    speakers_to_consider = []
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))
    murmures = []
    applaudissements = []
    Girondins_murmures = 0
    Montagnards_murmures = 0
    Girondins_applaudissements = 0
    Montagnards_applaudissements = 0
    murmures_by_date = {}
    applaudissements_by_date = {}
    total_murmures = 0
    total_applaudissements = 0
    murmures_speakers = {}
    applaudissements_speakers = {}
    for speechid, speech in raw_speeches.items():
        speaker_name = speechid_to_speaker[speechid]
        if speaker_name in speakers_to_consider:
            date = speechid[0:10]  # full YYYY-MM-DD prefix of the speech id
            party = speakers_to_analyze.loc[speaker_name, "Party"]
            if "murmure" in speech:
                total_murmures += 1
                murmures.append(speechid)
                if party == "Girondins":
                    Girondins_murmures += 1
                else:
                    Montagnards_murmures += 1
                # .get() ensures the first occurrence of a new key is counted
                murmures_by_date[date] = murmures_by_date.get(date, 0) + 1
                murmures_speakers[speaker_name] = murmures_speakers.get(
                    speaker_name, 0) + 1
            if "applaudissement" in speech:
                total_applaudissements += 1
                applaudissements.append(speechid)
                if party == "Girondins":
                    Girondins_applaudissements += 1
                else:
                    Montagnards_applaudissements += 1
                applaudissements_by_date[date] = applaudissements_by_date.get(
                    date, 0) + 1
                applaudissements_speakers[speaker_name] = (
                    applaudissements_speakers.get(speaker_name, 0) + 1)
        else:
            if "murmure" in speech:
                total_murmures += 1
            if "applaudissement" in speech:
                total_applaudissements += 1

    with open('gir_murmures.txt', 'w') as f:
        f.write('%d' % Girondins_murmures)
    with open('mont_murmures.txt', 'w') as f:
        f.write('%d' % Montagnards_murmures)
    print(Montagnards_murmures + Girondins_murmures)

    with open('total_murmures.txt', 'w') as f:
        f.write('%d' % total_murmures)
    with open('total_applaudissements.txt', 'w') as f:
        f.write('%d' % total_applaudissements)

    with open('gir_applaudissements.txt', 'w') as f:
        f.write('%d' % Girondins_applaudissements)
    with open('mont_applaudissements.txt', 'w') as f:
        f.write('%d' % Montagnards_applaudissements)
    print(Montagnards_applaudissements + Girondins_applaudissements)

    with open('murmures_by_date.pickle', 'wb') as handle:
        pickle.dump(murmures_by_date, handle, protocol=0)

    with open('applaudissements_by_date.pickle', 'wb') as handle:
        pickle.dump(applaudissements_by_date, handle, protocol=0)

    w = csv.writer(open("murmures_by_date.csv", "w"))
    for key, val in murmures_by_date.items():
        w.writerow([key, val])

    w = csv.writer(open("applaudissements_by_date.csv", "w"))
    for key, val in applaudissements_by_date.items():
        w.writerow([key, val])

    w = csv.writer(open("murmures_speakers.csv", "w"))
    for key, val in murmures_speakers.items():
        w.writerow([key, val])

    w = csv.writer(open("applaudissements_speakers.csv", "w"))
    for key, val in applaudissements_speakers.items():
        w.writerow([key, val])
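A minimal driver for this function, mirroring the __main__ block in example #3; the pickle file names are the ones used throughout these examples.

import pickle

if __name__ == '__main__':
    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
    track_murmures_applaudissements(raw_speeches, speechid_to_speaker)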
Code example #5
def compute_distances(dataframe, period, gir_dict, mont_dict, gir_mont_diff):
    period_vector = []
    if (period == 'aggregation') or (period == 'speaker'):
        period_vector = list(dataframe.keys())
        period_vector = pd.Series(period_vector)
        speaker_num_speeches = pickle.load(
            open("speaker_num_speeches.pickle", "rb"))
        """period_vector = pd.Series(period_vector)
		tfidf_scores = dataframe['tfidf'].tolist()"""
    else:
        periods = ["Before convention", "Convention", "After convention"]
        period_vector = pd.Series(periods)
        # This assumes that tfidf_scores for the periods is a list not a pandas dataframe

    gir_dist = []
    mont_dist = []
    gir_mont_diff_dist = []
    # This for loop is contingent on tfidf_scores being a list
    for element in dataframe:
        """print type(element)
		print type(dataframe[element])
		to_compare = dataframe[element]"""
        if period == 'speaker':
            #gir = pickle.load(open("Girondins.pickle", "rb"))
            #mont = pickle.load(open("Montagnards.pickle", "rb"))

            # Consider dividing by number of speeches, to normalize
            # Maintain num of speeches per group and number of chars per group

            gir = pickle.load(open("Girondins.pickle", "rb"))
            mont = pickle.load(open("Montagnards.pickle", "rb"))

            with open('gir_speeches_noplein.txt', 'r') as f:
                gir_num_speeches = int(f.read())

            with open('mont_speeches_noplein.txt', 'r') as f:
                mont_num_speeches = int(f.read())

            speakers_to_analyze = load_list(
                "Girondins and Montagnards New Mod.xlsx")
            party = speakers_to_analyze.loc[element, "Party"]

            print(element)
            print(party)
            print(type(dataframe[element]))

            if party == 'Girondins':
                gir = gir - dataframe[element]
            if party == 'Montagnards':
                mont = mont - dataframe[element]

            # Normalizing by number of speeches
            #gir_normalized = normalize_by_speeches(gir, gir_num_speeches)
            #mont_normalized = normalize_by_speeches(mont, mont_num_speeches)

            gir_dict = convert_keys_to_string(
                compute_tfidf(gir, num_speeches, doc_freq))
            mont_dict = convert_keys_to_string(
                compute_tfidf(mont, num_speeches, doc_freq))
            gir_mont_diff = compute_difference_withplein(gir_dict, mont_dict)

            # Resets the Gir and Mont vectors to their unnormalized version
            #gir_dict_unnormalized = convert_keys_to_string(compute_tfidf(gir, num_speeches, doc_freq))
            #mont_dict_unnormalized = convert_keys_to_string(compute_tfidf(mont, num_speeches, doc_freq))

            w = csv.writer(open("gir_mont_diff.csv", "w"))
            for key, val in gir_mont_diff.items():
                w.writerow([key, val])

            # Normalizing the speaker data as well
            #speaker_speeches = speaker_num_speeches[element]
            #speaker_dict = normalize_by_speeches(dataframe[element], speaker_speeches)

            speaker_dict = dataframe[element]

            tfidf_speaker = compute_tfidf(speaker_dict, num_speeches, doc_freq)

            to_compare = convert_keys_to_string(tfidf_speaker)
        elif period == 'aggregation':
            to_compare = convert_keys_to_string(dataframe[element])
        else:
            to_compare = convert_keys_to_string(element)
        # Check whether the tfidf_scores vector exists; if it doesn't, default
        # distance values are assigned. This mattered because one speaker had a
        # tfidf_scores vector of length 0.
        if len(to_compare) > 0:
            #Normalized
            gir_dist.append(1 - cosine_similarity(gir_dict, to_compare))
            mont_dist.append(1 - cosine_similarity(mont_dict, to_compare))

            #Unnormalized
            #gir_dist.append(1 - cosine_similarity(gir_dict_unnormalized, to_compare))
            #mont_dist.append(1 - cosine_similarity(mont_dict_unnormalized, to_compare))

            gir_mont_diff_dist.append(
                cosine_similarity(gir_mont_diff, to_compare))
        else:
            gir_dist.append(1)
            mont_dist.append(1)
            gir_mont_diff_dist.append(0)

    # Merges the distance lists and creates a comprehensive dataframe to return
    mont_dist = pd.Series(mont_dist)
    gir_dist = pd.Series(gir_dist)
    gir_mont_diff_dist = pd.Series(gir_mont_diff_dist)
    comp_df = pd.DataFrame(
        [period_vector, gir_dist, mont_dist, gir_mont_diff_dist])
    comp_df = comp_df.transpose()
    comp_df.columns = [
        period, 'distance to gir', 'distance to mont', 'distance to diff'
    ]
    return comp_df
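compute_tfidf, convert_keys_to_string, and cosine_similarity are project helpers not shown in these excerpts, and num_speeches / doc_freq appear to be module-level globals (example #1 contains commented-out code that loads them from num_speeches.txt and bigram_doc_freq.pickle). Below is a minimal sketch of a cosine similarity over the {ngram: tfidf} dictionaries this function passes around; it is an assumption about the helper's behavior, not its actual implementation.

import math


def cosine_similarity(vec_a, vec_b):
    # Both arguments are sparse vectors stored as {term: weight} dictionaries.
    dot = sum(weight * vec_b[term]
              for term, weight in vec_a.items() if term in vec_b)
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)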
Code example #6
def compute_distances(dataframe, period, gir_dict, mont_dict, plein_dict,
                      gir_mont_diff):
    period_vector = []
    if (period == 'aggregation') or (period == 'speaker'):
        period_vector = list(dataframe.keys())
        period_vector = pd.Series(period_vector)
        """period_vector = pd.Series(period_vector)
		tfidf_scores = dataframe['tfidf'].tolist()"""
    else:
        periods = ["Before convention", "Convention", "After convention"]
        period_vector = pd.Series(periods)
        # This assumes that tfidf_scores for the periods is a list not a pandas dataframe

    gir_dist = []
    mont_dist = []
    plein_dist = []
    gir_mont_diff_dist = []
    # This for loop is contingent on tfidf_scores being a list
    for element in dataframe:
        """print type(element)
		print type(dataframe[element])
		to_compare = dataframe[element]"""
        print(element)
        if period == 'speaker':
            #gir = pickle.load(open("Girondins.pickle", "rb"))
            #mont = pickle.load(open("Montagnards.pickle", "rb"))
            gir = pickle.load(open("Girondins_withplein.pickle", "rb"))
            mont = pickle.load(open("Montagnards_withplein.pickle", "rb"))
            speakers_to_analyze = load_list(
                "Girondins and Montagnards New Mod.xlsx")
            party = speakers_to_analyze.loc[element, "Party"]
            if party == 'Girondins':
                gir = gir - dataframe[element]
            if party == 'Montagnards':
                print("here")  # debug marker for the Montagnards branch
                mont = mont - dataframe[element]
            gir_dict = convert_keys_to_string(
                compute_tfidf(gir, num_speeches, doc_freq))
            mont_dict = convert_keys_to_string(
                compute_tfidf(mont, num_speeches, doc_freq))
            gir_mont_diff = compute_difference_withplein(gir_dict, mont_dict)
            tfidf_speaker = compute_tfidf(dataframe[element], num_speeches,
                                          doc_freq)
            to_compare = convert_keys_to_string(tfidf_speaker)
        elif period == 'aggregation':
            to_compare = convert_keys_to_string(dataframe[element])
        else:
            to_compare = convert_keys_to_string(element)
        # Check whether the tfidf_scores vector exists; if it doesn't, default
        # distance values are assigned. This mattered because one speaker had a
        # tfidf_scores vector of length 0.
        if len(to_compare) > 0:
            gir_dist.append(1 - cosine_similarity(gir_dict, to_compare))
            mont_dist.append(1 - cosine_similarity(mont_dict, to_compare))
            plein_dist.append(1 - cosine_similarity(plein_dict, to_compare))
            gir_mont_diff_dist.append(
                cosine_similarity(gir_mont_diff, to_compare))
        else:
            gir_dist.append(1)
            mont_dist.append(1)
            plein_dist.append(1)
            gir_mont_diff_dist.append(0)

    # Merges the distance lists and creates a comprehensive dataframe to return
    gir_dist = pd.Series(gir_dist)
    mont_dist = pd.Series(mont_dist)
    plein_dist = pd.Series(plein_dist)
    gir_mont_diff_dist = pd.Series(gir_mont_diff_dist)
    comp_df = pd.DataFrame(
        [period_vector, gir_dist, mont_dist, gir_mont_diff_dist, plein_dist])
    comp_df = comp_df.transpose()
    comp_df.columns = [
        period, 'distance to gir', 'distance to mont', 'distance to diff',
        'distance to plein'
    ]
    return comp_df
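A sketch of how the returned DataFrame might be used. Everything loaded below is a placeholder: the pickle file names and the precomputed tf-idf dictionaries are illustrative assumptions, not outputs produced by these excerpts.

import pickle

# All file names here are hypothetical placeholders.
by_speaker = pickle.load(open("byspeaker.pickle", "rb"))
gir_dict = pickle.load(open("gir_tfidf.pickle", "rb"))
mont_dict = pickle.load(open("mont_tfidf.pickle", "rb"))
plein_dict = pickle.load(open("plein_tfidf.pickle", "rb"))
gir_mont_diff = pickle.load(open("gir_mont_diff.pickle", "rb"))

comp_df = compute_distances(by_speaker, 'speaker', gir_dict, mont_dict,
                            plein_dict, gir_mont_diff)
comp_df.to_csv("speaker_distances.csv", index=False)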