Ejemplo n.º 1
0
def distance_analysis():
    """Compare per-speaker aggregated bigram vectors against the Girondins
    and Montagnards tf-idf vectors (no "Plein" baseline in this variant) and
    write the resulting distances to Excel.

    NOTE(review): this snippet appears truncated -- the triple-quoted string
    opened on its final line is never closed in this excerpt.
    """

    # Load precomputed per-party tf-idf vectors from Excel.
    gir_tfidf = process_excel('girondins_tfidf_allbigrams.xlsx')
    mont_tfidf = process_excel("montagnards_tfidf_allbigrams.xlsx")

    # Stringify the (tuple) bigram keys so later lookups line up.
    gir_dict = convert_keys_to_string(gir_tfidf)
    mont_dict = convert_keys_to_string(mont_tfidf)
    # Per-bigram difference between the two party vectors.
    gir_mont_diff = compute_difference(gir_dict, mont_dict)
    print "here2"  # debug marker (Python 2 print statement)

    #by_month = pickle.load(open("byyearmonth.pickle", "rb"))
    #by_date = pickle.load(open("byfulldate.pickle", "rb"))
    by_speaker = pickle.load(open("byspeaker.pickle", "rb"))
    #by_speaker_allspeakers = pickle.load(open("byspeaker_allspeakers.pickle", "rb"))
    """by_month = create_tfidf_vectors(by_month)
	by_month_dist = compute_distances(by_month, 'aggregation',  gir_dict, mont_dict, plein_dict, gir_mont_diff)
	write_to_excel(by_month_dist, 'by_month_distances.xlsx')

	by_period = aggregate_by_period(by_date)
	by_date = create_tfidf_vectors(by_date)

	by_period_dist = compute_distances(by_period, 'period', gir_dict, mont_dict, plein_dict, gir_mont_diff)
	write_to_excel(by_period_dist, "by_period_distances.xlsx")

	by_date_dist = compute_distances(by_date, 'aggregation',  gir_dict, mont_dict, plein_dict, gir_mont_diff)
	write_to_excel(by_date_dist, 'by_date_distances.xlsx')"""

    # Distances between each speaker's vector and the party vectors.
    #by_speaker = create_tfidf_vectors(by_speaker)
    by_speaker_dist = compute_distances(by_speaker, 'speaker', gir_dict,
                                        mont_dict, gir_mont_diff)
    write_to_excel(by_speaker_dist,
                   'by_speaker_noplein_distances_speaker_withsub.xlsx')
    """by_speaker_allspeakers = create_tfidf_vectors(by_speaker_allspeakers)
def distance_analysis():
    """Variant of distance_analysis that also loads a "Plein" (neither
    Girondins nor Montagnards) tf-idf baseline and passes it into the
    per-speaker distance computation.

    NOTE(review): this snippet appears truncated -- the triple-quoted string
    opened on its final line is never closed in this excerpt.
    """

    #When doing anything but speakers need to change this so accounts for introduction of plein
    gir_tfidf = process_excel('girondins_tfidf_allbigrams.xlsx')
    mont_tfidf = process_excel("montagnards_tfidf_allbigrams.xlsx")
    plein_tfidf = process_excel("plein_tfidf.xlsx")

    # Stringify the (tuple) bigram keys so later lookups line up.
    gir_dict = convert_keys_to_string(gir_tfidf)
    mont_dict = convert_keys_to_string(mont_tfidf)
    plein_dict = convert_keys_to_string(plein_tfidf)
    gir_mont_diff = compute_difference(gir_dict, mont_dict)

    #by_month = pickle.load(open("byyearmonth.pickle", "rb"))
    #by_date = pickle.load(open("byfulldate.pickle", "rb"))
    by_speaker = pickle.load(open("byspeaker.pickle", "rb"))
    #by_speaker_allspeakers = pickle.load(open("byspeaker_allspeakers.pickle", "rb"))
    """by_month = create_tfidf_vectors(by_month)
	by_month_dist = compute_distances(by_month, 'aggregation',  gir_dict, mont_dict, plein_dict, gir_mont_diff)
	write_to_excel(by_month_dist, 'by_month_distances.xlsx')

	by_period = aggregate_by_period(by_date)
	by_date = create_tfidf_vectors(by_date)

	by_period_dist = compute_distances(by_period, 'period', gir_dict, mont_dict, plein_dict, gir_mont_diff)
	write_to_excel(by_period_dist, "by_period_distances.xlsx")

	by_date_dist = compute_distances(by_date, 'aggregation',  gir_dict, mont_dict, plein_dict, gir_mont_diff)
	write_to_excel(by_date_dist, 'by_date_distances.xlsx')"""

    # Distances between each speaker's vector and the three baselines.
    #by_speaker = create_tfidf_vectors(by_speaker)
    by_speaker_dist = compute_distances(by_speaker, 'speaker', gir_dict,
                                        mont_dict, plein_dict, gir_mont_diff)
    write_to_excel(by_speaker_dist, 'by_speaker_distances.xlsx')
    """by_speaker_allspeakers = create_tfidf_vectors(by_speaker_allspeakers)
def distance_analysis():
	"""Variant that loads only the pickled per-speaker counters and calls
	compute_distances with a single argument.

	NOTE(review): this snippet appears truncated -- the trailing triple-quoted
	string is never closed, and unlike the other variants in this file no
	gir/mont tf-idf baselines are loaded here; confirm against the original
	source before relying on it.
	"""

	#by_month = pickle.load(open("byyearmonth.pickle", "rb"))
	#by_date = pickle.load(open("byfulldate.pickle", "rb"))
	by_speaker = pickle.load(open("byspeaker.pickle", "rb"))
	#by_speaker_allspeakers = pickle.load(open("byspeaker_allspeakers.pickle", "rb"))

	"""by_month = create_tfidf_vectors(by_month)
	by_month_dist = compute_distances(by_month, 'aggregation',  gir_dict, mont_dict, plein_dict, gir_mont_diff)
	write_to_excel(by_month_dist, 'by_month_distances.xlsx')

	by_period = aggregate_by_period(by_date)
	by_date = create_tfidf_vectors(by_date)

	by_period_dist = compute_distances(by_period, 'period', gir_dict, mont_dict, plein_dict, gir_mont_diff)
	write_to_excel(by_period_dist, "by_period_distances.xlsx")

	by_date_dist = compute_distances(by_date, 'aggregation',  gir_dict, mont_dict, plein_dict, gir_mont_diff)
	write_to_excel(by_date_dist, 'by_date_distances.xlsx')"""

	# Per-speaker distances, written to Excel.
	#by_speaker = create_tfidf_vectors(by_speaker)
	by_speaker_dist = compute_distances(by_speaker)
	write_to_excel(by_speaker_dist, 'by_speaker_noplein_distances_withlimit_withsub.xlsx')

	"""by_speaker_allspeakers = create_tfidf_vectors(by_speaker_allspeakers)
Ejemplo n.º 4
0
def distance_analysis():
    """Compute distances between per-speaker bigram tf-idf vectors and the
    Girondins/Montagnards group vectors, reading inputs from Excel rather
    than pickles, and write the results to 'by_speaker_distances.xlsx'."""

    # Load precomputed per-party tf-idf vectors from Excel.
    gir_tfidf = process_excel('girondins_tfidf_allbigrams.xlsx')
    mont_tfidf = process_excel("montagnards_tfidf_allbigrams.xlsx")

    # Stringify the (tuple) bigram keys so later lookups line up.
    gir_dict = convert_keys_to_string(gir_tfidf)
    mont_dict = convert_keys_to_string(mont_tfidf)
    # Per-bigram difference between the two party vectors.
    gir_mont_diff = compute_difference(gir_dict, mont_dict)

    # Aggregated count data per month / date / speaker (only the speaker
    # table is used by the live code below).
    by_month = pd.read_excel("By_Month.xlsx")
    by_date = pd.read_excel("By_Date.xlsx")
    by_speaker = pd.read_excel("By_Speaker_Convention.xlsx")
    """by_month = create_tfidf_vectors(by_month)
	by_month_dist = compute_distances(by_month, 'month',  gir_dict, mont_dict, gir_mont_diff)
	write_to_excel(by_month_dist, 'by_month_distances.xlsx')

	by_date = create_tfidf_vectors(by_date)
	by_period = aggregate_by_period(by_date)

	by_period_dist = compute_distances(by_period, 'period', gir_dict, mont_dict, gir_mont_diff)
	write_to_excel(by_period_dist, "by_period_distances.xlsx")

	by_date_dist = compute_distances(by_date, 'date',  gir_dict, mont_dict, gir_mont_diff)
	write_to_excel(by_date_dist, 'by_date_distances.xlsx')"""

    # Convert raw per-speaker counts to tf-idf, then compute distances.
    by_speaker = create_tfidf_vectors(by_speaker)
    by_speaker_dist = compute_distances(by_speaker, 'speaker', gir_dict,
                                        mont_dict, gir_mont_diff)
    write_to_excel(by_speaker_dist, 'by_speaker_distances.xlsx')
Ejemplo n.º 5
0
def calculate_chronology(raw_speeches, speechid_to_speaker, speaker_list,
                         speakers_to_analyze, Girondins, Montagnards):
    """Trace bigram usage across the Convention window (1792-09-20 through
    1793-06-02) for the speakers under analysis.

    Builds two tables -- one row per (bigram, speech id) and one per
    (bigram, date), each carrying the occurrence count and the speaker's
    party -- then visualizes the by-date table and persists both to Excel
    and pickle.
    """
    speaker_ngrams = {}
    # Normalize accented names so they match raw_speeches/speechid_to_speaker.
    speakers_to_consider = [
        remove_diacritic(name).decode('utf-8')
        for name in speakers_to_analyze.index.values
    ]
    speaker_distances = collections.defaultdict()
    chronology = collections.defaultdict(dict)

    speechid_rows = []
    date_rows = []
    for identity in raw_speeches:
        # Speech ids embed their date; extract it for the period filter.
        date = re.findall(date_regex, str(identity))[0]
        speaker_name = speechid_to_speaker[identity]
        # Skip speeches outside the Convention window or by other speakers.
        if date < "1792-09-20" or date > "1793-06-02":
            continue
        if speaker_name not in speakers_to_consider:
            continue
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
        # Emit one row per bigram, keyed by speech id and by date.
        for bigram, occurrences in indv_speech_bigram.items():
            speechid_rows.append(
                [str(bigram), speaker_name, identity, occurrences, party])
            date_rows.append(
                [str(bigram), speaker_name, date, occurrences, party])

    chronology_speechid = pd.DataFrame(
        speechid_rows,
        columns=["Bigram", "Speaker Name", "Speechid", "Num occurrences",
                 "Party"])
    chronology_date = pd.DataFrame(
        date_rows,
        columns=["Bigram", "Speaker Name", "Date", "Num occurrences", "Party"])

    # w = csv.writer(open("chronology.csv", "w"))
    # for key, val in chronology.items():
    # 	if (Girondins[key] >= 10) or (Montagnards[key] >= 10):
    # 		w.writerow([key,val])
    make_visualizations(chronology_date)

    write_to_excel(chronology_speechid, "chronology_speechid.xlsx")
    write_to_excel(chronology_date, "chronology_date.xlsx")

    store_to_pickle(chronology_speechid, "chronology_speechid.pickle")
    store_to_pickle(chronology_date, "chronology_date.pickle")
Ejemplo n.º 6
0
def groupby_date(df, ngrams):
	"""Aggregate per-speech ngram counters by full date.

	Args:
		df: DataFrame with 'Speechid' and 'Full Date' columns.
		ngrams: mapping from speech id to that speech's ngram counter.

	Side effects: writes "bydate.xlsx" and "bydate.pickle".
	"""
	bydate_dict = {}

	for i, speechid in enumerate(df['Speechid']):
		date = df['Full Date'].iloc[i]
		dict_ngrams = ngrams[speechid]
		if date in bydate_dict:
			# BUG FIX: the original read from the undefined name
			# 'byydate_dict' here, raising NameError the first time a
			# date repeated; accumulate into the real dict instead.
			bydate_dict[date] = bydate_dict[date] + dict_ngrams
		else:
			bydate_dict[date] = dict_ngrams

	bydate = pd.DataFrame.from_dict(bydate_dict, orient = "index")

	write_to_excel(bydate, "bydate.xlsx")
	with open("bydate.pickle", "wb") as handle:
		pickle.dump(bydate, handle, protocol = 0)
Ejemplo n.º 7
0
def groupby_speaker(df, ngrams):
	"""Sum per-speech ngram counters into one counter per speaker, then
	persist the result ("byspeaker.xlsx" and "byspeaker.pickle")."""
	byspeaker_dict = {}

	speaker_col = df['Speaker']
	for pos, sid in enumerate(df['Speechid']):
		who = speaker_col.iloc[pos]
		counts = ngrams[sid]
		if who not in byspeaker_dict:
			# First speech seen for this speaker.
			byspeaker_dict[who] = counts
		else:
			byspeaker_dict[who] = byspeaker_dict[who] + counts

	byspeaker = pd.DataFrame.from_dict(byspeaker_dict, orient = "index")

	write_to_excel(byspeaker, "byspeaker.xlsx")
	with open("byspeaker.pickle", "wb") as handle:
		pickle.dump(byspeaker, handle, protocol = 0)
Ejemplo n.º 8
0
def groupby_yearmonth(df, ngrams):
	"""Roll up per-speech ngram counters by year-month, then persist the
	result ("byyearmonth.xlsx" and "byyearmonth.pickle")."""
	byyearmonth_dict = {}

	month_col = df['Year-Month']
	for pos, sid in enumerate(df['Speechid']):
		ym = month_col.iloc[pos]
		counts = ngrams[sid]
		if ym not in byyearmonth_dict:
			# First speech seen for this year-month.
			byyearmonth_dict[ym] = counts
		else:
			byyearmonth_dict[ym] = byyearmonth_dict[ym] + counts

	byyearmonth = pd.DataFrame.from_dict(byyearmonth_dict, orient = "index")

	write_to_excel(byyearmonth, "byyearmonth.xlsx")
	with open("byyearmonth.pickle", "wb") as handle:
		pickle.dump(byyearmonth, handle, protocol = 0)
def counts_and_tfidf(Girondins, Montagnards, gir_docs, mont_docs, num_speeches, bigram_doc_freq):
	"""Compute and persist tf-idf vectors for the Girondins and Montagnards
	bigram counters, both unrestricted and after a frequency >= 10 cutoff.

	Outputs: gir/mont tf-idf pickles and Excel sheets, a combined tf-idf
	sheet, restricted-counter pickles, and combined "restricted" frequency
	and tf-idf sheets.
	"""

	# Computes the tfidf scores within each group
	gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
	mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

	store_to_pickle(gir_tfidf, "gir_tfidf.pickle")
	store_to_pickle(mont_tfidf, "mont_tfidf.pickle")

	# Stores the tf_idf vectors in Excel
	df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient = "index")
	write_to_excel(df_gir_tfidf, 'gir_tfidf.xlsx')

	df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient = "index")
	write_to_excel(df_mont_tfidf, 'mont_tfidf.xlsx')

	# Combines the tfidf vectors of both parties into one file
	df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
	df_tfidf_combined = df_tfidf_combined.transpose()
	df_tfidf_combined.columns = ["Girondins", "Montagnards"]
	write_to_excel(df_tfidf_combined, 'combined_tfidf.xlsx')

	# Limits based on v, or the number of times that bigram appears, and gir or mont docs, the number of 
	# speakers in each group that use that bigram
	# Can change the name of these dataframes to illuminate what the restrictions are
	Girondins_restricted = {k:v for k,v in Girondins.items() if (v >= 10)} #and (len(gir_docs[k]) > 1)}
	Montagnards_restricted = {k:v for k,v in Montagnards.items() if (v >= 10)} #and (len(mont_docs[k]) > 1)}

	store_to_pickle(Girondins_restricted, "Girondins_restricted.pickle")
	store_to_pickle(Montagnards_restricted, "Montagnards_restricted.pickle")

	# NOTE(review): these recompute tf-idf from the UNRESTRICTED counters,
	# not Girondins_restricted/Montagnards_restricted, even though the
	# outputs below are named "_restricted" -- confirm whether the
	# restricted dicts were intended here.
	gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
	mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

	# Stores the Girondins and Montagnards frequency vectors and tfidfs in the same document according to restrictions
	df_combined = pd.DataFrame([Girondins, Montagnards])
	df_combined = df_combined.transpose()
	df_combined.columns = ["Girondins", "Montagnards"]
	write_to_excel(df_combined, 'combined_frequency_restricted.xlsx')

	df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
	df_tfidf_combined = df_tfidf_combined.transpose()
	df_tfidf_combined.columns = ["Girondins", "Montagnards"]
	write_to_excel(df_tfidf_combined, 'combined_tfidf_restricted.xlsx')
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker,
              Girondins, Montagnards, Plein):
    """Aggregate bigram counts per party for the speakers under analysis
    (speeches dated 1792-09-20 through 1793-06-02), track which speeches
    and speakers each bigram occurs in, and persist counts, document
    frequencies, and tf-idf vectors (with a frequency >= 10 cutoff).

    NOTE(review): despite taking a Plein parameter, this variant never
    updates it -- speakers not labeled Girondins fall into the
    Montagnards branch.
    """
    speaker_names = set()
    speaker_num_speeches = {}   # speaker -> number of speeches
    speaker_char_count = {}     # speaker -> total characters spoken
    speakers_to_consider = []

    bigrams_to_speeches = {}    # bigram -> list of speech ids containing it
    bigrams_to_speakers = {}    # bigram -> set of speakers using it
    bigram_doc_freq = collections.defaultdict()

    gir_num_speeches = 0
    mont_num_speeches = 0
    gir_docs = {}
    mont_docs = {}
    plein_docs = {}

    # Normalize accented names so they match speechid_to_speaker entries.
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speaker_name in speakers_to_consider:
        print speaker_name
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()
        for identity in raw_speeches:
            # Speech ids embed their date; filter to the Convention window.
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (
                    speaker_name == speechid_to_speaker[identity]):
                # Keeps track of the number of speeches per speaker as well as the number of characters spoken by each speaker
                # To potentially establish a cutoff for analysis purposes
                augment(speaker_num_speeches, speaker_name)
                if speaker_name in speaker_char_count:
                    speaker_char_count[speaker_name] += len(
                        raw_speeches[identity])
                else:
                    speaker_char_count[speaker_name] = len(
                        raw_speeches[identity])

                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)

                for bigram in indv_speech_bigram:
                    augment(bigram_doc_freq, bigram)

                    # Maintains a list of speeches in which given bigrams are spoken in
                    if bigram in bigrams_to_speeches:
                        bigrams_to_speeches[bigram].append(identity)
                    else:
                        bigrams_to_speeches[bigram] = []
                        bigrams_to_speeches[bigram].append(identity)
                    if bigram in bigrams_to_speakers:
                        bigrams_to_speakers[bigram].add(speaker_name)
                    else:
                        bigrams_to_speakers[bigram] = set()
                        bigrams_to_speakers[bigram].add(speaker_name)

                # Augments the relevant variables according to the party the speaker belongs to
                if party == "Girondins":
                    gir_num_speeches += 1
                    gir_docs = check_num_speakers(indv_speech_bigram,
                                                  speaker_name, gir_docs)
                    try:
                        Girondins = Girondins + indv_speech_bigram
                    except NameError:
                        Girondins = indv_speech_bigram
                else:
                    mont_num_speeches += 1
                    mont_docs = check_num_speakers(indv_speech_bigram,
                                                   speaker_name, mont_docs)
                    try:
                        Montagnards = Montagnards + indv_speech_bigram
                    except NameError:
                        Montagnards = indv_speech_bigram

                #speech = speech + indv_speech_bigram

    # 	# Stores the bigram Counter object for each individual speaker
    # 	"""pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
    # 	with open(pickle_filename, 'wb') as handle:
    # 		pickle.dump(speech, handle, protocol = 0)"""

    # Stores the bigrams_to_speeches document in Excel
    df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches,
                                                    orient="index")
    write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')
    pickle_filename = "bigrams_to_speakers.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(bigrams_to_speakers, handle, protocol=0)

    pickle_filename = "bigrams_to_speeches.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(bigrams_to_speeches, handle, protocol=0)

    # NOTE(review): exact duplicate of the dump above -- the same file is
    # written twice.
    pickle_filename = "bigrams_to_speeches.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(bigrams_to_speeches, handle, protocol=0)

    pickle_filename = "gir_docs.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(gir_docs, handle, protocol=0)

    pickle_filename = "mont_docs.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(mont_docs, handle, protocol=0)

    # bigrams_to_speakers = pickle.load(open("bigrams_to_speakers.pickle", "rb"))
    # bigrams_to_speeches = pickle.load(open("bigrams_to_speeches.pickle", "rb"))

    # gir_docs = pickle.load(open("gir_docs.pickle", "rb"))
    # mont_docs = pickle.load(open("mont_docs.pickle", "rb"))

    # Girondins = pickle.load(open("Girondins_withlimit.pickle", "rb"))
    # Montagnards = pickle.load(open("Montagnards_withlimit.pickle", "rb"))

    # Build a per-bigram summary (speakers, speeches, total frequency) for
    # bigrams used at least 10 times by either party.
    bigram_num_speakers = []
    bigram_num_speeches = []
    bigram_total_freq = []
    bg_speeches = {}
    bigrams = []
    speeches = []
    speakers = []
    for bigram in bigrams_to_speeches:
        if (Girondins[bigram] >= 10) or (Montagnards[bigram] >= 10):
            bigram_num_speakers.append(len(bigrams_to_speakers[bigram]))
            bigram_num_speeches.append(len(bigrams_to_speeches[bigram]))
            bigram_total_freq.append(Girondins[bigram] + Montagnards[bigram])
            bigrams.append(str(bigram))
            speeches.append(str(bigrams_to_speeches[bigram]))
            speakers.append(str(bigrams_to_speakers[bigram]))

    bg_num_speakers = pd.DataFrame(bigram_num_speakers,
                                   columns=['Num Speakers'])
    bg_num_speeches = pd.DataFrame(bigram_num_speeches,
                                   columns=['Num Speeches'])
    bg_total_freq = pd.DataFrame(bigram_total_freq, columns=['Total count'])
    bgs = pd.DataFrame(bigrams, columns=["Bigram"])
    speech = pd.DataFrame(speeches, columns=["Speechids"])
    speaker = pd.DataFrame(speakers, columns=["Speakers"])

    bigram_info = pd.DataFrame()
    bigram_info = pd.concat([
        bgs, speech, speaker, bg_num_speeches, bg_num_speakers, bg_total_freq
    ],
                            axis=1)
    writer = pd.ExcelWriter("bigram_info.xlsx")
    bigram_info.to_excel(writer, 'Sheet1')
    writer.save()

    w = csv.writer(open("bigrams_to_speeches_noplein.csv", "w"))
    for key, val in bigrams_to_speeches.items():
        w.writerow([key, val])

    # Bigrams ordered by how many distinct speakers use them.
    bigrams_to_speakers_noplein_sorted = sorted(bigrams_to_speakers.items(),
                                                key=lambda x: len(x[1]),
                                                reverse=True)
    w = csv.writer(open("bigrams_to_speakers_noplein_sorted.csv", "w"))
    for item in bigrams_to_speakers_noplein_sorted:
        w.writerow([item[0], item[1]])

    # Computes the tf_idf scores for each bigram and for both the Girondins and Montaganards vectors
    # num_speeches = 4479
    # bigram_doc_freq = pickle.load(open("bigram_doc_freq_noplein_withlimit.pickle", 'rb'))

    with open('gir_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % gir_num_speeches)
    with open('mont_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % mont_num_speeches)
    # NOTE(review): num_speeches is never assigned in this function (the
    # assignment above is commented out), so this line and the later uses
    # raise NameError unless it exists at module level -- confirm source.
    print num_speeches

    with open('speaker_num_speeches_withlimit.pickle', 'wb') as handle:
        pickle.dump(speaker_num_speeches, handle, protocol=0)

    # NOTE(review): this dumps speaker_num_speeches into the char-count
    # pickle; speaker_char_count was presumably intended.
    with open('speaker_char_count_withlimit.pickle', 'wb') as handle:
        pickle.dump(speaker_num_speeches, handle, protocol=0)

    w = csv.writer(open("speaker_num_speeches_withlimit.csv", "w"))
    for key, val in speaker_num_speeches.items():
        w.writerow([key, val])

    w = csv.writer(open("speaker_char_count_withlimit.csv", "w"))
    for key, val in speaker_char_count.items():
        w.writerow([key, val])

    # Write the number of speeches and doc_frequency to memory for use in further analysis
    with open('num_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % num_speeches)
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')

    with open("bigram_doc_freq_noplein_withlimit.pickle", 'wb') as handle:
        pickle.dump(bigram_doc_freq, handle, protocol=0)

    # # Girondins = {k:v for k,v in Girondins.items() if (v >= 10)} #and (len(gir_docs[k]) > 1)}
    # # Montagnards = {k:v for k,v in Montagnards.items() if (v >= 10)} #and (len(mont_docs[k]) > 1)}

    # with open("Girondins_withlimit.pickle", 'wb') as handle:
    # 	pickle.dump(Girondins, handle, protocol = 0)
    # with open("Montagnards_withlimit.pickle", 'wb') as handle:
    # 	pickle.dump(Montagnards, handle, protocol = 0)
    # gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    # mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

    # """with open("gir_tfidf.pickle", 'wb') as handle:
    # 	pickle.dump(gir_tfidf, handle, protocol = 0)
    # with open("mont_tfidf.pickle", 'wb') as handle:
    # 	pickle.dump(mont_tfidf, handle, protocol = 0)"""

    # # Computes the distance between the tf_idf vectors
    # #compute_distance(gir_tfidf, mont_tfidf)

    # # Stores the tf_idf vectors
    # df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient = "index")
    # #df_gir_tfidf.columns = ['Bigrams', 'tfidf']
    # write_to_excel(df_gir_tfidf, 'gir_tfidf_withlimit.xlsx')
    # df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient = "index")
    # #df_mont_tfidf.columns = ['Bigrams', 'tfidf']
    # write_to_excel(df_mont_tfidf, 'mont_tfidf_withlimit.xlsx')

    # df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    # df_tfidf_combined = df_tfidf_combined.transpose()
    # df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    # write_to_excel(df_tfidf_combined, 'combined_tfidf_withlimit.xlsx')

    # Constrains the analysis of Girondins and Montagnards frequencies if the frequency more 3 and optionally if in a certain number of speeches
    # print gir_docs
    Girondins = {k: v
                 for k, v in Girondins.items()
                 if (v >= 10)}  #and (len(gir_docs[k]) > 1)}
    df_girondins = pd.DataFrame.from_dict(Girondins, orient="index")
    write_to_excel(df_girondins, "Girondins_counts_withlimit.xlsx")

    Montagnards = {k: v
                   for k, v in Montagnards.items()
                   if (v >= 10)}  #and (len(mont_docs[k]) > 1)}
    df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index")
    write_to_excel(df_montagnards, "Montagnards_counts_withlimit.xlsx")

    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

    # # Normalizes the vectors and computes the distance between them
    # #normalized = normalize_dicts(Girondins, Montagnards)
    # #compute_distance(normalized[0], normalized[1])

    # Stores the Girondins and Montagnards frequency vectors in the same document
    df_combined = pd.DataFrame([Girondins, Montagnards])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency_withlimit.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_withlimit.xlsx')
Ejemplo n.º 11
0
def firststep():
	"""Load the raw speeches, compute bigrams for every speech, attach
	year-month / full-date / speaker columns to a DataFrame of the raw
	text, write it to "raw_data.xlsx", and then dispatch the per-group
	aggregations (groupby_yearmonth, groupby_date, groupby_speaker).
	"""
	year_month = []
	full_date = []
	speaker = []
	ngrams = {}   # speechid -> bigram counter for that speech

	byyearmonth = pd.DataFrame()
	bydate = pd.DataFrame()
	byspeaker = pd.DataFrame()

	raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
	dataframe = pd.DataFrame.from_dict(raw_speeches, orient = "index")
	dataframe.columns = ['Speeches']
	speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
	file = open('num_speeches.txt', 'r')
	num_speeches = int(file.read())
	doc_freq = pickle.load(open("bigram_doc_freq.pickle", "rb"))

	for speechid in raw_speeches:
		speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)
		ngrams[speechid] = speech_bigrams

		# Speech ids start with the date: chars [0:7] give "YYYY-MM",
		# [0:10] give "YYYY-MM-DD".
		yearmonth = speechid[0:7]
		year_month.append(yearmonth)

		fulldate = speechid[0:10]
		full_date.append(fulldate)

		speaker.append(speechid_to_speaker[speechid])


	dataframe['Year-Month'] = pd.Series(year_month).values
	dataframe['Full Date'] = pd.Series(full_date).values
	dataframe['Speaker'] = pd.Series(speaker).values
	dataframe['Speechid'] = dataframe.index

	write_to_excel(dataframe, "raw_data.xlsx")
	"""with open("ngrams.pickle", "wb") as handle:
		pickle.dump(ngrams, handle, protocol = 0)"""

	"""byyearmonth['YearMonth'] = pd.Series(year_month).values
	byyearmonth['ngrams'] = pd.Series(ngrams).values

	byyearmonth_dict = pd.Series(byyearmonth.ngrams.values, index = byyearmonth.YearMonth).to_dict()

	with open("byyearmonth_dict.pickle", 'wb') as handle:
		pickle.dump(byyearmonth_dict, handle, protocol = 0)

	
	bydate['FullDate'] = pd.Series(full_date).values
	bydate['ngrams'] = pd.Series(ngrams).values

	bydate_dict = pd.Series(bydate.ngrams.values, index = bydate.FullDate).to_dict()

	with open("bydate_dict.pickle", 'wb') as handle:
		pickle.dump(bydate_dict, handle, protocol = 0)

	
	byspeaker['Speaker'] = pd.Series(speaker).values
	byspeaker['ngrams'] = pd.Series(ngrams).values

	byspeaker_dict = pd.Series(byspeaker.ngrams.values, index = byspeaker.Speaker).to_dict()

	with open("byspeaker_dict.pickle", 'wb') as handle:
		pickle.dump(byspeaker_dict, handle, protocol = 0)"""

	# compute ngrams for each speech
	# don't need tfidf because should just add the frequency vectors not the tfidf ones
	# extract year-month
	# extract year-month-date
	# make all of those individual columns and create a pandas dataframe
	# create a function for each grouping and do a pandas groupby

	"""byyearmonth = groupby_yearmonth(dataframe)
	write_to_excel(byyearmonth, "byyearmonth.xlsx")
	byyearmonth = None
	byspeaker = groupby_speaker(dataframe)
	write_to_excel(byspeaker, "byspeaker.xlsx")
	byspeaker = None
	bydate = groupby_date(dataframe)
	write_to_excel(bydate, "bydate.xlsx")
	bydate = None"""

	# Each grouping writes its own Excel/pickle outputs.
	groupby_yearmonth(dataframe, ngrams)
	groupby_date(dataframe, ngrams)
	groupby_speaker(dataframe, ngrams)
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker,
              Girondins, Montagnards, Plein):
    """Aggregate bigram counts over all speeches in the Convention window
    (1792-09-20 through 1793-06-02), assigning each speech to Girondins,
    Montagnards, or a catch-all "Plein" party, then persist counts,
    document frequencies, and tf-idf vectors (with a frequency >= 3
    cutoff on the combined frequency sheets).
    """
    speaker_names = set()
    speaker_num_speeches = {}   # speaker -> number of speeches
    speaker_char_count = {}     # speaker -> total characters spoken
    speakers_to_consider = []

    bigrams_to_speeches = collections.defaultdict()
    bigram_doc_freq = collections.defaultdict()

    gir_num_speeches = 0
    mont_num_speeches = 0
    plein_num_speeches = 0
    gir_docs = {}
    mont_docs = {}
    plein_docs = {}

    # Normalize accented names so they match speechid_to_speaker entries.
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for identity in raw_speeches:
        # Speech ids embed their date; filter to the Convention window.
        date = re.findall(date_regex, str(identity))[0]
        if (date >= "1792-09-20") and (date <= "1793-06-02"):
            # Keeps track of the number of speeches per speaker as well as the number of characters spoken by each speaker
            # To potentially establish a cutoff for analysis purposes
            speaker_name = speechid_to_speaker[identity]
            party = ""
            # Speakers not in the analysis set default to the Plein group.
            if speaker_name in speakers_to_consider:
                party = speakers_to_analyze.loc[speaker_name, "Party"]
            else:
                party = "Plein"
            augment(speaker_num_speeches, speaker_name)
            if speaker_name in speaker_char_count:
                speaker_char_count[speaker_name] += len(raw_speeches[identity])
            else:
                speaker_char_count[speaker_name] = len(raw_speeches[identity])
            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            for bigram in indv_speech_bigram:
                augment(bigram_doc_freq, bigram)

                # Maintains a list of speeches in which given bigrams are spoken in
                if bigram in bigrams_to_speeches:
                    bigrams_to_speeches[bigram].append(identity)
                else:
                    bigrams_to_speeches[bigram] = []
                    bigrams_to_speeches[bigram].append(identity)

            # Augments the relevant variables according to the party the speaker belongs to
            if party == "Girondins":
                gir_num_speeches += 1
                gir_docs = check_num_speakers(indv_speech_bigram, speaker_name,
                                              gir_docs)
                try:
                    Girondins = Girondins + indv_speech_bigram
                except NameError:
                    Girondins = indv_speech_bigram
            elif party == "Montagnards":
                mont_num_speeches += 1
                mont_docs = check_num_speakers(indv_speech_bigram,
                                               speaker_name, mont_docs)
                try:
                    Montagnards = Montagnards + indv_speech_bigram
                except NameError:
                    Montagnards = indv_speech_bigram
            # Creates a Plein category that is neither Girondins or Montagnards to better understand speakers that are not distinctly one
            # or the other
            else:
                plein_num_speeches += 1
                plein_docs = check_num_speakers(indv_speech_bigram,
                                                speaker_name, plein_docs)
                try:
                    Plein = Plein + indv_speech_bigram
                except NameError:
                    Plein = indv_speech_bigram

                #speech = speech + indv_speech_bigram

        # Stores the bigram Counter object for each individual speaker
        """pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
		with open(pickle_filename, 'wb') as handle:
			pickle.dump(speech, handle, protocol = 0)"""
    """# Stores the bigrams_to_speeches document in Excel
	df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient = "index")
	write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')"""

    # Computes the tf_idf scores for each bigram and for both the Girondins and Montaganards vectors
    num_speeches = gir_num_speeches + mont_num_speeches + plein_num_speeches
    print num_speeches

    with open('speaker_num_speeches_withplein.pickle', 'wb') as handle:
        pickle.dump(speaker_num_speeches, handle, protocol=0)

    # NOTE(review): this dumps speaker_num_speeches into the char-count
    # pickle; speaker_char_count was presumably intended.
    with open('speaker_char_count_withplein.pickle', 'wb') as handle:
        pickle.dump(speaker_num_speeches, handle, protocol=0)

    w = csv.writer(open("speaker_num_speeches_withplein.csv", "w"))
    for key, val in speaker_num_speeches.items():
        w.writerow([key, val])

    w = csv.writer(open("speaker_char_count_withplein.csv", "w"))
    for key, val in speaker_char_count.items():
        w.writerow([key, val])

    # Write the number of speeches and doc_frequency to memory for use in further analysis
    with open('num_speeches_withplein.txt', 'w') as f:
        f.write('%d' % num_speeches)
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')

    with open("bigram_doc_freq_withplein.pickle", 'wb') as handle:
        pickle.dump(bigram_doc_freq, handle, protocol=0)

    with open("Girondins_withplein.pickle", 'wb') as handle:
        pickle.dump(Girondins, handle, protocol=0)
    with open("Montagnards_withplein.pickle", 'wb') as handle:
        pickle.dump(Montagnards, handle, protocol=0)
    with open("Plein.pickle", 'wb') as handle:
        pickle.dump(Plein, handle, protocol=0)
    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)
    plein_tfidf = compute_tfidf(Plein, num_speeches, bigram_doc_freq)
    """with open("gir_tfidf.pickle", 'wb') as handle:
		pickle.dump(gir_tfidf, handle, protocol = 0)
	with open("mont_tfidf.pickle", 'wb') as handle:
		pickle.dump(mont_tfidf, handle, protocol = 0)"""

    # Computes the distance between the tf_idf vectors
    #compute_distance(gir_tfidf, mont_tfidf)

    # Stores the tf_idf vectors
    df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient="index")
    #df_gir_tfidf.columns = ['Bigrams', 'tfidf']
    write_to_excel(df_gir_tfidf, 'gir_tfidf_withplein.xlsx')
    df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient="index")
    #df_mont_tfidf.columns = ['Bigrams', 'tfidf']
    write_to_excel(df_mont_tfidf, 'mont_tfidf_withplein.xlsx')
    df_plein_tfidf = pd.DataFrame.from_dict(plein_tfidf, orient="index")
    #df_mont_tfidf.columns = ['Bigrams', 'tfidf']
    write_to_excel(df_plein_tfidf, 'plein_tfidf.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_withplein.xlsx')

    # Constrains the analysis of Girondins and Montagnards frequencies if the frequency more 3 and optionally if in a certain number of speeches
    Girondins = {k: v
                 for k, v in Girondins.items()
                 if (v >= 3)}  #and (len(gir_docs[k]) > 1)}
    df_girondins = pd.DataFrame.from_dict(Girondins, orient="index")
    write_to_excel(df_girondins, "Girondins_counts_withplein.xlsx")

    Montagnards = {k: v
                   for k, v in Montagnards.items()
                   if (v >= 3)}  #and (len(mont_docs[k]) > 1)}
    df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index")
    write_to_excel(df_montagnards, "Montagnards_counts_withplein.xlsx")

    # Normalizes the vectors and computes the distance between them
    #normalized = normalize_dicts(Girondins, Montagnards)
    #compute_distance(normalized[0], normalized[1])

    # Stores the Girondins and Montagnards frequency vectors in the same document
    df_combined = pd.DataFrame([Girondins, Montagnards])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency.xlsx')
Ejemplo n.º 13
0
"""

import pickle
import pandas as pd
from pandas import *
from processing_functions import write_to_excel

if __name__ == '__main__':
    import sys
    # Load previously parsed data: full speech text keyed by speech id, and
    # the speech-id -> speaker-name mapping (both produced by an earlier
    # parsing step — presumably parseFiles; confirm).
    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))

    # Write just the raw speeches to Excel (one row per speech id)
    df = pd.DataFrame.from_dict(raw_speeches, orient = "index")
    filename = "raw_speeches.xlsx"
    write_to_excel(df, filename)
    """writer = pd.ExcelWriter(filename)
    df.to_excel(writer, 'Sheet1')
    writer.save()"""

    # Write just the speaker names to Excel (one row per speech id)
    df2 = pd.DataFrame.from_dict(speechid_to_speaker, orient = "index")
    filename2 = "speechid_to_speaker.xlsx"
    write_to_excel(df2, filename2)
    """writer2 = pd.ExcelWriter(filename2)
    df2.to_excel(writer2, 'Sheet1')
    writer2.save()"""

    # Concatenate the speeches with the speaker names to have all data in one Excel file
    joined = pd.concat([df,df2], axis = 1)
    filename3 = "speeches_and_speakers.xlsx"
    # NOTE(review): `joined` and `filename3` are prepared but no
    # write_to_excel(joined, filename3) call is visible here — confirm the
    # combined export step was not lost.
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards):
	"""Aggregate per-party bigram counts over the period 1792-09-20 to 1793-06-02.

	For each speaker in speakers_to_analyze, finds that speaker's speeches in
	raw_speeches, computes per-speech bigram counts, and accumulates them into
	the Girondins or Montagnards tally according to the speaker's "Party".
	Also tracks, per bigram, which speeches and speakers used it, and per
	speaker, speech and character counts.  Side effects: pickles the tallies
	and bookkeeping dicts, writes Excel/CSV summaries, and writes per-party
	speech-count text files.

	Assumed inputs (TODO confirm against caller):
	- speakers_to_analyze: DataFrame indexed by speaker name, with a "Party" column
	- raw_speeches: dict mapping speech id (whose string form embeds a date) to speech text
	- speechid_to_speaker: dict mapping speech id to speaker name
	- Girondins, Montagnards: Counter-like accumulators of bigram counts
	"""
	# Per-speaker bookkeeping: number of speeches and total characters spoken.
	speaker_num_speeches = {}
	speaker_char_count = {}
	
	# Dataframe to keep track of the speakers we care about
	speakers_to_consider = []
	# Reformats speakers_to_analyze by removing accents in order to match speakers to those in raw_speeches
	# and speechid_to_speaker
	for speaker in speakers_to_analyze.index.values:
		speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

	# Matches bigrams to the list of speakers and speeches that have that bigram
	bigrams_to_speeches = {}
	bigrams_to_speakers = {}

	# Maintains the number of documents a given bigram is spoken in for use with tf-idf
	bigram_doc_freq = collections.defaultdict()

	gir_num_speeches = 0
	mont_num_speeches = 0
	gir_docs = {}
	mont_docs = {}

	for speaker_name in speakers_to_consider:
		print speaker_name
		party = speakers_to_analyze.loc[speaker_name, "Party"]
		speech = Counter()
		for identity in raw_speeches:
			# The speech id embeds its date; extract it to restrict to the period.
			# The string comparisons below rely on dates being ISO-formatted
			# (YYYY-MM-DD), as the bound literals suggest.
			date = re.findall(date_regex, str(identity))[0]
			if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
				# Keeps track of the number of speeches per speaker as well as the number of characters spoken by each speaker
				# To potentially establish a cutoff for analysis purposes
				augment(speaker_num_speeches, speaker_name)
				if speaker_name in speaker_char_count:
					speaker_char_count[speaker_name] += len(raw_speeches[identity])
				else:
					speaker_char_count[speaker_name] = len(raw_speeches[identity])

				indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)

				for bigram in indv_speech_bigram:
					augment(bigram_doc_freq, bigram)

					# Maintains a list of speeches in which given bigrams are spoken in
					if bigram in bigrams_to_speeches:
						bigrams_to_speeches[bigram].append(identity)
					else:
						bigrams_to_speeches[bigram] = []
						bigrams_to_speeches[bigram].append(identity)
					if bigram in bigrams_to_speakers:
						bigrams_to_speakers[bigram].add(speaker_name)
					else:
						bigrams_to_speakers[bigram] = set()
						bigrams_to_speakers[bigram].add(speaker_name)

				# Augments the relevant variables according to the party the speaker belongs to
				if party == "Girondins":
					gir_num_speeches += 1
					gir_docs = check_num_speakers(indv_speech_bigram, speaker_name, gir_docs)
					# NOTE(review): Girondins is a parameter, so the NameError
					# branch should be unreachable; it looks like a leftover from
					# when the tally was built up locally — confirm the parameter
					# is always a Counter.
					try:
						Girondins = Girondins + indv_speech_bigram
					except NameError:
						Girondins = indv_speech_bigram
				else:
					mont_num_speeches += 1
					mont_docs = check_num_speakers(indv_speech_bigram, speaker_name, mont_docs)
					try:
						Montagnards = Montagnards + indv_speech_bigram
					except NameError:
						Montagnards = indv_speech_bigram
			
				### Maintains a Counter of all the bigrams and their counts for a given speaker
				# speech = speech + indv_speech_bigram

	### Stores the bigram Counter object for each individual speaker
		# pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
		# with open(pickle_filename, 'wb') as handle:
		# 	pickle.dump(speech, handle, protocol = 0)

	# Store raw counts
	store_to_pickle(Girondins,"Girondins.pickle")
	store_to_pickle(Montagnards, "Montagnards.pickle")

	# Store in memory aggregate information about each bigram
	bigram_aggregate_info(Girondins, Montagnards, bigrams_to_speakers, bigrams_to_speeches)


	### If data has already been stored to memory, the lines below can be used
	# bigrams_to_speakers = pickle.load(open("bigrams_to_speakers.pickle", "rb"))
	# bigrams_to_speeches = pickle.load(open("bigrams_to_speeches.pickle", "rb"))

	# gir_docs = pickle.load(open("gir_docs.pickle", "rb"))
	# mont_docs = pickle.load(open("mont_docs.pickle", "rb"))

	# Girondins = pickle.load(open("Girondins_withlimit.pickle", "rb"))
	# Montagnards = pickle.load(open("Montagnards_withlimit.pickle", "rb"))

	# bigram_doc_freq = pickle.load(open("bigram_doc_freq.pickle", 'rb'))

	# Hard-coded total speech count used for the tf-idf computation.
	# TODO confirm it matches the dataset actually loaded.
	num_speeches = 4479

	# Computes counts and tfidf scores for each party and outputs for further analysis in R
	counts_and_tfidf(Girondins, Montagnards, gir_docs, mont_docs, num_speeches, bigram_doc_freq)



	""" EVERYTHING BELOW IS STORING DATA TO MEMORY """
	
	# Stores the bigrams_to_speeches document in Excel
	df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient = "index")
	write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')
	df_bigrams_to_speakers = pd.DataFrame.from_dict(bigrams_to_speakers, orient = "index")
	write_to_excel(df_bigrams_to_speakers, 'bigrams_to_speakers.xlsx')
	df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient = "index")
	write_to_excel(df_doc_freq, 'doc_freq.xlsx')
	
	# Stores files in memory
	store_to_pickle(bigrams_to_speakers, "bigrams_to_speakers.pickle")
	store_to_pickle(bigrams_to_speeches, "bigrams_to_speeches.pickle")
	store_to_pickle(gir_docs, "gir_docs.pickle")
	store_to_pickle(mont_docs, "mont_docs.pickle")
	store_to_pickle(speaker_num_speeches, "speaker_num_speeches.pickle")
	store_to_pickle(speaker_char_count, "speaker_char_count.pickle")
	store_to_pickle(bigram_doc_freq, "bigram_doc_freq.pickle")

	with open('gir_speeches.txt', 'w') as f:
		f.write('%d' % gir_num_speeches)
	with open('mont_speeches.txt', 'w') as f:
		f.write('%d' % mont_num_speeches)

	write_to_csv(speaker_num_speeches, "speaker_num_speeches.csv")
	write_to_csv(speaker_char_count, "speaker_char_count.csv")

	with open('num_speeches.txt', 'w') as f:
		f.write('%d' % num_speeches)
        if date.attrs:
            relevant_dates.append(date)
    if (len(relevant_dates) > 0):
        return (relevant_dates[0]['value'])
    else:
        return ("error")


if __name__ == '__main__':
    import sys

    raw_speeches = {}
    multiple_speakers = {}
    # parseFiles fills raw_speeches / multiple_speakers in place and,
    # presumably, the module-level speaker_dists / speaker_dists_split lists
    # used below — TODO confirm where those are populated.
    parseFiles(raw_speeches, multiple_speakers)

    # Stores data in files to then be merged with AN dataset
    speaker_distances = pd.DataFrame(speaker_dists,
                                     columns=[
                                         "Speaker Name", "Levenshtein Dists",
                                         "Volno", "Date", "Departments/Notes"
                                     ])
    write_to_excel(speaker_distances, "speaker_distances.xlsx")

    speaker_distances_split = pd.DataFrame(speaker_dists_split,
                                           columns=[
                                               "Speaker Name", "Full Name",
                                               "Distance", "Volno", "Date",
                                               "Department/Notes"
                                           ])
    write_to_excel(speaker_distances_split, "speaker_distances_split.xlsx")
			ftnotes = ftnotes.replace("\n","").replace("\r","").replace("\t","").replace("  "," ")
			footnotes.append([ftnotes, speaker, speech_id, volno])
		number_of_speeches += 1


# Parses dates from file being analyzed
def extractDate(soup_file):
	"""Return the 'value' attribute of the first <date> tag that carries
	attributes, or the string "error" when no such tag exists."""
	attributed = [tag for tag in soup_file.find_all('date') if tag.attrs]
	if not attributed:
		return("error")
	return(attributed[0]['value'])

if __name__ == '__main__':
	import sys
	# Authority list of known speakers used to resolve name variants.
	speaker_list = load_speakerlist('Copy of AP_Speaker_Authority_List_Edited_3.xlsx')

	raw_speeches = {}
	multiple_speakers = {}
	parseFiles(raw_speeches, multiple_speakers)

	# NOTE(review): `footnotes` is presumably a module-level list populated by
	# parseFiles — confirm; it is re-bound here to a DataFrame for export.
	footnotes = pd.DataFrame(footnotes, columns = ["Footnote", "Speaker", "Speechid", "Volno"])

	write_to_excel(footnotes, "footnotes.xlsx")