# Shared imports assumed by the snippets below. compute_ngrams, compute_tfidf,
# remove_diacritic, load_list, augment, add_to_docfreq, check_num_speakers,
# store_to_pickle, write_to_csv, write_to_excel, make_visualizations,
# data_clean, counts_and_tfidf, bigram_aggregate_info, the groupby_* helpers,
# and date_regex are project-local and defined elsewhere.
import collections
import csv
import pickle
import re
from collections import Counter

import pandas as pd
from sklearn import cross_validation, metrics
from sklearn.linear_model import LogisticRegression


def firststep():

    year_month = {}
    byyearmonth = {}

    ngrams = {}

    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))

    for speechid in raw_speeches:
        fulldate = speechid[0:10]
        if (fulldate >= "1792-06-10") and (fulldate <= "1793-08-02"):
            speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)

            yearmonth = speechid[0:7]
            print yearmonth
            if yearmonth in byyearmonth:
                byyearmonth[
                    yearmonth] = byyearmonth[yearmonth] + speech_bigrams
            else:
                byyearmonth[yearmonth] = speech_bigrams
            speech_bigrams = None

            #year_month[speechid] = yearmonth
            #byyearmonth[speechid] = speech_bigrams
    print "here"

    with open("byyearmonth.pickle", "wb") as handle:
        pickle.dump(byyearmonth, handle, protocol=0)

    w = csv.writer(open("byyearmonth.csv", "w"))
    for key, val in byspeaker.items():
        w.writerow([key, val])
    """byyearmonth = pd.DataFrame.from_dict(byyearmonth, orient = "index")
def aggregate_by_speaker(speakers_to_analyze, raw_speeches, speechid_to_speaker):
	speaker_names = set()
	speakers_to_consider = []
	num_speeches = 0
	for speaker in speakers_to_analyze.index.values:
		speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))
	for speaker_name in speakers_to_consider:
		print speaker_name
		speech = ""
		for identity in raw_speeches:
			date = re.findall(date_regex, str(identity))[0]
			if (date >= "1792-09-20") and (speaker_name == speechid_to_speaker[identity]):
				num_speeches = num_speeches + 1
				# Document frequency counts the current speech, not the text accumulated so far
				add_to_docfreq(dict(compute_ngrams(raw_speeches[identity])))
				speech = speech + " " + raw_speeches[identity]
		speaker_ngrams = compute_ngrams(speech)
		pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
		with open(pickle_filename, 'wb') as handle:
			pickle.dump(speaker_ngrams, handle, protocol = 0)
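
# remove_diacritic is another project-local helper. The .decode('utf-8') calls
# on its result suggest it returns a byte string; a plausible standard-library
# sketch (an assumption, not the project's actual implementation):
import unicodedata

def remove_diacritic_sketch(text):
    """Strip accents from text and return a plain ASCII byte string."""
    return unicodedata.normalize('NFKD', unicode(text)).encode('ascii', 'ignore')
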
def create_tfidf_vectors(dataframe):
	speeches = dataframe['concat_speeches'].tolist()
	ngrams = []
	for unit in speeches:
		ngrams.append(compute_ngrams(unit, 2))
	ngrams_to_add = pd.Series(ngrams)
	dataframe['ngrams'] = ngrams_to_add.values
	tfidf = []
	for element in ngrams:
		# num_speeches and doc_freq here are module-level globals loaded elsewhere
		tfidf.append(compute_tfidf(element, num_speeches, doc_freq))
	tfidf_to_add = pd.Series(tfidf)
	dataframe['tfidf'] = tfidf_to_add.values
	return dataframe
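
# compute_tfidf is assumed to weight each n-gram count by inverse document
# frequency. Note that call sites disagree on the third argument (a doc-frequency
# dict above, a "bigram"/"unigram" label in a later snippet), so the real helper
# presumably dispatches on it; this sketch assumes the dict form:
import math

def compute_tfidf_sketch(freqs, num_docs, doc_freq):
    """Return {ngram: tf * idf} from raw counts and per-ngram document frequencies."""
    return {ngram: count * math.log(float(num_docs) / doc_freq[ngram])
            for ngram, count in freqs.items()}
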
def calculate_chronology(raw_speeches, speechid_to_speaker, speaker_list,
                         speakers_to_analyze, Girondins, Montagnards):
    speaker_ngrams = {}
    speakers_to_consider = []
    speaker_distances = collections.defaultdict()
    chronology = collections.defaultdict(dict)

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    row_entry_speechid = []
    row_entry_date = []
    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        speaker_name = speechid_to_speaker[identity]
        if (date >= "1792-09-20") and (date <= "1793-06-02") and (
                speaker_name in speakers_to_consider):
            party = speakers_to_analyze.loc[speaker_name, "Party"]
            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            # Store relevant information for each bigram
            for bigram in indv_speech_bigram:
                row_entry_speechid.append([
                    str(bigram), speaker_name, identity,
                    indv_speech_bigram[bigram], party
                ])
                row_entry_date.append([
                    str(bigram), speaker_name, date,
                    indv_speech_bigram[bigram], party
                ])

    chronology_speechid = pd.DataFrame(row_entry_speechid,
                                       columns=[
                                           "Bigram", "Speaker Name",
                                           "Speechid", "Num occurrences",
                                           "Party"
                                       ])
    chronology_date = pd.DataFrame(
        row_entry_date,
        columns=["Bigram", "Speaker Name", "Date", "Num occurrences", "Party"])

    # w = csv.writer(open("chronology.csv", "w"))
    # for key, val in chronology.items():
    # 	if (Girondins[key] >= 10) or (Montagnards[key] >= 10):
    # 		w.writerow([key,val])
    make_visualizations(chronology_date)

    write_to_excel(chronology_speechid, "chronology_speechid.xlsx")
    write_to_excel(chronology_date, "chronology_date.xlsx")

    store_to_pickle(chronology_speechid, "chronology_speechid.pickle")
    store_to_pickle(chronology_date, "chronology_date.pickle")
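
# store_to_pickle and write_to_excel wrap boilerplate that appears verbatim in
# other snippets in this file (pickle protocol 0, single-sheet ExcelWriter);
# minimal sketches consistent with those call sites:
def store_to_pickle_sketch(obj, filename):
    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle, protocol=0)

def write_to_excel_sketch(dataframe, filename):
    writer = pd.ExcelWriter(filename)
    dataframe.to_excel(writer, 'Sheet1')
    writer.save()
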
def firststep():

    byspeaker = {}
    speakerdict = {}

    byspeaker_allspeakers = {}
    speakerdict_allspeakers = {}

    ngrams = {}

    speakers_to_consider = []

    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    # dataframe = pd.DataFrame.from_dict(raw_speeches, orient = "index")
    # dataframe.columns = ['Speeches']
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
    # file = open('num_speeches.txt', 'r')
    # num_speeches = int(file.read())
    # doc_freq = pickle.load(open("bigram_doc_freq.pickle", "rb"))
    speakers_to_analyze = load_list("Girondins and Montagnards New Mod.xlsx")

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speechid in raw_speeches:
        fulldate = speechid[0:10]
        if (fulldate >= "1792-09-20") and (fulldate <= "1793-06-02"):
            speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)

            speaker = speechid_to_speaker[speechid]

            print speaker

            if speaker in byspeaker_allspeakers:
                byspeaker_allspeakers[
                    speaker] = byspeaker_allspeakers[speaker] + speech_bigrams
            else:
                byspeaker_allspeakers[speaker] = speech_bigrams
            speech_bigrams = None

    with open("byspeaker_allspeakers.pickle", "wb") as handle:
        pickle.dump(byspeaker_allspeakers, handle, protocol=0)

    w = csv.writer(open("byspeaker_allspeakers.csv", "w"))
    for key, val in byspeaker.items():
        w.writerow([key, val])
    """byspeaker_allspeakers = pd.DataFrame.from_dict(byspeaker_allspeakers, orient = "index")
def aggregate_by_speaker():

    byspeaker = {}
    speakerdict = {}

    ngrams = {}

    speakers_to_consider = []

    raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
    speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
    speakers_to_analyze = load_list(
        "Girondins and Montagnards New Mod Limit.xlsx")
    speaker_num_words = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speechid in raw_speeches:
        fulldate = speechid[0:10]
        if (fulldate >= "1792-09-20") and (fulldate <= "1793-06-02"):
            num_words = len(raw_speeches[speechid].split())
            speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)

            speaker = speechid_to_speaker[speechid]

            if speaker in speaker_num_words:
                speaker_num_words[speaker] += num_words
            else:
                speaker_num_words[speaker] = num_words

            if speaker in speakers_to_consider:
                if speaker in byspeaker:
                    byspeaker[speaker] = byspeaker[speaker] + speech_bigrams
                else:
                    byspeaker[speaker] = speech_bigrams
            speech_bigrams = None

    # Filenames here are assumed; write_to_csv and store_to_pickle take
    # (object, filename) at their other call sites in this file
    write_to_csv(byspeaker, "byspeaker.csv")
    store_to_pickle(byspeaker, "byspeaker.pickle")

    write_to_csv(speaker_num_words, "speaker_num_words.csv")
    store_to_pickle(speaker_num_words, "speaker_num_words.pickle")
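
# write_to_csv is assumed to mirror the inline csv.writer loops used elsewhere
# in this file:
def write_to_csv_sketch(dictionary, filename):
    w = csv.writer(open(filename, "w"))
    for key, val in dictionary.items():
        w.writerow([key, val])
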
def aggregate_by_speaker(speakers_to_analyze, raw_speeches,
                         speechid_to_speaker):
    speaker_names = set()
    speakers_to_consider = []
    gir_num_speeches = 0
    mont_num_speeches = 0
    bigrams_speeches = collections.defaultdict()
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))
    for speaker_name in speakers_to_consider:
        print speaker_name
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (speaker_name
                                           == speechid_to_speaker[identity]):
                indv_speech_ngram = compute_ngrams(raw_speeches[identity])
                for bigram in indv_speech_ngram:
                    if bigram in bigrams_speeches:
                        bigrams_speeches[bigram].append(identity)
                    else:
                        bigrams_speeches[bigram] = []
                        bigrams_speeches[bigram].append(identity)
                if party == "Girondins":
                    gir_num_speeches += 1
                else:
                    mont_num_speeches += 1
                speech = speech + indv_speech_ngram
        #speaker_ngrams = compute_ngrams(speech)
        pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
        with open(pickle_filename, 'wb') as handle:
            pickle.dump(speech, handle, protocol=0)

    with open('bigrams_to_speeches.csv', 'wb') as outfile:
        writer = csv.writer(outfile)
        for key, val in bigrams_speeches.items():
            writer.writerow([key, val])

    print gir_num_speeches
    print mont_num_speeches
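
# Design note: the if/else initialization of bigrams_speeches above can be
# collapsed with collections.defaultdict(list), which creates the empty list
# on first access:
def record_bigram_occurrences(indv_speech_ngram, identity, index=None):
    index = index if index is not None else collections.defaultdict(list)
    for bigram in indv_speech_ngram:
        index[bigram].append(identity)  # no membership test needed
    return index
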
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker):
    speakers_to_consider = []
    speaker_bigram_frequencies = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speaker_name in speakers_to_consider:
        print speaker_name
        speaker_bigram_frequencies = {}
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (
                    speaker_name == speechid_to_speaker[identity]):

                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)

                for bigram in indv_speech_bigram:
                    if bigram in speaker_bigram_frequencies:
                        #speechid_frequencies = speaker_bigram_frequencies[bigram]
                        #speechid_frequencies[speechid] = indv_speech_bigram[bigram]
                        speaker_bigram_frequencies[bigram][
                            identity] = indv_speech_bigram[bigram]
                    else:
                        speaker_bigram_frequencies[bigram] = {}
                        speaker_bigram_frequencies[bigram][
                            identity] = indv_speech_bigram[bigram]
        filename_pickle = "" + speaker_name + "bigram_frequencies.pickle"
        with open(filename_pickle, 'wb') as handle:
            pickle.dump(speaker_bigram_frequencies, handle, protocol=0)
        filename_csv = "" + speaker_name + "bigram_frequencies.csv"
        w = csv.writer(open(filename_csv, "w"))
        for key, val in speaker_bigram_frequencies.items():
            w.writerow([key, val])
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker):
    speaker_names = set()
    speakers_to_consider = []

    # Initialize various data frames for export to the classification script
    train_total_freq_unigram = {}
    test_total_freq_unigram = {}
    train_total_freq_bigram = {}
    test_total_freq_bigram = {}
    train_number_speeches = 0
    test_number_speeches = 0
    # Keeps track of which speeches contain the given bigram
    train_speeches_bigram = collections.defaultdict(dict)
    test_speeches_bigram = collections.defaultdict(dict)
    train_speeches_unigram = collections.defaultdict(dict)
    test_speeches_unigram = collections.defaultdict(dict)

    bigrams_to_speeches = collections.defaultdict()
    bigram_doc_freq = collections.defaultdict()
    unigram_doc_freq = collections.defaultdict()

    gir_num_speeches = 0
    mont_num_speeches = 0
    gir_docs = {}
    mont_docs = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speaker_name in speakers_to_consider:
        print speaker_name
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()
        speech_num = 0
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (
                    speaker_name == speechid_to_speaker[identity]):
                # Only looking at speeches with substance, i.e. at least 100 characters
                if len(raw_speeches[identity]) >= 100:
                    indv_speech_bigram = compute_ngrams(
                        raw_speeches[identity], 2)
                    indv_speech_unigram = compute_ngrams(
                        raw_speeches[identity], 1)
                    # Splitting the data into training and test data with 1/4 of each speaker's data in the test set
                    if speech_num % 4 != 0:
                        train_number_speeches += 1
                        for bigram in indv_speech_bigram:
                            augment(bigram_doc_freq, bigram)
                            augment(train_total_freq_bigram, bigram)
                        for unigram in indv_speech_unigram:
                            augment(unigram_doc_freq, unigram)
                            augment(train_total_freq_unigram, unigram)
                        train_speeches_bigram[identity] = indv_speech_bigram
                        train_speeches_unigram[identity] = indv_speech_unigram
                    else:
                        test_number_speeches += 1
                        for bigram in indv_speech_bigram:
                            augment(test_total_freq_bigram, bigram)
                        for unigram in indv_speech_unigram:
                            augment(test_total_freq_unigram, unigram)
                        test_speeches_bigram[identity] = indv_speech_bigram
                        test_speeches_unigram[identity] = indv_speech_unigram

                    speech_num += 1

    # Write all relevant data objects and values to disk to use when running classification
    with open("speechid_to_speaker_store.pickle", 'wb') as handle:
        pickle.dump(speechid_to_speaker, handle, protocol=0)
    speechid_to_speaker = None
    with open("speakers_to_analyze_store.pickle", 'wb') as handle:
        pickle.dump(speakers_to_analyze, handle, protocol=0)
    speakers_to_analyze = None
    raw_speeches = None

    with open("train_speeches_bigram.pickle", 'wb') as handle:
        pickle.dump(train_speeches_bigram, handle, protocol=0)
    with open("train_speeches_unigram.pickle", 'wb') as handle:
        pickle.dump(train_speeches_unigram, handle, protocol=0)
    with open("train_total_freq_bigram.pickle", 'wb') as handle:
        pickle.dump(train_total_freq_bigram, handle, protocol=0)
    with open("train_total_freq_unigram.pickle", 'wb') as handle:
        pickle.dump(train_total_freq_unigram, handle, protocol=0)

    with open("bigram_doc_freq.pickle", 'wb') as handle:
        pickle.dump(bigram_doc_freq, handle, protocol=0)
    with open("unigram_doc_freq.pickle", 'wb') as handle:
        pickle.dump(unigram_doc_freq, handle, protocol=0)
    with open("train_number_speeches.pickle", 'wb') as handle:
        pickle.dump(train_number_speeches, handle, protocol=0)

    with open("test_speeches_bigram.pickle", 'wb') as handle:
        pickle.dump(test_speeches_bigram, handle, protocol=0)
    with open("test_speeches_unigram.pickle", 'wb') as handle:
        pickle.dump(test_speeches_unigram, handle, protocol=0)
    with open("test_total_freq_bigram.pickle", 'wb') as handle:
        pickle.dump(test_total_freq_bigram, handle, protocol=0)
    with open("test_total_freq_unigram.pickle", 'wb') as handle:
        pickle.dump(test_total_freq_unigram, handle, protocol=0)
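
# augment is assumed to be the counting helper behind the many
# "if key in dict: += 1 / else: = 1" patterns written out inline elsewhere:
def augment_sketch(counts, key):
    """Increment counts[key], initializing it to 1 on first sight."""
    if key in counts:
        counts[key] += 1
    else:
        counts[key] = 1
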
def calculate_chronology(raw_speeches, speechid_to_speaker, speaker_list,
                         speakers_to_analyze, Girondins, Montagnards):
    speaker_ngrams = {}
    speakers_to_consider = []
    speaker_distances = collections.defaultdict()
    chronology = collections.defaultdict(dict)
    # chronology_date = pd.DataFrame(columns = ["Bigram", "Speaker Name", "Date", "Num occurrences"])
    # chronology_speechid = pd.DataFrame(columns = ["Bigram", "Speaker Name", "Speechid", "Num occurrences"])

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    row_entry_speechid = []
    row_entry_date = []
    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        speaker_name = speechid_to_speaker[identity]
        # print speaker_name
        if (date >= "1792-09-20") and (date <= "1793-06-02") and (
                speaker_name in speakers_to_consider):
            party = speakers_to_analyze.loc[speaker_name, "Party"]
            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            for bigram in indv_speech_bigram:
                row_entry_speechid.append([
                    str(bigram), speaker_name, identity,
                    indv_speech_bigram[bigram], party
                ])
                # chronology_speechid = chronology_speechid.append(pd.Series(row_entry_speechid), ignore_index = True)
                row_entry_date.append([
                    str(bigram), speaker_name, date,
                    indv_speech_bigram[bigram], party
                ])
                # chronology_date = chronology_date.append(pd.Series(row_entry_date), ignore_index = True)
                # if bigram in chronology:
                # 	chronology[bigram].append([speaker_name, identity, indv_speech_bigram[bigram]])
                # else:
                # 	chronology[bigram] = []
                # 	chronology[bigram].append([speaker_name, identity, indv_speech_bigram[bigram]])

    chronology_speechid = pd.DataFrame(row_entry_speechid,
                                       columns=[
                                           "Bigram", "Speaker Name",
                                           "Speechid", "Num occurrences",
                                           "Party"
                                       ])
    chronology_date = pd.DataFrame(
        row_entry_date,
        columns=["Bigram", "Speaker Name", "Date", "Num occurrences", "Party"])

    # Create ngram column, speaker name, date, number of occurrences
    # Create two dataframes, one with date and one with speechid
    # Include volume number
    # Do groupby and aggregation methods

    # w = csv.writer(open("chronology.csv", "w"))
    # for key, val in chronology.items():
    # 	if (Girondins[key] >= 10) or (Montagnards[key] >= 10):
    # 		w.writerow([key,val])
    make_visualizations(chronology_date)

    # write_to = pd.ExcelWriter("chronology_speechid.xlsx")
    # chronology_speechid.to_excel(write_to, 'Sheet1')
    # write_to.save()

    # filename = pd.ExcelWriter("chronology_date.xlsx")
    # chronology_date.to_excel(write_to, 'Sheet1')
    # filename.save()

    pickle_filename_2 = "chronology_speechid.pickle"
    with open(pickle_filename_2, 'wb') as handle:
        pickle.dump(chronology_speechid, handle, protocol=0)

    pickle_filename = "chronology_date.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(chronology_date, handle, protocol=0)
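
# make_visualizations is project-local. Given the chronology_date columns built
# above ("Bigram", "Date", "Num occurrences"), a plausible sketch plots the most
# frequent bigrams over time; the matplotlib usage and output filename are assumptions:
import matplotlib.pyplot as plt

def make_visualizations_sketch(chronology_date, top_n=5):
    totals = chronology_date.groupby("Bigram")["Num occurrences"].sum()
    for bigram in totals.nlargest(top_n).index:
        subset = chronology_date[chronology_date["Bigram"] == bigram]
        subset.groupby("Date")["Num occurrences"].sum().sort_index().plot(label=bigram)
    plt.legend()
    plt.savefig("chronology_top_bigrams.png")
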
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker):
	# Dataframe to keep track of the speakers we care about
	speakers_to_consider = []
	# Reformats speakers_to_analyze by removing accents in order to match speakers to those in raw_speeches
	# and speechid_to_speaker
	for speaker in speakers_to_analyze.index.values:
		speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

	# Initialize various data frames for export to the classification script
	train_total_freq_unigram = {}
	test_total_freq_unigram = {}
	train_total_freq_bigram = {}
	test_total_freq_bigram = {}
	train_number_speeches = 0
	test_number_speeches = 0
	
	# Keeps track of which speeches contain the given bigram
	train_speeches_bigram = collections.defaultdict(dict)
	test_speeches_bigram = collections.defaultdict(dict)
	train_speeches_unigram = collections.defaultdict(dict)
	test_speeches_unigram = collections.defaultdict(dict)

	bigrams_to_speeches = collections.defaultdict()
	bigram_doc_freq = collections.defaultdict()
	unigram_doc_freq = collections.defaultdict()

	gir_num_speeches = 0
	mont_num_speeches = 0
	gir_docs = {}
	mont_docs = {}

	for speaker_name in speakers_to_consider:
		print speaker_name
		party = speakers_to_analyze.loc[speaker_name, "Party"]
		speech = Counter()
		# Variable to keep track of a given speaker's number of speeches
		speech_num = 0
		for identity in raw_speeches:
			date = re.findall(date_regex, str(identity))[0]
			# Only look at speeches within the date frame and that are from the speaker of interest
			if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
				# Only looking at speeches with substance, i.e. at least 100 characters
				if len(raw_speeches[identity]) >= 100:
					indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
					indv_speech_unigram = compute_ngrams(raw_speeches[identity], 1)
					# Splitting the data into training and test data with 1/4 of each speaker's data in the test set
					if speech_num%4 != 0:
						train_number_speeches += 1
						for bigram in indv_speech_bigram:
							augment(bigram_doc_freq, bigram)
							augment(train_total_freq_bigram, bigram)
						for unigram in indv_speech_unigram:
							augment(unigram_doc_freq, unigram)
							augment(train_total_freq_unigram, unigram)
						train_speeches_bigram[identity] = indv_speech_bigram
						train_speeches_unigram[identity] = indv_speech_unigram
					else:
						test_number_speeches += 1
						for bigram in indv_speech_bigram:
							augment(test_total_freq_bigram, bigram)
						for unigram in indv_speech_unigram:
							augment(test_total_freq_unigram, unigram)
						test_speeches_bigram[identity] = indv_speech_bigram
						test_speeches_unigram[identity] = indv_speech_unigram

					speech_num += 1
		
	# Write all relevant data objects and values to disk to use when running classification
	store_to_pickle(speakers_to_analyze, "speakers_to_analyze.pickle")
	
	# Set these dataframes to None to conserve memory
	speakers_to_analyze = None
	speechid_to_speaker = None
	raw_speeches = None

	store_to_pickle(train_speeches_bigram, "train_speeches_bigram.pickle")
	store_to_pickle(train_speeches_unigram, "train_speeches_unigram.pickle")
	store_to_pickle(train_total_freq_bigram, "train_total_freq_bigram.pickle")
	store_to_pickle(train_total_freq_unigram, "train_total_freq_unigram.pickle")

	store_to_pickle(bigram_doc_freq, "bigram_doc_freq.pickle")
	store_to_pickle(unigram_doc_freq, "unigram_doc_freq.pickle")
	store_to_pickle(train_number_speeches, "train_number_speeches.pickle")

	store_to_pickle(test_speeches_bigram, "test_speeches_bigram.pickle")
	store_to_pickle(test_speeches_unigram, "test_speeches_unigram.pickle")
	store_to_pickle(test_total_freq_bigram, "test_total_freq_bigram.pickle")
	store_to_pickle(test_total_freq_unigram, "test_total_freq_unigram.pickle")
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker, Girondins, Montagnards):
	speaker_num_speeches = {}
	speaker_char_count = {}
	
	# Dataframe to keep track of the speakers we care about
	speakers_to_consider = []
	# Reformats speakers_to_analyze by removing accents in order to match speakers to those in raw_speeches
	# and speechid_to_speaker
	for speaker in speakers_to_analyze.index.values:
		speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

	# Matches bigrams to the list of speakers and speeches that have that bigram
	bigrams_to_speeches = {}
	bigrams_to_speakers = {}

	# Maintains the number of documents a given bigram is spoken in for use with tf-idf
	bigram_doc_freq = collections.defaultdict()

	gir_num_speeches = 0
	mont_num_speeches = 0
	gir_docs = {}
	mont_docs = {}

	for speaker_name in speakers_to_consider:
		print speaker_name
		party = speakers_to_analyze.loc[speaker_name, "Party"]
		speech = Counter()
		for identity in raw_speeches:
			date = re.findall(date_regex, str(identity))[0]
			if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name == speechid_to_speaker[identity]):
				# Keeps track of the number of speeches per speaker as well as the number of characters spoken by each speaker
				# To potentially establish a cutoff for analysis purposes
				augment(speaker_num_speeches, speaker_name)
				if speaker_name in speaker_char_count:
					speaker_char_count[speaker_name] += len(raw_speeches[identity])
				else:
					speaker_char_count[speaker_name] = len(raw_speeches[identity])

				indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)

				for bigram in indv_speech_bigram:
					augment(bigram_doc_freq, bigram)

					# Maintains a list of speeches in which given bigrams are spoken in
					if bigram in bigrams_to_speeches:
						bigrams_to_speeches[bigram].append(identity)
					else:
						bigrams_to_speeches[bigram] = []
						bigrams_to_speeches[bigram].append(identity)
					if bigram in bigrams_to_speakers:
						bigrams_to_speakers[bigram].add(speaker_name)
					else:
						bigrams_to_speakers[bigram] = set()
						bigrams_to_speakers[bigram].add(speaker_name)

				# Augments the relevant variables according to the party the speaker belongs to
				if party == "Girondins":
					gir_num_speeches += 1
					gir_docs = check_num_speakers(indv_speech_bigram, speaker_name, gir_docs)
					try:
						Girondins = Girondins + indv_speech_bigram
					except NameError:
						Girondins = indv_speech_bigram
				else:
					mont_num_speeches += 1
					mont_docs = check_num_speakers(indv_speech_bigram, speaker_name, mont_docs)
					try:
						Montagnards = Montagnards + indv_speech_bigram
					except NameError:
						Montagnards = indv_speech_bigram
			
				### Maintains a Counter of all the bigrams and their counts for a given speaker
				# speech = speech + indv_speech_bigram

	### Stores the bigram Counter object for each individual speaker
		# pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
		# with open(pickle_filename, 'wb') as handle:
		# 	pickle.dump(speech, handle, protocol = 0)

	# Store raw counts
	store_to_pickle(Girondins,"Girondins.pickle")
	store_to_pickle(Montagnards, "Montagnards.pickle")

	# Store in memory aggregate information about each bigram
	bigram_aggregate_info(Girondins, Montagnards, bigrams_to_speakers, bigrams_to_speeches)


	### If data has already been stored to disk, the lines below can be used
	# bigrams_to_speakers = pickle.load(open("bigrams_to_speakers.pickle", "rb"))
	# bigrams_to_speeches = pickle.load(open("bigrams_to_speeches.pickle", "rb"))

	# gir_docs = pickle.load(open("gir_docs.pickle", "rb"))
	# mont_docs = pickle.load(open("mont_docs.pickle", "rb"))

	# Girondins = pickle.load(open("Girondins_withlimit.pickle", "rb"))
	# Montagnards = pickle.load(open("Montagnards_withlimit.pickle", "rb"))

	# bigram_doc_freq = pickle.load(open("bigram_doc_freq.pickle", 'rb'))

	num_speeches = 4479

	# Computes counts and tfidf scores for each party and outputs for further analysis in R
	counts_and_tfidf(Girondins, Montagnards, gir_docs, mont_docs, num_speeches, bigram_doc_freq)



	""" EVERYTHING BELOW IS STORING DATA TO MEMORY """
	
	# Stores the bigrams_to_speeches document in Excel
	df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient = "index")
	write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')
	df_bigrams_to_speakers = pd.DataFrame.from_dict(bigrams_to_speakers, orient = "index")
	write_to_excel(df_bigrams_to_speakers, 'bigrams_to_speakers.xlsx')
	df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient = "index")
	write_to_excel(df_doc_freq, 'doc_freq.xlsx')
	
	# Stores files to disk
	store_to_pickle(bigrams_to_speakers, "bigrams_to_speakers.pickle")
	store_to_pickle(bigrams_to_speeches, "bigrams_to_speeches.pickle")
	store_to_pickle(gir_docs, "gir_docs.pickle")
	store_to_pickle(mont_docs, "mont_docs.pickle")
	store_to_pickle(speaker_num_speeches, "speaker_num_speeches.pickle")
	store_to_pickle(speaker_char_count, "speaker_char_count.pickle")
	store_to_pickle(bigram_doc_freq, "bigram_doc_freq.pickle")

	with open('gir_speeches.txt', 'w') as f:
		f.write('%d' % gir_num_speeches)
	with open('mont_speeches.txt', 'w') as f:
		f.write('%d' % mont_num_speeches)

	write_to_csv(speaker_num_speeches, "speaker_num_speeches.csv")
	write_to_csv(speaker_char_count, "speaker_char_count.csv")

	with open('num_speeches.txt', 'w') as f:
		f.write('%d' % num_speeches)
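
# check_num_speakers is assumed to record, per bigram, the set of speakers who
# used it; the commented filters like len(gir_docs[k]) > 1 rely on that shape.
# A sketch under that assumption:
def check_num_speakers_sketch(speech_ngrams, speaker, party_docs):
    for ngram in speech_ngrams:
        if ngram not in party_docs:
            party_docs[ngram] = set()
        party_docs[ngram].add(speaker)
    return party_docs
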
def aggregate(speakers_to_analyze_train, speakers_to_analyze_test,
              raw_speeches, speechid_to_speaker, Girondins, Montagnards):
    speaker_names = set()
    speakers_to_consider = []
    train_total_freq_unigram = {}
    test_total_freq_unigram = {}
    train_total_freq_bigram = {}
    test_total_freq_bigram = {}
    train_number_speeches = 0
    test_number_speeches = 0
    # Keeps track of which speeches contain the given bigram
    train_speeches_bigram = collections.defaultdict(dict)
    test_speeches_bigram = collections.defaultdict(dict)
    train_speeches_unigram = collections.defaultdict(dict)
    test_speeches_unigram = collections.defaultdict(dict)
    # Document-frequency dicts consulted by the train/test loops below
    bigram_doc_freq = {}
    unigram_doc_freq = {}
    ### Need to do all the following code for train and test
    for speaker in speakers_to_analyze_train.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))
    for speaker in speakers_to_analyze_test.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))
    for speaker_name in speakers_to_consider:
        print speaker_name
        if speaker_name in speakers_to_analyze_train.index.values:
            party = speakers_to_analyze_train.loc[speaker_name, "Party"]
        else:
            party = speakers_to_analyze_test.loc[speaker_name, "Party"]
        speech = Counter()
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (
                    speaker_name == speechid_to_speaker[identity]):
                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
                indv_speech_unigram = compute_ngrams(raw_speeches[identity], 1)
                if speaker_name in speakers_to_analyze_train.index.values:
                    train_number_speeches += 1
                    for bigram in indv_speech_bigram:
                        if bigram in bigram_doc_freq:
                            bigram_doc_freq[
                                bigram] = bigram_doc_freq[bigram] + 1
                        else:
                            bigram_doc_freq[bigram] = 1
                        if bigram in train_total_freq_bigram:
                            train_total_freq_bigram[bigram] += 1
                        else:
                            train_total_freq_bigram[bigram] = 1
                    for unigram in indv_speech_unigram:
                        if unigram in unigram_doc_freq:
                            unigram_doc_freq[
                                unigram] = unigram_doc_freq[unigram] + 1
                        else:
                            unigram_doc_freq[unigram] = 1
                        if unigram in train_total_freq_unigram:
                            train_total_freq_unigram[unigram] += 1
                        else:
                            train_total_freq_unigram[unigram] = 1
                    train_speeches_bigram[identity] = indv_speech_bigram
                    train_speeches_unigram[identity] = indv_speech_unigram
                else:
                    test_number_speeches += 1
                    for bigram in indv_speech_bigram:
                        if bigram in bigram_doc_freq:
                            bigram_doc_freq[
                                bigram] = bigram_doc_freq[bigram] + 1
                        else:
                            bigram_doc_freq[bigram] = 1
                        if bigram in test_total_freq_bigram:
                            test_total_freq_bigram[bigram] += 1
                        else:
                            test_total_freq_bigram[bigram] = 1
                    for unigram in indv_speech_unigram:
                        if unigram in unigram_doc_freq:
                            unigram_doc_freq[
                                unigram] = unigram_doc_freq[unigram] + 1
                        else:
                            unigram_doc_freq[unigram] = 1
                        if unigram in test_total_freq_unigram:
                            test_total_freq_unigram[unigram] += 1
                        else:
                            test_total_freq_unigram[unigram] = 1
                    test_speeches_bigram[identity] = indv_speech_bigram
                    test_speeches_unigram[identity] = indv_speech_unigram
                """if party == "Girondins":
					gir_num_speeches += 1
					gir_doc_freq = check_num_speakers(indv_speech_ngram, speaker_name, gir_doc_freq)
					try:
						Girondins = Girondins + indv_speech_ngram
					except NameError:
						Girondins = indv_speech_ngram
				else:
					mont_num_speeches += 1
					mont_doc_freq = check_num_speakers(indv_speech_ngram, speaker_name, mont_doc_freq)
					try:
						Montagnards = Montagnards + indv_speech_ngram
					except NameError:
						Montagnards = indv_speech_ngram"""
                #speech = speech + indv_speech_ngram
        #speaker_ngrams = compute_ngrams(speech)
        """pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
		with open(pickle_filename, 'wb') as handle:
			pickle.dump(speech, handle, protocol = 0)"""

    #NEED TO ADD CODE TO DO TRAINING AND TESTING SETS (import both excel files and do same computations)
    # Do unigrams as well
    classification = []
    training_set = []
    for speechid in train_speeches_bigram:
        speaker = speechid_to_speaker[speechid]
        if speakers_to_analyze_train.loc[speaker, "Party"] == "Girondins":
            classification.append(0)
        else:
            classification.append(1)
        # add some doc freq cutoff here
        bigram_input = {
            k: v
            for k, v in train_speeches_bigram[speechid].items()
            if (train_total_freq_bigram[k] >= 10)
        }
        unigram_input = {
            k: v
            for k, v in train_speeches_unigram[speechid].items()
            if (train_total_freq_unigram[k] >= 55)
        }

        bigram_scores = compute_tfidf(bigram_input, train_number_speeches,
                                      "bigram")
        unigram_scores = compute_tfidf(unigram_input, train_number_speeches,
                                       "unigram")

        merge_scores = bigram_scores.copy()
        merge_scores.update(unigram_scores)

        training_set.append(merge_scores)
        #training_set.append(bigram_scores)
        #training_set.append(unigram_scores)

        ### for if only doing unigrams
        #training_set.append(unigram_scores)
    """for speechid in train_speeches_unigram:
		speaker = speechid_to_speaker[speechid]
		if speakers_to_analyze_train.loc[speaker, "Party"] == "Girondins":
			classification.append(0)
		else:
			classification.append(1)
		# add some doc freq cutoff here
		unigram_input = {k:v for k,v in train_speeches_unigram[speechid].items() if (train_total_freq_unigram[k] >= 80)}
		scores = compute_tfidf(unigram_input, train_number_speeches, "unigram")
		training_set.append(scores)"""

    #party = pd.Series(classification)

    # loop through and count how many times each bigram appears, create new dataset that only has those bigrams
    # x is train.values and y is classification to pass into classifier, scikitlearn svm xgboost
    # key is to do feature engineering
    # 10 fold CV, start low re: features then work high and see if the score gets better
    train = pd.DataFrame(training_set)
    train = train.fillna(0)
    print train

    train, train_classification = data_clean(
        speechid_to_speaker, speakers_to_analyze_train, train_speeches_bigram,
        train_speeches_unigram, train_total_freq_bigram,
        train_total_freq_unigram, train_number_speeches)
    writer = pd.ExcelWriter("training_set.xlsx")
    train.to_excel(writer, 'Sheet1')
    writer.save()
    """test_classification = []
	test_set = []
	for speechid in test_speeches_bigram:
		speaker = speechid_to_speaker[speechid]
		if speakers_to_analyze_train.loc[speaker, "Party"] == "Girondins":
			test_classification.append(0)
		else:
			test_classification.append(1)
		# add some doc freq cutoff here
		bigram_input = {k:v for k,v in test_speeches_bigram[speechid].items() if (test_total_freq_bigram[k] >= 12)}
		unigram_input = {k:v for k,v in test_speeches_unigram[speechid].items() if (test_total_freq_unigram[k] >= 50)}
		
		bigram_scores = compute_tfidf(bigram_input, test_number_speeches, "bigram")
		unigram_scores = compute_tfidf(unigram_input, test_number_speeches, "unigram")
		
		merge_scores = bigram_scores.copy()
		merge_scores.update(unigram_scores)
		
		test_set.append(merge_scores)

	test = pd.DataFrame(test_set)
	test = test.fillna(0)"""

    logreg = LogisticRegression()
    logreg.fit(train.get_values(), classification)
    # sklearn.cross_validation is the pre-0.18 module name for what is now model_selection
    predicted = cross_validation.cross_val_predict(LogisticRegression(),
                                                   train.get_values(),
                                                   classification,
                                                   cv=10)
    print metrics.accuracy_score(classification, predicted)

    #print logreg.score(test.get_values(), test_classification)

    # columns should be bigrams
    #print train
    """with open('bigrams_to_speeches.csv', 'wb') as outfile:
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker,
              Girondins, Montagnards, Plein):
    speaker_names = set()
    speaker_num_speeches = {}
    speaker_char_count = {}
    speakers_to_consider = []

    bigrams_to_speeches = {}
    bigrams_to_speakers = {}
    bigram_doc_freq = collections.defaultdict()

    gir_num_speeches = 0
    mont_num_speeches = 0
    gir_docs = {}
    mont_docs = {}
    plein_docs = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for speaker_name in speakers_to_consider:
        print speaker_name
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (
                    speaker_name == speechid_to_speaker[identity]):
                # Keeps track of the number of speeches per speaker as well as the number of characters spoken by each speaker
                # To potentially establish a cutoff for analysis purposes
                augment(speaker_num_speeches, speaker_name)
                if speaker_name in speaker_char_count:
                    speaker_char_count[speaker_name] += len(
                        raw_speeches[identity])
                else:
                    speaker_char_count[speaker_name] = len(
                        raw_speeches[identity])

                indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)

                for bigram in indv_speech_bigram:
                    augment(bigram_doc_freq, bigram)

                    # Maintains a list of speeches in which given bigrams are spoken in
                    if bigram in bigrams_to_speeches:
                        bigrams_to_speeches[bigram].append(identity)
                    else:
                        bigrams_to_speeches[bigram] = []
                        bigrams_to_speeches[bigram].append(identity)
                    if bigram in bigrams_to_speakers:
                        bigrams_to_speakers[bigram].add(speaker_name)
                    else:
                        bigrams_to_speakers[bigram] = set()
                        bigrams_to_speakers[bigram].add(speaker_name)

                # Augments the relevant variables according to the party the speaker belongs to
                if party == "Girondins":
                    gir_num_speeches += 1
                    gir_docs = check_num_speakers(indv_speech_bigram,
                                                  speaker_name, gir_docs)
                    try:
                        Girondins = Girondins + indv_speech_bigram
                    except NameError:
                        Girondins = indv_speech_bigram
                else:
                    mont_num_speeches += 1
                    mont_docs = check_num_speakers(indv_speech_bigram,
                                                   speaker_name, mont_docs)
                    try:
                        Montagnards = Montagnards + indv_speech_bigram
                    except NameError:
                        Montagnards = indv_speech_bigram

                #speech = speech + indv_speech_bigram

    # 	# Stores the bigram Counter object for each individual speaker
    # 	"""pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
    # 	with open(pickle_filename, 'wb') as handle:
    # 		pickle.dump(speech, handle, protocol = 0)"""

    # Stores the bigrams_to_speeches document in Excel
    df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches,
                                                    orient="index")
    write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')
    pickle_filename = "bigrams_to_speakers.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(bigrams_to_speakers, handle, protocol=0)

    pickle_filename = "bigrams_to_speeches.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(bigrams_to_speeches, handle, protocol=0)

    pickle_filename = "bigrams_to_speeches.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(bigrams_to_speeches, handle, protocol=0)

    pickle_filename = "gir_docs.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(gir_docs, handle, protocol=0)

    pickle_filename = "mont_docs.pickle"
    with open(pickle_filename, 'wb') as handle:
        pickle.dump(mont_docs, handle, protocol=0)

    # bigrams_to_speakers = pickle.load(open("bigrams_to_speakers.pickle", "rb"))
    # bigrams_to_speeches = pickle.load(open("bigrams_to_speeches.pickle", "rb"))

    # gir_docs = pickle.load(open("gir_docs.pickle", "rb"))
    # mont_docs = pickle.load(open("mont_docs.pickle", "rb"))

    # Girondins = pickle.load(open("Girondins_withlimit.pickle", "rb"))
    # Montagnards = pickle.load(open("Montagnards_withlimit.pickle", "rb"))

    bigram_num_speakers = []
    bigram_num_speeches = []
    bigram_total_freq = []
    bg_speeches = {}
    bigrams = []
    speeches = []
    speakers = []
    for bigram in bigrams_to_speeches:
        if (Girondins[bigram] >= 10) or (Montagnards[bigram] >= 10):
            bigram_num_speakers.append(len(bigrams_to_speakers[bigram]))
            bigram_num_speeches.append(len(bigrams_to_speeches[bigram]))
            bigram_total_freq.append(Girondins[bigram] + Montagnards[bigram])
            bigrams.append(str(bigram))
            speeches.append(str(bigrams_to_speeches[bigram]))
            speakers.append(str(bigrams_to_speakers[bigram]))

    bg_num_speakers = pd.DataFrame(bigram_num_speakers,
                                   columns=['Num Speakers'])
    bg_num_speeches = pd.DataFrame(bigram_num_speeches,
                                   columns=['Num Speeches'])
    bg_total_freq = pd.DataFrame(bigram_total_freq, columns=['Total count'])
    bgs = pd.DataFrame(bigrams, columns=["Bigram"])
    speech = pd.DataFrame(speeches, columns=["Speechids"])
    speaker = pd.DataFrame(speakers, columns=["Speakers"])

    bigram_info = pd.DataFrame()
    bigram_info = pd.concat([
        bgs, speech, speaker, bg_num_speeches, bg_num_speakers, bg_total_freq
    ],
                            axis=1)
    writer = pd.ExcelWriter("bigram_info.xlsx")
    bigram_info.to_excel(writer, 'Sheet1')
    writer.save()

    w = csv.writer(open("bigrams_to_speeches_noplein.csv", "w"))
    for key, val in bigrams_to_speeches.items():
        w.writerow([key, val])

    bigrams_to_speakers_noplein_sorted = sorted(bigrams_to_speakers.items(),
                                                key=lambda x: len(x[1]),
                                                reverse=True)
    w = csv.writer(open("bigrams_to_speakers_noplein_sorted.csv", "w"))
    for item in bigrams_to_speakers_noplein_sorted:
        w.writerow([item[0], item[1]])

    # Computes the tf_idf scores for each bigram and for both the Girondins and Montagnards vectors
    num_speeches = 4479  # hardcoded total, as in the earlier aggregate(); used below
    # bigram_doc_freq = pickle.load(open("bigram_doc_freq_noplein_withlimit.pickle", 'rb'))

    with open('gir_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % gir_num_speeches)
    with open('mont_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % mont_num_speeches)
    print num_speeches

    with open('speaker_num_speeches_withlimit.pickle', 'wb') as handle:
        pickle.dump(speaker_num_speeches, handle, protocol=0)

    with open('speaker_char_count_withlimit.pickle', 'wb') as handle:
        pickle.dump(speaker_char_count, handle, protocol=0)

    w = csv.writer(open("speaker_num_speeches_withlimit.csv", "w"))
    for key, val in speaker_num_speeches.items():
        w.writerow([key, val])

    w = csv.writer(open("speaker_char_count_withlimit.csv", "w"))
    for key, val in speaker_char_count.items():
        w.writerow([key, val])

    # Write the number of speeches and doc_frequency to disk for use in further analysis
    with open('num_speeches_noplein_withlimit.txt', 'w') as f:
        f.write('%d' % num_speeches)
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')

    with open("bigram_doc_freq_noplein_withlimit.pickle", 'wb') as handle:
        pickle.dump(bigram_doc_freq, handle, protocol=0)

    # # Girondins = {k:v for k,v in Girondins.items() if (v >= 10)} #and (len(gir_docs[k]) > 1)}
    # # Montagnards = {k:v for k,v in Montagnards.items() if (v >= 10)} #and (len(mont_docs[k]) > 1)}

    # with open("Girondins_withlimit.pickle", 'wb') as handle:
    # 	pickle.dump(Girondins, handle, protocol = 0)
    # with open("Montagnards_withlimit.pickle", 'wb') as handle:
    # 	pickle.dump(Montagnards, handle, protocol = 0)
    # gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    # mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

    # """with open("gir_tfidf.pickle", 'wb') as handle:
    # 	pickle.dump(gir_tfidf, handle, protocol = 0)
    # with open("mont_tfidf.pickle", 'wb') as handle:
    # 	pickle.dump(mont_tfidf, handle, protocol = 0)"""

    # # Computes the distance between the tf_idf vectors
    # #compute_distance(gir_tfidf, mont_tfidf)

    # # Stores the tf_idf vectors
    # df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient = "index")
    # #df_gir_tfidf.columns = ['Bigrams', 'tfidf']
    # write_to_excel(df_gir_tfidf, 'gir_tfidf_withlimit.xlsx')
    # df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient = "index")
    # #df_mont_tfidf.columns = ['Bigrams', 'tfidf']
    # write_to_excel(df_mont_tfidf, 'mont_tfidf_withlimit.xlsx')

    # df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    # df_tfidf_combined = df_tfidf_combined.transpose()
    # df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    # write_to_excel(df_tfidf_combined, 'combined_tfidf_withlimit.xlsx')

    # Restricts the Girondins and Montagnards frequency analysis to bigrams occurring
    # at least 10 times, and optionally to those spoken by more than one speaker
    # print gir_docs
    Girondins = {k: v
                 for k, v in Girondins.items()
                 if (v >= 10)}  #and (len(gir_docs[k]) > 1)}
    df_girondins = pd.DataFrame.from_dict(Girondins, orient="index")
    write_to_excel(df_girondins, "Girondins_counts_withlimit.xlsx")

    Montagnards = {k: v
                   for k, v in Montagnards.items()
                   if (v >= 10)}  #and (len(mont_docs[k]) > 1)}
    df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index")
    write_to_excel(df_montagnards, "Montagnards_counts_withlimit.xlsx")

    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)

    # # Normalizes the vectors and computes the distance between them
    # #normalized = normalize_dicts(Girondins, Montagnards)
    # #compute_distance(normalized[0], normalized[1])

    # Stores the Girondins and Montagnards frequency vectors in the same document
    df_combined = pd.DataFrame([Girondins, Montagnards])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency_withlimit.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_withlimit.xlsx')
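
# compute_distance between the Girondins and Montagnards tf-idf vectors appears
# only in comments above; a standard cosine-distance sketch over the
# {bigram: score} dicts it would receive:
import math

def compute_distance_sketch(vec_a, vec_b):
    dot = sum(vec_a[k] * vec_b[k] for k in set(vec_a) & set(vec_b))
    norm_a = math.sqrt(sum(v * v for v in vec_a.values()))
    norm_b = math.sqrt(sum(v * v for v in vec_b.values()))
    return 1.0 - dot / (norm_a * norm_b)
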
def firststep():
	year_month = []
	full_date = []
	speaker = []
	ngrams = {}

	byyearmonth = pd.DataFrame()
	bydate = pd.DataFrame()
	byspeaker = pd.DataFrame()

	raw_speeches = pickle.load(open("raw_speeches.pickle", "rb"))
	dataframe = pd.DataFrame.from_dict(raw_speeches, orient = "index")
	dataframe.columns = ['Speeches']
	speechid_to_speaker = pickle.load(open("speechid_to_speaker.pickle", "rb"))
	file = open('num_speeches.txt', 'r')
	num_speeches = int(file.read())
	doc_freq = pickle.load(open("bigram_doc_freq.pickle", "rb"))

	for speechid in raw_speeches:
		speech_bigrams = compute_ngrams(raw_speeches[speechid], 2)
		ngrams[speechid] = speech_bigrams

		yearmonth = speechid[0:7]
		year_month.append(yearmonth)

		fulldate = speechid[0:10]
		full_date.append(fulldate)

		speaker.append(speechid_to_speaker[speechid])

	
	dataframe['Year-Month'] = pd.Series(year_month).values
	dataframe['Full Date'] = pd.Series(full_date).values
	dataframe['Speaker'] = pd.Series(speaker).values
	dataframe['Speechid'] = dataframe.index

	write_to_excel(dataframe, "raw_data.xlsx")
	"""with open("ngrams.pickle", "wb") as handle:
		pickle.dump(ngrams, handle, protocol = 0)"""

	"""byyearmonth['YearMonth'] = pd.Series(year_month).values
	byyearmonth['ngrams'] = pd.Series(ngrams).values

	byyearmonth_dict = pd.Series(byyearmonth.ngrams.values, index = byyearmonth.YearMonth).to_dict()

	with open("byyearmonth_dict.pickle", 'wb') as handle:
		pickle.dump(byyearmonth_dict, handle, protocol = 0)

	
	bydate['FullDate'] = pd.Series(full_date).values
	bydate['ngrams'] = pd.Series(ngrams).values

	bydate_dict = pd.Series(bydate.ngrams.values, index = bydate.FullDate).to_dict()

	with open("bydate_dict.pickle", 'wb') as handle:
		pickle.dump(bydate_dict, handle, protocol = 0)

	
	byspeaker['Speaker'] = pd.Series(speaker).values
	byspeaker['ngrams'] = pd.Series(ngrams).values

	byspeaker_dict = pd.Series(byspeaker.ngrams.values, index = byspeaker.Speaker).to_dict()

	with open("byspeaker_dict.pickle", 'wb') as handle:
		pickle.dump(byspeaker_dict, handle, protocol = 0)"""

	# compute ngrams for each speech
	# don't need tfidf because should just add the frequency vectors not the tfidf ones
	# extract year-month
	# extract year-month-date
	# make all of those individual columns and create a pandas dataframe
	# create a function for each grouping and do a pandas groupby

	"""byyearmonth = groupby_yearmonth(dataframe)
	write_to_excel(byyearmonth, "byyearmonth.xlsx")
	byyearmonth = None
	byspeaker = groupby_speaker(dataframe)
	write_to_excel(byspeaker, "byspeaker.xlsx")
	byspeaker = None
	bydate = groupby_date(dataframe)
	write_to_excel(bydate, "bydate.xlsx")
	bydate = None"""

	groupby_yearmonth(dataframe, ngrams)
	groupby_date(dataframe, ngrams)
	groupby_speaker(dataframe, ngrams)
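
# The groupby_* helpers called above are project-local. Since each speech's
# ngrams value is a Counter, the grouping plausibly sums Counters per key; a
# sketch for the year-month grouping (the output filename is an assumption):
def groupby_yearmonth_sketch(dataframe, ngrams):
    grouped = {}
    for speechid, row in dataframe.iterrows():
        key = row['Year-Month']
        if key in grouped:
            grouped[key] = grouped[key] + ngrams[speechid]
        else:
            grouped[key] = ngrams[speechid]
    with open("byyearmonth_grouped.pickle", 'wb') as handle:
        pickle.dump(grouped, handle, protocol=0)
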
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker,
              Girondins, Montagnards, Plein):
    speaker_names = set()
    speaker_num_speeches = {}
    speaker_char_count = {}
    speakers_to_consider = []

    bigrams_to_speeches = collections.defaultdict()
    bigram_doc_freq = collections.defaultdict()

    gir_num_speeches = 0
    mont_num_speeches = 0
    plein_num_speeches = 0
    gir_docs = {}
    mont_docs = {}
    plein_docs = {}

    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

    for identity in raw_speeches:
        date = re.findall(date_regex, str(identity))[0]
        if (date >= "1792-09-20") and (date <= "1793-06-02"):
            # Keeps track of the number of speeches per speaker as well as the number of characters spoken by each speaker
            # To potentially establish a cutoff for analysis purposes
            speaker_name = speechid_to_speaker[identity]
            party = ""
            if speaker_name in speakers_to_consider:
                party = speakers_to_analyze.loc[speaker_name, "Party"]
            else:
                party = "Plein"
            augment(speaker_num_speeches, speaker_name)
            if speaker_name in speaker_char_count:
                speaker_char_count[speaker_name] += len(raw_speeches[identity])
            else:
                speaker_char_count[speaker_name] = len(raw_speeches[identity])
            indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
            for bigram in indv_speech_bigram:
                augment(bigram_doc_freq, bigram)

                # Maintains a list of speeches in which given bigrams are spoken in
                if bigram in bigrams_to_speeches:
                    bigrams_to_speeches[bigram].append(identity)
                else:
                    bigrams_to_speeches[bigram] = []
                    bigrams_to_speeches[bigram].append(identity)

            # Augments the relevant variables according to the party the speaker belongs to
            if party == "Girondins":
                gir_num_speeches += 1
                gir_docs = check_num_speakers(indv_speech_bigram, speaker_name,
                                              gir_docs)
                try:
                    Girondins = Girondins + indv_speech_bigram
                except NameError:
                    Girondins = indv_speech_bigram
            elif party == "Montagnards":
                mont_num_speeches += 1
                mont_docs = check_num_speakers(indv_speech_bigram,
                                               speaker_name, mont_docs)
                try:
                    Montagnards = Montagnards + indv_speech_bigram
                except NameError:
                    Montagnards = indv_speech_bigram
            # Creates a Plein category that is neither Girondins nor Montagnards to better understand speakers
            # that are not distinctly one or the other
            else:
                plein_num_speeches += 1
                plein_docs = check_num_speakers(indv_speech_bigram,
                                                speaker_name, plein_docs)
                try:
                    Plein = Plein + indv_speech_bigram
                except NameError:
                    Plein = indv_speech_bigram

                #speech = speech + indv_speech_bigram

        # Stores the bigram Counter object for each individual speaker
        """pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
		with open(pickle_filename, 'wb') as handle:
			pickle.dump(speech, handle, protocol = 0)"""
    """# Stores the bigrams_to_speeches document in Excel
	df_bigrams_to_speeches = pd.DataFrame.from_dict(bigrams_to_speeches, orient = "index")
	write_to_excel(df_bigrams_to_speeches, 'bigrams_to_speeches.xlsx')"""

    # Computes the tf_idf scores for each bigram and for both the Girondins and Montagnards vectors
    num_speeches = gir_num_speeches + mont_num_speeches + plein_num_speeches
    print num_speeches

    with open('speaker_num_speeches_withplein.pickle', 'wb') as handle:
        pickle.dump(speaker_num_speeches, handle, protocol=0)

    with open('speaker_char_count_withplein.pickle', 'wb') as handle:
        pickle.dump(speaker_char_count, handle, protocol=0)

    w = csv.writer(open("speaker_num_speeches_withplein.csv", "w"))
    for key, val in speaker_num_speeches.items():
        w.writerow([key, val])

    w = csv.writer(open("speaker_char_count_withplein.csv", "w"))
    for key, val in speaker_char_count.items():
        w.writerow([key, val])

    # Writes the number of speeches and the doc frequency to disk for use in further analysis
    with open('num_speeches_withplein.txt', 'w') as f:
        f.write('%d' % num_speeches)
    df_doc_freq = pd.DataFrame.from_dict(bigram_doc_freq, orient="index")
    write_to_excel(df_doc_freq, 'doc_freq.xlsx')

    with open("bigram_doc_freq_withplein.pickle", 'wb') as handle:
        pickle.dump(bigram_doc_freq, handle, protocol=0)

    with open("Girondins_withplein.pickle", 'wb') as handle:
        pickle.dump(Girondins, handle, protocol=0)
    with open("Montagnards_withplein.pickle", 'wb') as handle:
        pickle.dump(Montagnards, handle, protocol=0)
    with open("Plein.pickle", 'wb') as handle:
        pickle.dump(Plein, handle, protocol=0)
    gir_tfidf = compute_tfidf(Girondins, num_speeches, bigram_doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, bigram_doc_freq)
    plein_tfidf = compute_tfidf(Plein, num_speeches, bigram_doc_freq)
    """with open("gir_tfidf.pickle", 'wb') as handle:
		pickle.dump(gir_tfidf, handle, protocol = 0)
	with open("mont_tfidf.pickle", 'wb') as handle:
		pickle.dump(mont_tfidf, handle, protocol = 0)"""

    # Computes the distance between the tf_idf vectors
    #compute_distance(gir_tfidf, mont_tfidf)

    # Stores the tf_idf vectors
    df_gir_tfidf = pd.DataFrame.from_dict(gir_tfidf, orient="index")
    #df_gir_tfidf.columns = ['Bigrams', 'tfidf']
    write_to_excel(df_gir_tfidf, 'gir_tfidf_withplein.xlsx')
    df_mont_tfidf = pd.DataFrame.from_dict(mont_tfidf, orient="index")
    #df_mont_tfidf.columns = ['Bigrams', 'tfidf']
    write_to_excel(df_mont_tfidf, 'mont_tfidf_withplein.xlsx')
    df_plein_tfidf = pd.DataFrame.from_dict(plein_tfidf, orient="index")
    #df_plein_tfidf.columns = ['Bigrams', 'tfidf']
    write_to_excel(df_plein_tfidf, 'plein_tfidf.xlsx')

    df_tfidf_combined = pd.DataFrame([gir_tfidf, mont_tfidf])
    df_tfidf_combined = df_tfidf_combined.transpose()
    df_tfidf_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_tfidf_combined, 'combined_tfidf_withplein.xlsx')

    # Restricts the Girondins and Montagnards bigram counts to those with a frequency
    # of at least 3, and optionally to bigrams spoken by more than one speaker
    Girondins = {k: v for k, v in Girondins.items() if v >= 3}  # and (len(gir_docs[k]) > 1)
    df_girondins = pd.DataFrame.from_dict(Girondins, orient="index")
    write_to_excel(df_girondins, "Girondins_counts_withplein.xlsx")

    Montagnards = {k: v for k, v in Montagnards.items() if v >= 3}  # and (len(mont_docs[k]) > 1)
    df_montagnards = pd.DataFrame.from_dict(Montagnards, orient="index")
    write_to_excel(df_montagnards, "Montagnards_counts_withplein.xlsx")

    # Normalizes the vectors and computes the distance between them
    #normalized = normalize_dicts(Girondins, Montagnards)
    #compute_distance(normalized[0], normalized[1])

    # Stores the Girondins and Montagnards frequency vectors in the same document
    df_combined = pd.DataFrame([Girondins, Montagnards])
    df_combined = df_combined.transpose()
    df_combined.columns = ["Girondins", "Montagnards"]
    write_to_excel(df_combined, 'combined_frequency.xlsx')
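
# The helpers used above (augment, check_num_speakers, compute_ngrams,
# compute_tfidf) are defined elsewhere in this repository. The sketches below
# are hypothetical reconstructions, inferred only from the call sites in this
# file, and carry a _sketch suffix so they do not shadow the real functions.
import math
from collections import Counter

def augment_sketch(counts, key):
    # Increments a per-key counter, initializing the key on first sight
    counts[key] = counts.get(key, 0) + 1

def check_num_speakers_sketch(speech_ngrams, speaker, docs):
    # Records, for every bigram, the set of speakers who used it; the counts
    # can later be filtered to bigrams spoken by more than one speaker
    for ngram in speech_ngrams:
        docs.setdefault(ngram, set()).add(speaker)
    return docs

def compute_ngrams_sketch(text, n):
    # Tokenizes a speech and returns a Counter of n-gram tuples
    tokens = text.lower().split()
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

def compute_tfidf_sketch(counts, num_speeches, doc_freq):
    # Standard tf-idf: term frequency within the aggregate count vector,
    # inverse document frequency from the per-bigram speech counts
    tfidf = {}
    total = float(sum(counts.values())) or 1.0
    for ngram, count in counts.items():
        idf = math.log(num_speeches / float(doc_freq.get(ngram, 1)))
        tfidf[ngram] = (count / total) * idf
    return tfidf
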
Example #17
def aggregate(speakers_to_analyze, raw_speeches, speechid_to_speaker,
              Girondins, Montagnards):
    speaker_names = set()
    speakers_to_consider = []
    gir_num_speeches = 0
    mont_num_speeches = 0
    bigrams_speeches = collections.defaultdict()
    doc_freq = {}
    gir_doc_freq = collections.defaultdict()
    mont_doc_freq = collections.defaultdict()
    for speaker in speakers_to_analyze.index.values:
        speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))
    for speaker_name in speakers_to_consider:
        print speaker_name
        party = speakers_to_analyze.loc[speaker_name, "Party"]
        speech = Counter()
        for identity in raw_speeches:
            date = re.findall(date_regex, str(identity))[0]
            if (date >= "1792-09-20") and (date <= "1793-06-02") and (
                    speaker_name == speechid_to_speaker[identity]):
                indv_speech_ngram = compute_ngrams(raw_speeches[identity], 2)
                for bigram in indv_speech_ngram:
                    bigrams_speeches.setdefault(bigram, []).append(identity)
                    doc_freq[bigram] = doc_freq.get(bigram, 0) + 1
                if party == "Girondins":
                    gir_num_speeches += 1
                    gir_doc_freq = check_num_speakers(indv_speech_ngram,
                                                      speaker_name,
                                                      gir_doc_freq)
                    try:
                        Girondins = Girondins + indv_speech_ngram
                    except NameError:
                        Girondins = indv_speech_ngram
                else:
                    mont_num_speeches += 1
                    mont_doc_freq = check_num_speakers(indv_speech_ngram,
                                                       speaker_name,
                                                       mont_doc_freq)
                    try:
                        Montagnards = Montagnards + indv_speech_ngram
                    except NameError:
                        Montagnards = indv_speech_ngram
                speech = speech + indv_speech_ngram
        #speaker_ngrams = compute_ngrams(speech, 2)
        pickle_filename = "../Speakers/" + speaker_name + "_ngrams.pickle"
        with open(pickle_filename, 'wb') as handle:
            pickle.dump(speech, handle, protocol=0)

    with open('bigrams_to_speeches.csv', 'wb') as outfile:
        writer = csv.writer(outfile)
        for key, val in bigrams_speeches.items():
            writer.writerow([key, val])

    # Writes the combined frequency document before the three-occurrence cutoff is
    # applied, for use when creating the frequency map
    print_to_excel(Girondins, Montagnards, 'combined_frequency_all.xlsx')

    Girondins = {k: v for k, v in Girondins.items() if v >= 3}  # and (len(gir_doc_freq[k]) > 1)
    print_to_csv(Girondins, "Girondins_counts.csv")

    Montagnards = {k: v for k, v in Montagnards.items() if v >= 3}  # and (len(mont_doc_freq[k]) > 1)
    print_to_csv(Montagnards, "Montagnards_counts.csv")

    print_to_excel(Girondins, Montagnards, 'combined_frequency.xlsx')

    num_speeches = gir_num_speeches + mont_num_speeches
    gir_tfidf = compute_tfidf(Girondins, num_speeches, doc_freq)
    mont_tfidf = compute_tfidf(Montagnards, num_speeches, doc_freq)

    #compute_distance(gir_tfidf, mont_tfidf)

    print_to_csv(gir_tfidf, 'gir_tfidf.csv')
    print_to_csv(mont_tfidf, 'mont_tfidf.csv')
    print_to_excel(gir_tfidf, mont_tfidf, 'combined_tfidf.xlsx')

    normalized = normalize_dicts(Girondins, Montagnards)
    compute_distance(normalized[0], normalized[1])
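
# Hypothetical sketches of the normalization and distance helpers called just
# above, again inferred from the call sites: normalize_dicts is assumed to
# scale both frequency dicts over a shared key set, and compute_distance to
# return the Euclidean distance between the two aligned vectors.
def normalize_dicts_sketch(dict_a, dict_b):
    keys = set(dict_a) | set(dict_b)
    total_a = float(sum(dict_a.values())) or 1.0
    total_b = float(sum(dict_b.values())) or 1.0
    norm_a = dict((k, dict_a.get(k, 0) / total_a) for k in keys)
    norm_b = dict((k, dict_b.get(k, 0) / total_b) for k in keys)
    return [norm_a, norm_b]

def compute_distance_sketch(vec_a, vec_b):
    import math
    # Both vectors share the same key set after the normalization above
    return math.sqrt(sum((vec_a[k] - vec_b.get(k, 0)) ** 2 for k in vec_a))
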
Example #18
def build_vectors(raw_speeches, speechid_to_speaker, speaker_list, speakers_to_analyze, gir_tfidf, mont_tfidf, num_speeches, doc_freq):
	speaker_ngrams = {}
	speakers_to_consider = []
	speaker_distances = collections.defaultdict()
	chronology = collections.defaultdict(dict)

	for speaker in speakers_to_analyze.index.values:
		speakers_to_consider.append(remove_diacritic(speaker).decode('utf-8'))

	for identity in raw_speeches:
		date = re.findall(date_regex, str(identity))[0]
		speaker_name = speechid_to_speaker[identity]
		if (date >= "1792-09-20") and (date <= "1793-06-02") and (speaker_name in speakers_to_consider):
			indv_speech_bigram = compute_ngrams(raw_speeches[identity], 2)
			if speaker_name in speaker_ngrams:
				speaker_ngrams[speaker_name] = speaker_ngrams[speaker_name] + indv_speech_bigram
			else:
				speaker_ngrams[speaker_name] = indv_speech_bigram
		"""
		if speaker_name in chronology:
			pairing = chronology[speaker_name]
			for bigram in indv_speech_bigram:
				if bigram in pairing:
					pairing[bigram].append([identity, indv_speech_bigram[bigram]])
				else:
					pairing[bigram] = [identity, indv_speech_bigram[bigram]]
		else:
			chronology[speaker_name] = {}
			pairing = chronology[speaker_name]
			for bigram in indv_speech_bigram:
				pairing[bigram] = []
				# stores the unique speechid alongside the number of times that bigram is said in that speech for each bigram
				pairing[bigram] = [identity, indv_speech_bigram[bigram]]"""

	
	## Need tf-idf vectors for gir and mont
	## Need the doc_freq from the previous calculations
	## Compute tf-idf for individual speakers
	## Compute cosine distance based on those vectors (dot product over the product of the vector lengths)
	## Compute cosine similarity with the difference between the two group vectors (subtract one from the other)
	## A - B: if positive, more like A; if negative, more like B

	## Create a tf vector for each speech and store it, so speech vectors can simply be added
	## Separately store a single idf vector

	#########
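
	# A minimal, hypothetical sketch of the similarity helpers used below (the
	# real cosine_similarity / compute_difference / convert_keys_to_string are
	# defined elsewhere in the repo), following the notes above: cosine
	# similarity is the dot product over the product of the vector norms, and
	# the difference vector is A - B, so a positive score leans towards A.
	def cosine_similarity_sketch(vec_a, vec_b):
		import math
		dot = sum(vec_a[k] * vec_b[k] for k in set(vec_a) & set(vec_b))
		norm_a = math.sqrt(sum(v * v for v in vec_a.values()))
		norm_b = math.sqrt(sum(v * v for v in vec_b.values()))
		return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

	def compute_difference_sketch(vec_a, vec_b):
		return dict((k, vec_a.get(k, 0) - vec_b.get(k, 0))
					for k in set(vec_a) | set(vec_b))

	def convert_keys_to_string_sketch(d):
		# Bigram keys are tuples; string keys make the vectors easy to compare
		return dict((str(k), v) for k, v in d.items())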

	gir_dict = convert_keys_to_string(gir_tfidf)
	mont_dict = convert_keys_to_string(mont_tfidf)
	doc_freq_dict = convert_keys_to_string(doc_freq)
	gir_mont_diff = compute_difference(gir_dict, mont_dict)
	#gir_dict = gir_tfidf
	#print gir_dict
	#mont_dict = mont_tfidf
	for speaker in speaker_ngrams:
		speaker_dict = convert_keys_to_string(speaker_ngrams[speaker])
		to_compare = compute_tfidf(speaker_dict, num_speeches, doc_freq_dict)
		gir_dist = cosine_similarity(gir_dict, to_compare)
		mont_dist = cosine_similarity(mont_dict, to_compare)
		# Note: cosine_similarity returns a similarity score, not an actual distance
		gir_mont_diff_dist = cosine_similarity(gir_mont_diff, to_compare)
		speaker_distances[speaker] = [gir_dist, mont_dist, gir_mont_diff_dist]

	"""
	#speaker_dict = {(str(k),v) for k,v in speaker_ngrams['Francois Chabot']}
	speaker_dict = convert_keys_to_string(speaker_ngrams['Francois Chabot'])
	to_compare = compute_tfidf(speaker_dict, num_speeches, doc_freq)
	gir_dist = cosine_similarity(gir_dict, to_compare)
	df = pd.DataFrame([to_compare, gir_dict])
	df = df.transpose()
	write_to_excel(df, "Francois Chabot Test.xlsx")"""

	
	"""for speaker in speaker_ngrams:
		#to_compare = {k:v for k,v in speaker_ngrams[speaker].items() if (v >= 3)}
		to_compare = speaker_ngrams[speaker]
		gir_dict = gir_tfidf
		mont_dict = mont_tfidf
		gir_normalized = normalize_dicts(to_compare, gir_dict)
		gir_dist = 	compute_distance(gir_normalized[0], gir_normalized[1])
		to_compare = speaker_ngrams[speaker]
		mont_normalized = normalize_dicts(to_compare, mont_dict)
		mont_dist = compute_distance(mont_normalized[0], mont_normalized[1])
		speaker_distances[speaker] = [gir_dist, mont_dist]"""

	

	
	pickle_filename_3 = "speaker_ngrams.pickle"
	with open(pickle_filename_3, 'wb') as handle:
		pickle.dump(speaker_ngrams, handle, protocol = 0)

	df = pd.DataFrame.from_dict(speaker_distances)
	df = df.transpose()
	df.columns = ["dist to Girondins", "dist to Montagnards", "dist to difference"]
	filename = "freq_dist_map.xlsx"
	writer = pd.ExcelWriter(filename)
	df.to_excel(writer, 'Sheet1')
	writer.save()

	pickle_filename = "freq_dist.pickle"
	with open(pickle_filename, 'wb') as handle:
		pickle.dump(speaker_distances, handle, protocol = 0)

	"""df2 = pd.DataFrame.from_dict(chronology)