def subject_vs_object_pronoun_freqs(corp):
    '''
    Compute, per novel, the share of gendered pronouns that are subject pronouns.

    For every novel the frequency of the subject pronoun ('he' / 'she') is weighed against
    the frequency of the matching object pronoun ('him' / 'her') via
    get_comparative_word_freq, and the resulting 'subject' value is recorded.

    Unless ``corp.load_test_corpus`` is set, previously pickled results are returned when
    both pickles can be loaded, and freshly computed results are pickled before returning.

    Side effect: once a novel has been counted its ``text`` is replaced by an empty string
    and its ``_word_counts_counter`` is reset to None.

    :param corp: Corpus
    :return: tuple of two dictionaries (male, female), each mapping Novel -> proportion

    >>> subject_vs_object_pronoun_freqs(Corpus('test_corpus'))  # doctest: +NORMALIZE_WHITESPACE
    ({<Novel (aanrud_longfrock)>: 0.793233082706767,
    <Novel (abbott_flatlandromance)>: 0.6741573033707865,
    <Novel (abbott_indiscreetletter)>: 0.7906976744186047,
    <Novel (adams_fighting)>: 0.7184527584020292,
    <Novel (alcott_josboys)>: 0.6330049261083744,
    <Novel (alcott_littlemen)>: 0.6451612903225807,
    <Novel (alcott_littlewomen)>: 0.6577563540753725,
    <Novel (alden_chautauqua)>: 0.7577030812324931,
    <Novel (austen_emma)>: 0.7086120401337792,
    <Novel (austen_persuasion)>: 0.6739130434782609},
    {<Novel (aanrud_longfrock)>: 0.5376532399299474,
    <Novel (abbott_flatlandromance)>: 0.17543859649122806,
    <Novel (abbott_indiscreetletter)>: 0.4424242424242424,
    <Novel (adams_fighting)>: 0.43485915492957744,
    <Novel (alcott_josboys)>: 0.3862487360970678,
    <Novel (alcott_littlemen)>: 0.4343501326259947,
    <Novel (alcott_littlewomen)>: 0.4124569980083288,
    <Novel (alden_chautauqua)>: 0.5461432506887053,
    <Novel (austen_emma)>: 0.4836730221345606,
    <Novel (austen_persuasion)>: 0.4872013651877133})
    '''
    male_pickle = f'{corp.corpus_name}_sub_v_ob_pronoun_freq_male'
    female_pickle = f'{corp.corpus_name}_sub_v_ob_pronoun_freq_female'
    # The test corpus is always recomputed and never written to disk.
    use_cache = not corp.load_test_corpus

    if use_cache:
        try:
            return (common.load_pickle(male_pickle), common.load_pickle(female_pickle))
        except IOError:
            # No (complete) cached result: fall through and compute it.
            pass

    relative_freq_male_subject = {}
    relative_freq_female_subject = {}

    for book in corp.novels:
        # NOTE(review): get_comparative_word_freq is defined elsewhere; presumably it
        # rescales the values so that they sum to 1 -- confirm.
        male_share = get_comparative_word_freq({
            'subject': book.get_word_freq('he'),
            'object': book.get_word_freq('him'),
        })
        female_share = get_comparative_word_freq({
            'subject': book.get_word_freq('she'),
            'object': book.get_word_freq('her'),
        })

        relative_freq_male_subject[book] = male_share['subject']
        relative_freq_female_subject[book] = female_share['subject']

        # Drop the novel's text and cached word counts now that it has been counted.
        book.text = ''
        book._word_counts_counter = None

    if use_cache:
        common.store_pickle(relative_freq_male_subject, male_pickle)
        common.store_pickle(relative_freq_female_subject, female_pickle)

    return (relative_freq_male_subject, relative_freq_female_subject)
def subject_pronouns_gender_comparison(corp, subject_gender):
    '''
    Compute, per novel, the share of subject pronouns ('he' + 'she') of one gender.

    Both the male and the female shares are computed (and cached) on every run;
    ``subject_gender`` only selects which of the two dictionaries is returned.

    Unless ``corp.load_test_corpus`` is set, previously pickled results are returned when
    both pickles can be loaded, and freshly computed results are pickled before returning.

    A novel in which neither 'he' nor 'she' occurs gets a share of 0.0 for both genders
    instead of raising ZeroDivisionError.

    Side effect: once a novel has been counted its ``text`` is replaced by an empty string
    and its ``_word_counts_counter`` is reset to None.

    :param corp: Corpus
    :param subject_gender: string 'male' or string 'female'
    :return: dictionary mapping Novel -> proportion
    :raises ValueError: if subject_gender is neither 'male' nor 'female'

    >>> subject_pronouns_gender_comparison(Corpus('test_corpus'), 'male')  # doctest: +NORMALIZE_WHITESPACE
    {<Novel (aanrud_longfrock)>: 0.2557575757575758,
    <Novel (abbott_flatlandromance)>: 0.923076923076923,
    <Novel (abbott_indiscreetletter)>: 0.582857142857143,
    <Novel (adams_fighting)>: 0.8210144927536231,
    <Novel (alcott_josboys)>: 0.5736607142857142,
    <Novel (alcott_littlemen)>: 0.6812652068126521,
    <Novel (alcott_littlewomen)>: 0.39719502513892563,
    <Novel (alden_chautauqua)>: 0.2543488481429243,
    <Novel (austen_emma)>: 0.4343926191696566,
    <Novel (austen_persuasion)>: 0.45696623870660963}
    >>> subject_pronouns_gender_comparison(Corpus('test_corpus'), 'female')  # doctest: +NORMALIZE_WHITESPACE
    {<Novel (aanrud_longfrock)>: 0.7442424242424243,
    <Novel (abbott_flatlandromance)>: 0.07692307692307691,
    <Novel (abbott_indiscreetletter)>: 0.4171428571428572,
    <Novel (adams_fighting)>: 0.17898550724637682,
    <Novel (alcott_josboys)>: 0.4263392857142857,
    <Novel (alcott_littlemen)>: 0.31873479318734793,
    <Novel (alcott_littlewomen)>: 0.6028049748610743,
    <Novel (alden_chautauqua)>: 0.7456511518570758,
    <Novel (austen_emma)>: 0.5656073808303435,
    <Novel (austen_persuasion)>: 0.5430337612933904}
    '''
    # Validate once, up front, so no work is done for a bad argument.
    if subject_gender not in ('male', 'female'):
        raise ValueError('subject_gender must be \'male\' or \'female\'')

    male_pickle = f'{corp.corpus_name}_subject_pronoun_freq_male'
    female_pickle = f'{corp.corpus_name}_subject_pronoun_freq_female'
    # The test corpus is always recomputed and never written to disk.
    use_cache = not corp.load_test_corpus

    if use_cache:
        try:
            cached_male = common.load_pickle(male_pickle)
            cached_female = common.load_pickle(female_pickle)
        except IOError:
            # No (complete) cached result: fall through and compute it.
            pass
        else:
            return cached_male if subject_gender == 'male' else cached_female

    relative_freq_female_sub = {}
    relative_freq_male_sub = {}

    for book in corp.novels:
        he = book.get_word_freq('he')
        she = book.get_word_freq('she')
        total = he + she

        if total:
            relative_freq_female_sub[book] = she / total
            relative_freq_male_sub[book] = he / total
        else:
            # Neither pronoun occurs: no share can be attributed to either gender.
            relative_freq_female_sub[book] = 0.0
            relative_freq_male_sub[book] = 0.0

        # Drop the novel's text and cached word counts now that it has been counted.
        book.text = ''
        book._word_counts_counter = None

    if use_cache:
        common.store_pickle(relative_freq_female_sub, female_pickle)
        common.store_pickle(relative_freq_male_sub, male_pickle)

    if subject_gender == 'male':
        return relative_freq_male_sub
    return relative_freq_female_sub
def store_raw_results(results, corpus_name):
    """
    Pickle ``results`` as "pronoun_adj_raw_analysis_<corpus_name>".

    If a pickle with that name can already be loaded, the user is asked on stdin whether
    to overwrite it; only the answer 'y' replaces the stored results. If loading raises
    IOError (no pickle stored yet), the results are stored without asking.

    :param results: object to pickle
    :param corpus_name: str, appended to the pickle name
    :return: None
    """
    target = "pronoun_adj_raw_analysis_" + corpus_name
    try:
        # Loading is used purely as an existence check; the loaded value is discarded.
        common.load_pickle(target)
        answer = input("results already stored. overwrite previous analysis? (y/n)")
        if answer == 'y':
            common.store_pickle(results, target)
    except IOError:
        common.store_pickle(results, target)
def get_p_vals(corpus_name):
    """
    One-way ANOVA (scipy ``stats.f_oneway``) on stored median instance distances, testing
    for independence of:
        - UK vs. US vs. other country authors' median distance between female instances
        - male vs female authors' median distance between female instances
        - Date ranges authors' median distance between female instances

    The three "median_instance_distances_by_*_<corpus_name>" pickles must already exist.
    Each pickle is treated as a mapping from group to a sequence of entries, and element
    ``[1]`` of every entry is used as the observation for that group.
    NOTE(review): element [1] is presumably the female median -- confirm against the code
    that writes these pickles.

    :param corpus_name: str, suffix of the stored pickles
    :return: data-frame with 3 p-values, one for each category comparison
    :raises IOError: if any of the stored results cannot be loaded
    """
    categories = (
        ("location", "median_instance_distances_by_location_"),
        ("male_vs_female_authors", "median_instance_distances_by_author_gender_"),
        ("date", "median_instance_distances_by_date_"),
    )

    try:
        grouped_results = [common.load_pickle(prefix + corpus_name)
                           for _, prefix in categories]
    except IOError:
        # Without the stored results there is nothing to test. Re-raise rather than
        # carrying on with unbound data and failing later with an unrelated error.
        print("results not available")
        raise

    pvals = []
    for grouped in grouped_results:
        # One sample per group; f_oneway compares the samples with each other.
        samples = [[entry[1] for entry in entries] for entries in grouped.values()]
        _, pval = stats.f_oneway(*samples)
        pvals.append(pval)

    return pnds.DataFrame({
        "names": [name for name, _ in categories],
        "pvals": pvals,
    })
def books_pronoun_freq(corp):
    '''
    Relative frequency of female pronouns for every novel in a corpus.

    For each novel the frequencies of 'he', 'him', 'his' are summed into a male total and
    those of 'she', 'her', 'hers' into a female total; get_comparative_word_freq then
    turns the two totals into comparative frequencies. Male and female values are both
    computed and cached, but only the female dictionary is returned.

    Unless ``corp.load_test_corpus`` is set, previously pickled results are returned when
    both pickles can be loaded, and freshly computed results are pickled before returning.

    Side effect: once a novel has been counted its ``text`` is replaced by an empty string
    and its ``_word_counts_counter`` is reset to None.

    :param corp: Corpus object
    :return: dictionary mapping Novel -> relative frequency of female pronouns

    >>> books_pronoun_freq(Corpus('test_corpus'))  # doctest: +NORMALIZE_WHITESPACE
    {<Novel (aanrud_longfrock)>: 0.7623169107856191,
    <Novel (abbott_flatlandromance)>: 0.14321608040201003,
    <Novel (abbott_indiscreetletter)>: 0.4166666666666667,
    <Novel (adams_fighting)>: 0.1898395721925134,
    <Novel (alcott_josboys)>: 0.42152086422368146,
    <Novel (alcott_littlemen)>: 0.3111248200699157,
    <Novel (alcott_littlewomen)>: 0.6196978175713487,
    <Novel (alden_chautauqua)>: 0.7518623169791935,
    <Novel (austen_emma)>: 0.5662100456621004,
    <Novel (austen_persuasion)>: 0.5305111461382571}
    '''
    male_words = ('he', 'him', 'his')
    female_words = ('she', 'her', 'hers')

    try:
        if not corp.load_test_corpus:
            # The male pickle is loaded as well so that a missing male file also
            # triggers a recomputation; its contents are not used here.
            common.load_pickle(f'{corp.corpus_name}_pronoun_freq_male')
            cached_female = common.load_pickle(f'{corp.corpus_name}_pronoun_freq_female')
            return cached_female
    except IOError:
        pass

    male_by_book = {}
    female_by_book = {}

    for book in corp.novels:
        male_total = sum(book.get_word_freq(word) for word in male_words)
        female_total = sum(book.get_word_freq(word) for word in female_words)

        comparative = get_comparative_word_freq({'male': male_total, 'female': female_total})
        male_by_book[book] = comparative['male']
        female_by_book[book] = comparative['female']

        # Drop the novel's text and cached word counts now that it has been counted.
        book.text = ''
        book._word_counts_counter = None

    if not corp.load_test_corpus:
        common.store_pickle(male_by_book, f'{corp.corpus_name}_pronoun_freq_male')
        common.store_pickle(female_by_book, f'{corp.corpus_name}_pronoun_freq_female')

    return female_by_book
def male_VS_female_analysis_dunning(corpus_name):
    '''
    Tests word distinctiveness of shared words between the male and the female
    sub-corpora (as selected by ``Corpus.filter_by_gender``) using dunning.

    Prints out the most distinctive terms (top 50) overall as well as grouped by verbs,
    adjectives, pronouns and adverbs, and returns the dunning results.

    Results are loaded from the pickle "dunning_male_vs_female_authors_<corpus_name>" when
    it exists; otherwise they are computed, and the pickle name is handed to dunning_total
    via ``filename_to_pickle``.

    :param corpus_name: str, name passed to Corpus() and used in the pickle name
    :return: dict
    '''
    # By default, try to load precomputed results. Only calculate if no stored results are
    # available.
    pickle_filename = f'dunning_male_vs_female_authors_{corpus_name}'
    try:
        results = load_pickle(pickle_filename)
    except IOError:
        c = Corpus(corpus_name)
        m_corpus = c.filter_by_gender('male')
        f_corpus = c.filter_by_gender('female')
        wordcounter_male = m_corpus.get_wordcount_counter()
        wordcounter_female = f_corpus.get_wordcount_counter()
        # NOTE(review): any other analysis that writes this same pickle name with the
        # counters in the opposite order would make the cached orientation ambiguous.
        results = dunning_total(wordcounter_male, wordcounter_female,
                                filename_to_pickle=pickle_filename)

    for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
        dunning_result_displayer(results, number_of_terms_to_display=50,
                                 part_of_speech_to_include=group)

    # The docstring has always promised a dict; actually hand the results back.
    return results
def male_vs_female_authors_analysis_dunning(corpus_name, display_results=False):
    '''
    Dunning comparison of the word counts of female-author and male-author novels.

    Stored results ("dunning_male_vs_female_authors_<corpus_name>") are used when they
    can be loaded; otherwise the corpus is built, split with ``filter_by_gender`` and the
    two word counters are passed to dunning_total (female first, male second).

    With display_results=True the 20 most distinctive terms are printed overall and for
    verbs, adjectives, pronouns and adverbs.

    :param corpus_name: str, name passed to Corpus() and used in the pickle name
    :param display_results: bool, whether to print the most distinctive terms
    :return: dict, the dunning results
    '''
    pickle_filename = f'dunning_male_vs_female_authors_{corpus_name}'

    try:
        results = load_pickle(pickle_filename)
    except IOError:
        # Nothing cached yet: compute from the corpus.
        full_corpus = Corpus(corpus_name)
        male_authors = full_corpus.filter_by_gender('male')
        female_authors = full_corpus.filter_by_gender('female')
        male_counts = male_authors.get_wordcount_counter()
        female_counts = female_authors.get_wordcount_counter()
        results = dunning_total(female_counts, male_counts,
                                filename_to_pickle=pickle_filename)

    if not display_results:
        return results

    for part_of_speech in (None, 'verbs', 'adjectives', 'pronouns', 'adverbs'):
        dunning_result_displayer(results,
                                 number_of_terms_to_display=20,
                                 corpus1_display_name='Fem Author',
                                 corpus2_display_name='Male Author',
                                 part_of_speech_to_include=part_of_speech)
    return results
def get_highest_distances(corpus_name, num):
    """
    Returns 3 lists.
        - Novels with the largest median male instance distance
        - Novels with the largest median female instance distance
        - Novels ranked by the difference between median male & median female instance
          distances

    Each list contains ``num`` tuples of the form (median, novel), read from the stored
    "instance_distance_raw_analysis_<corpus_name>" pickle. The male and female lists are
    sorted in descending order. The difference list is sorted in ASCENDING order, i.e. it
    holds the smallest (most negative) differences.
    NOTE(review): whether ascending order yields the "largest" difference depends on the
    sign convention of the stored 'difference' values -- confirm.

    :param corpus_name: str, suffix of the stored raw-results pickle
    :param num: number of top distances to get
    :return: 3 lists of tuples.
    :raises IOError: if the raw results cannot be loaded
    """
    try:
        raw_results = common.load_pickle("instance_distance_raw_analysis_" + corpus_name)
    except IOError:
        # Nothing to rank without the raw results. Re-raise instead of continuing with an
        # unbound variable and failing with an unrelated error.
        print("No raw results available for this corpus")
        raise

    male_medians = []
    female_medians = []
    difference_medians = []

    for novel, stats_by_kind in raw_results.items():
        male_medians.append((stats_by_kind['male']['median'], novel))
        female_medians.append((stats_by_kind['female']['median'], novel))
        difference_medians.append((stats_by_kind['difference']['median'], novel))

    # Tuples sort by median first; equal medians fall back to comparing the novels.
    male_top = sorted(male_medians, reverse=True)[:num]
    female_top = sorted(female_medians, reverse=True)[:num]
    diff_top = sorted(difference_medians)[:num]

    return male_top, female_top, diff_top
def compare_word_association_between_corpus_analysis_dunning(word,
                                                             corpus1=None, corpus1_name=None,
                                                             corpus2=None, corpus2_name=None,
                                                             use_word_window=False,
                                                             word_window=None):
    """
    Uses Dunning analysis to compare the words associated with ``word`` between two
    corpora.

    For each of the two corpora: if a corpus and a corpus name are passed in, the analysis
    uses the corpus but names the file after the corpus name. If no corpus is passed in
    but a name is, the corpus is created with Corpus(name). If neither is passed in, the
    Gutenberg corpus is used.

    Results are loaded from a pickle when available; otherwise they are computed, and the
    pickle name is handed to dunning_total via ``filename_to_pickle``. The 20 most
    distinctive terms are printed overall and for verbs, adjectives, pronouns and adverbs.

    :param word: str, the word whose associated words are compared
    :param corpus1: Corpus (optional)
    :param corpus1_name: str (optional), name used for the result file / display
    :param corpus2: Corpus (optional)
    :param corpus2_name: str (optional), name used for the result file / display
    :param use_word_window: bool, count the words in a window around ``word`` instead of
        using ``novel.words_associated``
    :param word_window: window size passed on when use_word_window is True
    :return: dict
    """
    if corpus1:
        if not corpus1_name:
            corpus1_name = corpus1.corpus_name
    else:
        if not corpus1_name:
            corpus1_name = "gutenberg"
        corpus1 = Corpus(corpus1_name)

    if corpus2:
        if not corpus2_name:
            corpus2_name = corpus2.corpus_name
    else:
        if not corpus2_name:
            corpus2_name = "gutenberg"
        corpus2 = Corpus(corpus2_name)

    # The file name is kept exactly as before so that existing pickles are still found.
    pickle_filename = (f'dunning_{word}_associated_words_{corpus1_name}_vs_{corpus2_name}_in_'
                       f'{corpus1.corpus_name}')
    if use_word_window:
        pickle_filename += f'_word_window_{word_window}'

    def count_associations(corpus):
        """Accumulate the words associated with ``word`` over every novel of ``corpus``."""
        counter = Counter()
        for novel in corpus.novels:
            if use_word_window:
                # The previous code called a bare get_word_windows(self, search_terms, ...)
                # with undefined names and threw the result away.
                # NOTE(review): assumes Novel.get_word_windows(search_terms, window_size)
                # returns a Counter-like mapping -- confirm against the Novel class.
                counter.update(novel.get_word_windows(word, window_size=word_window))
            else:
                counter.update(novel.words_associated(word))
        return counter

    try:
        results = load_pickle(pickle_filename)
    except IOError:
        print("Precalculated result not available. Running analysis now...")
        corpus1_counter = count_associations(corpus1)
        corpus2_counter = count_associations(corpus2)
        results = dunning_total(corpus1_counter, corpus2_counter,
                                filename_to_pickle=pickle_filename)

    for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
        dunning_result_displayer(results, number_of_terms_to_display=20,
                                 corpus1_display_name=f'{corpus1_name}. {word}',
                                 corpus2_display_name=f'{corpus2_name}. {word}',
                                 part_of_speech_to_include=group)

    return results
def he_vs_she_associations_analysis_dunning(corpus_name):
    """
    Uses Dunning analysis to compare words associated with 'he' vs words associated with
    'she' in the corpus named ``corpus_name``.

    Results are loaded from the pickle "dunning_he_vs_she_associated_words_<corpus_name>"
    when it exists. Only on a cache miss is the corpus loaded and the analysis run ('she'
    counts first, 'he' counts second). The 20 most distinctive terms are printed overall
    and for verbs, adjectives, pronouns and adverbs.

    :param corpus_name: str, name passed to Corpus() and used in the pickle name
    :return: dict, the dunning results
    """
    pickle_filename = f'dunning_he_vs_she_associated_words_{corpus_name}'
    try:
        results = load_pickle(pickle_filename)
    except IOError:
        # Build the corpus only when it is really needed: with cached results available
        # there is no reason to load every novel.
        corpus = Corpus(corpus_name)
        he_counter = Counter()
        she_counter = Counter()
        for novel in corpus.novels:
            he_counter.update(novel.words_associated("he"))
            she_counter.update(novel.words_associated("she"))
        results = dunning_total(she_counter, he_counter, filename_to_pickle=pickle_filename)

    for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
        dunning_result_displayer(results, number_of_terms_to_display=20,
                                 corpus1_display_name='she...',
                                 corpus2_display_name='he..',
                                 part_of_speech_to_include=group)

    return results
def compare_word_association_in_corpus_analysis_dunning(word1, word2, corpus=None,
                                                        corpus_name=None):
    """
    Uses Dunning analysis to compare words associated with word1 vs words associated with
    word2 in one corpus.

    If a corpus and corpus_name are passed in, the analysis uses the corpus but names the
    file after corpus_name. If no corpus is passed in but a corpus_name is, the corpus is
    created with Corpus(corpus_name). If neither is passed in, the analysis is done on the
    Gutenberg corpus.

    Stored results are looked up first as word1-vs-word2 and then as word2-vs-word1. Only
    when neither pickle exists is the analysis run, and it is then stored under the
    word1-vs-word2 name. The 50 most distinctive terms are printed overall and for verbs,
    adjectives, pronouns and adverbs.

    NOTE(review): results loaded from the word2-vs-word1 pickle are returned as stored,
    i.e. presumably with the two words in the opposite orientation -- confirm callers
    expect that.

    :param word1: str
    :param word2: str
    :param corpus: Corpus
    :param corpus_name: str
    :return: dict
    """
    if not corpus_name:
        corpus_name = corpus.corpus_name if corpus else "gutenberg"

    forward_filename = f'dunning_{word1}_vs_{word2}_associated_words_{corpus_name}'
    reversed_filename = f'dunning_{word2}_vs_{word1}_associated_words_{corpus_name}'

    try:
        results = load_pickle(forward_filename)
    except IOError:
        try:
            results = load_pickle(reversed_filename)
        except IOError:
            # Cache miss for both orientations. Build the corpus only now, so that cached
            # runs never pay for loading it.
            if not corpus:
                corpus = Corpus(corpus_name)

            word1_counter = Counter()
            word2_counter = Counter()
            for novel in corpus.novels:
                word1_counter.update(novel.words_associated(word1))
                word2_counter.update(novel.words_associated(word2))

            # Store under the name matching the argument order actually used; storing it
            # under the reversed name would mislabel the orientation for a later
            # (word2, word1) call.
            results = dunning_total(word1_counter, word2_counter,
                                    filename_to_pickle=forward_filename)

    for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
        dunning_result_displayer(results, number_of_terms_to_display=50,
                                 part_of_speech_to_include=group)

    return results
def get_top_adj(corpus_name, num):
    """
    Rank the adjectives of the stored "pronoun_adj_final_results_<corpus_name>" pickle.

    Every adjective is scored twice: ``val[0] - val[1]`` for the first list and
    ``val[1] - val[0]`` for the second one. Both lists are sorted in descending order and
    cut to ``num`` entries.
    NOTE(review): val[0] / val[1] are presumably the male / female scores -- confirm
    against the code that writes the pickle.

    :param corpus_name: str, suffix of the stored final-results pickle
    :param num: number of adjectives to keep per list
    :return: (male_top, female_top), two lists of (score, adjective) tuples
    """
    data = common.load_pickle("pronoun_adj_final_results_"+corpus_name)

    male_ranked = sorted(
        ((val[0] - val[1], adj) for adj, val in data.items()),
        reverse=True,
    )
    female_ranked = sorted(
        ((val[1] - val[0], adj) for adj, val in data.items()),
        reverse=True,
    )

    return male_ranked[:num], female_ranked[:num]
def run_analysis(corpus_name):
    """
    Run instance distance analyses on a particular corpus and save the results as pickle
    files. Comment out sections of code or analyses that have already been run or are
    unnecessary.

    Steps:
        1. load the corpus and run run_distance_analysis on its novels
        2. store the raw results via store_raw_results
        3. reload "instance_distance_raw_analysis_<corpus_name>" and store the mean and
           median groupings by location, author gender and date
        4. compute and store the p-values and the top-twenty lists for this corpus

    NOTE(review): the p-value and top-twenty pickles are stored under names that do not
    contain the corpus name, so running this for several corpora overwrites them.

    :param corpus_name: str, name passed to Corpus() and used in the pickle names
    :return: None
    """
    print('loading corpus')
    corpus = Corpus(corpus_name)
    novels = corpus.novels

    print('running analysis')
    results = run_distance_analysis(novels)

    print('storing results')
    store_raw_results(results, corpus_name)

    r = common.load_pickle("instance_distance_raw_analysis_" + corpus_name)

    groupings = (
        ("location", results_by_location),
        ("author_gender", results_by_author_gender),
        ("date", results_by_date),
    )
    for statistic in ("mean", "median"):
        for label, group_results in groupings:
            # The statistic in the pickle name always matches the statistic computed; the
            # "mean ... by_date" pickle used to be filled with median results.
            common.store_pickle(group_results(r, statistic),
                                f"{statistic}_instance_distances_by_{label}_" + corpus_name)

    # Use the corpus that was just analysed rather than a hard-coded "gutenberg".
    pvals = get_p_vals(corpus_name)
    common.store_pickle(pvals, "instance_distance_comparison_pvals")

    male_top_twenty, female_top_twenty, diff_top_twenty = get_highest_distances(
        corpus_name, 20)
    top_twenties = {
        'male_pronoun_top_twenty': male_top_twenty,
        'female_pronoun_top_twenty': female_top_twenty,
        "difference_top_twenty": diff_top_twenty,
    }
    common.store_pickle(top_twenties, "instance_distance_top_twenties")
def run_analysis(corpus_name):
    """
    Run the pronoun/adjective analysis on a corpus and store raw and final results.

    Steps:
        1. load the corpus and run run_adj_analysis on its novels
        2. store the raw results via store_raw_results
        3. reload the raw results, merge them (merge_raw_results) and reduce them with
           get_overlapping_adjectives_raw_results
        4. store the final results as "pronoun_adj_final_results_<corpus_name>" and
           pretty-print them

    :param corpus_name: str, name passed to Corpus() and used in the pickle names
    :return: None
    """
    print("loading corpus", corpus_name)
    corpus = Corpus(corpus_name)
    novels = corpus.novels

    print("running analysis")
    results = run_adj_analysis(novels)

    print("storing results")
    store_raw_results(results, corpus_name)

    # The pickle names need the trailing underscore: the raw results are written as
    # "pronoun_adj_raw_analysis_<corpus_name>" and the final results are read back as
    # "pronoun_adj_final_results_<corpus_name>". Without it the load below cannot find
    # the file that was just stored.
    r = common.load_pickle("pronoun_adj_raw_analysis_" + corpus_name)
    m = merge_raw_results(r)
    final = get_overlapping_adjectives_raw_results(m)
    common.store_pickle(final, "pronoun_adj_final_results_" + corpus_name)

    # Comment out pprint for large databases where it's not practical to print out results
    pprint(final)
def male_VS_female_analysis_dunning(corpus_name, display_data = False):
    '''
    Tests distinctiveness of the words associated with 'he' between the male and the
    female sub-corpora (as selected by ``Corpus.filter_by_gender``) using dunning.

    Results are loaded from the pickle "dunning_male_vs_female_chars_<corpus_name>" when
    it exists; otherwise they are computed, and the pickle name is handed to dunning_total
    via ``filename_to_pickle``.

    With display_data=True, prints out the 20 most distinctive terms overall as well as
    grouped by verbs, adjectives, pronouns and adverbs.

    NOTE(review): both sub-corpora are counted with words_associated('he'); given the
    "male_vs_female_chars" pickle name the female side may have been meant to use 'she'.
    Left unchanged because existing pickles were produced this way -- confirm.

    :param corpus_name: str, name passed to Corpus() and used in the pickle name
    :param display_data: bool, whether to print the most distinctive terms
    :return: dict
    '''
    # By default, try to load precomputed results. Only calculate if no stored results are
    # available.
    pickle_filename = f'dunning_male_vs_female_chars_{corpus_name}'
    try:
        results = load_pickle(pickle_filename)
    except IOError:
        c = Corpus(corpus_name)
        m_corpus = c.filter_by_gender('male')
        f_corpus = c.filter_by_gender('female')

        from collections import Counter
        wordcounter_male = Counter()
        wordcounter_female = Counter()

        for novel in m_corpus:
            wordcounter_male += novel.words_associated('he')

        for novel in f_corpus:
            wordcounter_female += novel.words_associated('he')

        results = dunning_total(wordcounter_male, wordcounter_female,
                                filename_to_pickle=pickle_filename)

    if display_data:
        for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
            # dunning_total receives the male counter first, so corpus 1 is the male
            # side; the display labels used to be the wrong way round.
            dunning_result_displayer(results, number_of_terms_to_display=20,
                                     corpus1_display_name='Male Author',
                                     corpus2_display_name='Fem Author',
                                     part_of_speech_to_include=group)
    return results
def pickle(novel, parser):
    """
    Return the dependency triples of a novel's pronoun sentences, using a pickle as cache.

    The pickle "dep_tree_<str(novel)>" is returned when it can be loaded. Otherwise the
    lower-cased text (newlines replaced by spaces) is split into sentences, only sentences
    containing the token "he", "she", "him" or "her" are kept, the parser is run on them,
    and the first parse of every sentence is stored as a list of its triples. The result
    is pickled before it is returned.

    Dependency triples have the form ((head word, head tag), rel, (dep word, dep tag));
    link defining dependencies: https://nlp.stanford.edu/software/dependencies_manual.pdf

    :param novel: Novel we are interested in
    :param parser: Stanford parser object
    :return: list with one list of dependency triples per kept sentence

    >>> tree = load_pickle(f'dep_tree_aanrud_longfrock')
    >>> tree == None
    False
    """
    try:
        tree = load_pickle(f'dep_tree_{str(novel)}')
    except (IOError, FileNotFoundError):
        pronouns = ("he", "she", "him", "her")
        flattened_text = novel.text.lower().replace("\n", " ")

        # Only sentences that mention one of the pronouns are worth parsing.
        pronoun_sentences = [
            sentence
            for sentence in sent_tokenize(flattened_text)
            if any(token in pronouns for token in word_tokenize(sentence))
        ]

        # Materialise all parses first, then take the first parse of every sentence.
        parses = list(parser.raw_parse_sents(pronoun_sentences))
        tree = [list(next(sentence_parses).triples()) for sentence_parses in parses]

        store_pickle(tree, f'dep_tree_{str(novel)}')
    return tree
def run_analysis(corpus_name):
    """
    Group already stored raw pronoun/adjective results and pickle each grouping.

    Only the grouping stage runs here. The earlier stages of the pipeline (loading the
    corpus, run_adj_analysis, store_raw_results, merge_raw_results,
    get_overlapping_adjectives_raw_results and storing
    "pronoun_adj_final_results_<corpus_name>") are disabled in this function, so the
    pickle "pronoun_adj_raw_analysis_<corpus_name>" must already exist.

    The raw results are grouped by location, by author gender and by date and stored as
    "pronoun_adj_by_location", "pronoun_adj_by_author_gender" and "pronoun_adj_by_date".
    These names do not include the corpus name.

    :param corpus_name: str, suffix of the raw-results pickle to load
    :return: None
    """
    raw = common.load_pickle("pronoun_adj_raw_analysis_" + corpus_name)

    stages = (
        ("getting results by location", results_by_location,
         "storing 1", "pronoun_adj_by_location"),
        ("getting results by author gender", results_by_author_gender,
         "storing 2", "pronoun_adj_by_author_gender"),
        ("getting results by date", results_by_date,
         "storing 3", "pronoun_adj_by_date"),
    )

    for progress_message, group_results, store_message, pickle_name in stages:
        print(progress_message)
        grouped = group_results(raw)
        print(store_message)
        common.store_pickle(grouped, pickle_name)

    print("DONE")