def male_vs_female_authors_analysis_dunning(corpus_name, display_results=False):
    '''
    Dunning comparison of the vocabulary of female authors against male authors.

    Cached results are read from the pickle
    dunning_male_vs_female_authors_<corpus_name> when it can be loaded.
    Otherwise the corpus is built, split by author gender, and the word
    counters of both halves are passed to dunning_total (female counter
    first), together with the pickle name.

    With display_results=True the results are handed to
    dunning_result_displayer five times: once unfiltered, then restricted to
    verbs, adjectives, pronouns and adverbs (20 terms each).

    :param corpus_name: str, name passed to Corpus() and used in the pickle name
    :param display_results: bool, whether to display the most distinctive terms
    :return: the loaded or freshly computed Dunning results
    '''
    cache_name = f'dunning_male_vs_female_authors_{corpus_name}'

    try:
        results = load_pickle(cache_name)
    except IOError:
        # No stored results could be loaded: compute them from the corpus.
        full_corpus = Corpus(corpus_name)
        male_authored = full_corpus.filter_by_gender('male')
        female_authored = full_corpus.filter_by_gender('female')
        male_counts = male_authored.get_wordcount_counter()
        female_counts = female_authored.get_wordcount_counter()
        results = dunning_total(female_counts, male_counts,
                                filename_to_pickle=cache_name)

    if not display_results:
        return results

    # None means "no part-of-speech filter".
    for pos_group in (None, 'verbs', 'adjectives', 'pronouns', 'adverbs'):
        dunning_result_displayer(results,
                                 number_of_terms_to_display=20,
                                 corpus1_display_name='Fem Author',
                                 corpus2_display_name='Male Author',
                                 part_of_speech_to_include=pos_group)
    return results
def _word_association_counter(corpus, word, use_word_window, word_window):
    """Sum, over every novel in corpus, the counts of words associated with word."""
    counter = Counter()
    for novel in corpus.novels:
        if use_word_window:
            # NOTE(review): the original line was
            #   get_word_windows(self, search_terms, window_size=word_window)
            # which references the undefined names `self` and `search_terms`
            # and throws the result away. This assumes get_word_windows is a
            # method of the novel objects that takes the search term(s) and a
            # window size and returns counts usable by Counter.update --
            # confirm against the novel class.
            counter.update(novel.get_word_windows(word, window_size=word_window))
        else:
            counter.update(novel.words_associated(word))
    return counter


def compare_word_association_between_corpus_analysis_dunning(word,
                                                             corpus1=None,
                                                             corpus1_name=None,
                                                             corpus2=None,
                                                             corpus2_name=None,
                                                             use_word_window=False,
                                                             word_window=None):
    """
    Uses Dunning analysis to compare the words associated with word between two corpora.

    For each of the two corpora: if a corpus and a corpus name are passed in,
    the analysis uses the corpus but names the pickle file (and the display
    column) after the name. If only a name is passed in, the corpus is created
    with Corpus(name). If neither is passed in, the Gutenberg corpus is used.

    Results are loaded from a pickle when one can be loaded; otherwise they are
    computed with dunning_total, which is handed the pickle name. The results
    are always displayed (unfiltered and per part of speech) before returning.

    :param word: search term passed on to the novels' association methods
    :param corpus1: Corpus
    :param corpus1_name: str
    :param corpus2: Corpus
    :param corpus2_name: str
    :param use_word_window: bool, count words in a window around word instead
        of using words_associated
    :param word_window: window size forwarded as window_size when
        use_word_window is set; it is also part of the pickle name
    :return: the loaded or freshly computed Dunning results
    """
    if corpus1:
        if not corpus1_name:
            corpus1_name = corpus1.corpus_name
    else:
        if not corpus1_name:
            corpus1_name = "gutenberg"
        corpus1 = Corpus(corpus1_name)

    if corpus2:
        if not corpus2_name:
            corpus2_name = corpus2.corpus_name
    else:
        if not corpus2_name:
            corpus2_name = "gutenberg"
        corpus2 = Corpus(corpus2_name)

    # The file name is left exactly as before so existing pickles stay usable.
    pickle_filename = (f'dunning_{word}_associated_words_{corpus1_name}_vs_{corpus2_name}_in_'
                       f'{corpus1.corpus_name}')
    if use_word_window:
        pickle_filename += f'_word_window_{word_window}'

    try:
        results = load_pickle(pickle_filename)
    except IOError:
        print("Precalculated result not available. Running analysis now...")
        corpus1_counter = _word_association_counter(corpus1, word,
                                                    use_word_window, word_window)
        corpus2_counter = _word_association_counter(corpus2, word,
                                                    use_word_window, word_window)
        results = dunning_total(corpus1_counter, corpus2_counter,
                                filename_to_pickle=pickle_filename)

    # None means "no part-of-speech filter".
    for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
        dunning_result_displayer(results,
                                 number_of_terms_to_display=20,
                                 corpus1_display_name=f'{corpus1_name}. {word}',
                                 corpus2_display_name=f'{corpus2_name}. {word}',
                                 part_of_speech_to_include=group)

    return results
def male_VS_female_analysis_dunning(corpus_name):
    '''
    Tests word distinctiveness of shared words between the male-author and
    female-author parts of a corpus using Dunning analysis.

    Results are loaded from the pickle dunning_male_vs_female_authors_<corpus_name>
    when it can be loaded; otherwise they are computed with dunning_total
    (male counter first), which is handed the pickle name.
    The most distinctive terms are always displayed, overall and grouped by
    verbs, adjectives, pronouns and adverbs (50 terms each).

    NOTE(review): male_vs_female_authors_analysis_dunning uses the same pickle
    name but passes the counters to dunning_total in the opposite order. If
    both functions share a pickle directory, one may load results computed by
    the other -- confirm whether that is intended.

    :param corpus_name: str, name passed to Corpus() and used in the pickle name
    :return: the loaded or freshly computed Dunning results
    '''
    pickle_filename = f'dunning_male_vs_female_authors_{corpus_name}'
    try:
        results = load_pickle(pickle_filename)
    except IOError:
        # No stored results could be loaded: compute them from the corpus.
        c = Corpus(corpus_name)
        m_corpus = c.filter_by_gender('male')
        f_corpus = c.filter_by_gender('female')
        wordcounter_male = m_corpus.get_wordcount_counter()
        wordcounter_female = f_corpus.get_wordcount_counter()
        results = dunning_total(wordcounter_male, wordcounter_female,
                                filename_to_pickle=pickle_filename)

    # None means "no part-of-speech filter".
    for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
        dunning_result_displayer(results, number_of_terms_to_display=50,
                                 part_of_speech_to_include=group)

    # The docstring always promised a return value; previously the function
    # fell off the end and returned None.
    return results
def male_characters_author_gender_differences(corpus_name):
    """
    Contrasts how female and male authors write male characters.

    Builds the male-author and female-author subsets of the named corpus and
    runs the between-corpus Dunning comparison on the words associated with
    'he' (female authors as corpus 1, male authors as corpus 2).

    :param corpus_name: str, name passed to Corpus()
    :return: None
    """
    men = Corpus(corpus_name).filter_by_gender('male')
    women = Corpus(corpus_name).filter_by_gender('female')

    comparison_args = dict(
        word='he',
        corpus1=women,
        corpus1_name='female aut',
        corpus2=men,
        corpus2_name='male aut',
    )
    compare_word_association_between_corpus_analysis_dunning(**comparison_args)
def money_author_gender_differences(corpus_name):
    """
    Contrasts how female and male authors write about money.

    Builds the male-author and female-author subsets of the named corpus and
    runs the between-corpus Dunning comparison on a list of money-related
    search terms (female authors as corpus 1, male authors as corpus 2).

    :param corpus_name: str, name passed to Corpus()
    :return: None
    """
    money_terms = ['money', 'dollars', 'pounds', 'euros', 'dollar', 'pound',
                   'euro', 'wealth', 'income']

    men = Corpus(corpus_name).filter_by_gender('male')
    women = Corpus(corpus_name).filter_by_gender('female')

    comparison_args = dict(
        word=money_terms,
        corpus1=women,
        corpus1_name='female aut',
        corpus2=men,
        corpus2_name='male aut',
    )
    compare_word_association_between_corpus_analysis_dunning(**comparison_args)
def america_author_gender_differences(corpus_name):
    """
    Contrasts how female and male authors refer to America.

    Builds the male-author and female-author subsets of the named corpus and
    runs the between-corpus Dunning comparison on the words associated with
    'America' (female authors as corpus 1, male authors as corpus 2).

    :param corpus_name: str, name passed to Corpus()
    :return: None
    """
    men = Corpus(corpus_name).filter_by_gender('male')
    women = Corpus(corpus_name).filter_by_gender('female')

    comparison_args = dict(
        word='America',
        corpus1=women,
        corpus1_name='female aut',
        corpus2=men,
        corpus2_name='male aut',
    )
    compare_word_association_between_corpus_analysis_dunning(**comparison_args)
def male_vs_female_authors_analysis_dunning_lesser():
    '''
    Dunning comparison of male-author and female-author word counts on the
    'test_corpus' corpus, without any pickle caching.

    Prints the first ten and the (reversed) last ten entries of the results,
    labelled as the women's and men's top 10 respectively.

    :return: the results produced by dunning_total
    '''
    test_corpus = Corpus('test_corpus')
    male_authored = test_corpus.filter_by_gender('male')
    female_authored = test_corpus.filter_by_gender('female')

    male_counts = male_authored.get_wordcount_counter()
    female_counts = female_authored.get_wordcount_counter()
    results = dunning_total(male_counts, female_counts)

    print("women's top 10: ", results[:10])
    # Reverse the tail so that the last entry is printed first.
    bottom_ten = results[-10:]
    print("men's top 10: ", list(reversed(bottom_ten)))
    return results
def he_vs_she_associations_analysis_dunning(corpus_name):
    """
    Uses Dunning analysis to compare words associated with 'she' vs words
    associated with 'he' in the named corpus.

    Results are loaded from the pickle dunning_he_vs_she_associated_words_<corpus_name>
    when it can be loaded. Only when that fails is the corpus built and the
    analysis computed with dunning_total ('she' counter first), which is
    handed the pickle name. The results are always displayed, unfiltered and
    per part of speech (20 terms each).

    :param corpus_name: str, name passed to Corpus() and used in the pickle name
    :return: the loaded or freshly computed Dunning results
    """
    pickle_filename = f'dunning_he_vs_she_associated_words_{corpus_name}'
    try:
        results = load_pickle(pickle_filename)
    except IOError:
        # Build the corpus only on a cache miss; it is not needed when the
        # stored results can be loaded.
        corpus = Corpus(corpus_name)
        he_counter = Counter()
        she_counter = Counter()
        for novel in corpus.novels:
            he_counter.update(novel.words_associated("he"))
            she_counter.update(novel.words_associated("she"))
        results = dunning_total(she_counter, he_counter,
                                filename_to_pickle=pickle_filename)

    # None means "no part-of-speech filter".
    for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
        dunning_result_displayer(results,
                                 number_of_terms_to_display=20,
                                 corpus1_display_name='she...',
                                 corpus2_display_name='he..',
                                 part_of_speech_to_include=group)

    # Returned for callers that want the data; previously the function
    # returned None.
    return results
def test_analysis():
    """
    This function contains all analysis code to be run (previously in main function)

    - First generates a Stanford NLP parser
    - Iterates over the sample novels corpus and parses each novel with parse_novel
    - Prints each resulting row and writes it to dependency_analysis_results.csv

    Novels for which an OSError is raised are skipped.

    :return: None
    """
    parser = get_parser("assets/stanford-parser.jar",
                        "assets/stanford-parser-3.9.1-models.jar")
    novels = Corpus('sample_novels').novels

    # Open (and truncate) the output once. Reopening it with mode='w' for every
    # novel, as before, overwrote the file each time so that only the last row
    # survived. newline='' is what the csv module asks for on files it writes.
    with open('dependency_analysis_results.csv', mode='w', newline='') as results_file:
        writer = csv.writer(results_file, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for novel in novels:
            # The try still covers parsing, printing and writing, as in the
            # original best-effort loop.
            try:
                row = parse_novel(novel, parser)
                print(row)
                writer.writerow(row)
            except OSError:
                continue
def male_VS_female_analysis_dunning(corpus_name, display_data = False):
    '''
    Tests distinctiveness of the words associated with 'he' between
    male-authored and female-authored novels using Dunning analysis.

    Results are loaded from the pickle dunning_male_vs_female_chars_<corpus_name>
    when it can be loaded; otherwise they are computed with dunning_total
    (male-author counter first), which is handed the pickle name.
    With display_data=True the most distinctive terms are displayed, overall
    and grouped by verbs, adjectives, pronouns and adverbs (20 terms each).

    :param corpus_name: str, name passed to Corpus() and used in the pickle name
    :param display_data: bool, whether to display the results
    :return: the loaded or freshly computed Dunning results
    '''
    pickle_filename = f'dunning_male_vs_female_chars_{corpus_name}'
    try:
        results = load_pickle(pickle_filename)
    except IOError:
        # No stored results could be loaded: compute them from the corpus.
        c = Corpus(corpus_name)
        m_corpus = c.filter_by_gender('male')
        f_corpus = c.filter_by_gender('female')

        from collections import Counter
        wordcounter_male = Counter()
        wordcounter_female = Counter()

        for novel in m_corpus:
            wordcounter_male += novel.words_associated('he')

        for novel in f_corpus:
            wordcounter_female += novel.words_associated('he')

        results = dunning_total(wordcounter_male, wordcounter_female,
                                filename_to_pickle=pickle_filename)

    if display_data:
        # None means "no part-of-speech filter".
        for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
            # The male-author counter is the first argument of dunning_total,
            # so corpus 1 is labelled as the male authors. The labels were
            # previously the other way round, which mislabelled both columns.
            dunning_result_displayer(results,
                                     number_of_terms_to_display=20,
                                     corpus1_display_name='Male Author',
                                     corpus2_display_name='Fem Author',
                                     part_of_speech_to_include=group)
    return results
def create_corpus_summary_visualizations(corpus_name):
    '''
    Produces the summary plots (author gender, publication year, publication
    country) for the named corpus.

    The plotting functions receive the corpus name with underscores replaced
    by spaces.

    :param corpus_name: str, name passed to Corpus()
    :return: None
    '''
    corpus = Corpus(corpus_name)

    years = []
    genders = []
    countries = []
    for novel in corpus.novels:
        years.append(novel.date)
        genders.append(novel.author_gender)
        countries.append(novel.country_publication)

    display_name = corpus_name.replace('_', ' ')
    plt_gender_breakdown(genders, display_name)
    plt_pubyears(years, display_name)
    plt_pubcountries(countries, display_name)
def test_analysis():
    """
    Parses the first novel of the sample novels corpus and reports how long it took.

    Prints the value returned by parse_novel, then the elapsed wall-clock time
    in seconds (the time of the first print is included in the measurement).

    :return: None
    """
    parser = get_parser("assets/stanford-parser.jar",
                        "assets/stanford-parser-3.9.1-models.jar")
    first_novel = Corpus('sample_novels').novels[0]

    started = time.time()
    parsed = parse_novel(first_novel, parser)
    print(parsed)
    finished = time.time()

    print(finished - started)
def run_analysis(corpus_name):
    """
    Runs the instance distance analyses on a particular corpus and saves the
    results as pickle files.

    Steps: load the corpus, run run_distance_analysis on its novels, store the
    raw results, read the raw results pickle back, and store mean and median
    summaries grouped by location, author gender and date. Finally p-values and
    the top-twenty distances are stored.

    Comment out sections of code or analyses that have already been run or are
    unnecessary.

    NOTE(review): get_p_vals and get_highest_distances are called with the
    literal "gutenberg" rather than corpus_name, and their pickles carry no
    corpus suffix. Left unchanged -- confirm whether that is intended.

    :param corpus_name: str, name passed to Corpus() and appended to pickle names
    :return: None
    """
    print('loading corpus')
    corpus = Corpus(corpus_name)
    novels = corpus.novels

    print('running analysis')
    results = run_distance_analysis(novels)

    print('storing results')
    store_raw_results(results, corpus_name)

    raw = common.load_pickle("instance_distance_raw_analysis_" + corpus_name)

    groupings = (
        ("location", results_by_location),
        ("author_gender", results_by_author_gender),
        ("date", results_by_date),
    )
    # Compute every summary first, then store them, as before. Each summary
    # uses the statistic its file name states. The by-date summary stored as
    # "mean_..." was previously computed with "median".
    summaries = []
    for stat in ("mean", "median"):
        for label, group_results in groupings:
            pickle_name = stat + "_instance_distances_by_" + label + "_" + corpus_name
            summaries.append((pickle_name, group_results(raw, stat)))
    for pickle_name, summary in summaries:
        common.store_pickle(summary, pickle_name)

    pvals = get_p_vals("gutenberg")
    common.store_pickle(pvals, "instance_distance_comparison_pvals")

    male_top_twenty, female_top_twenty, diff_top_twenty = get_highest_distances(
        "gutenberg", 20)
    top_twenties = {
        'male_pronoun_top_twenty': male_top_twenty,
        'female_pronoun_top_twenty': female_top_twenty,
        "difference_top_twenty": diff_top_twenty
    }
    common.store_pickle(top_twenties, "instance_distance_top_twenties")
def test_analysis():
    """
    This function contains all analysis code to be run (previously in main function)

    Parses every novel of the sample novels corpus with parse_novel, prints
    each resulting row and writes it to dependency_analysis_results.csv.

    :return: None
    """
    parser = get_parser("assets/stanford-parser.jar",
                        "assets/stanford-parser-3.9.1-models.jar")
    novels = Corpus('sample_novels').novels

    # Open (and truncate) the output once. Reopening it with mode='w' for every
    # novel, as before, overwrote the file each time so that only the last row
    # survived. newline='' is what the csv module asks for on files it writes.
    with open('dependency_analysis_results.csv', mode='w', newline='') as results_file:
        writer = csv.writer(results_file, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for novel in novels:
            row = parse_novel(novel, parser)
            print(row)
            writer.writerow(row)
def compare_word_association_in_corpus_analysis_dunning(word1, word2, corpus=None,
                                                        corpus_name=None):
    """
    Uses Dunning analysis to compare words associated with word1 vs words
    associated with word2 in a corpus.

    If a corpus and corpus_name are passed in, then the analysis will use the
    corpus but name the file after corpus_name. If no corpus is passed in but
    a corpus_name is, then the corpus is created by Corpus(corpus_name). If
    neither a corpus nor a corpus_name is passed in, the analysis is done on
    the Gutenberg corpus.

    Stored results are looked up first under the word1_vs_word2 pickle name and
    then under the word2_vs_word1 name. Results loaded from the second file
    are used as they are. If neither can be loaded, the analysis is computed
    with dunning_total (word1 counter first), which is handed the
    word1_vs_word2 pickle name. The results are always displayed, unfiltered
    and per part of speech (50 terms each).

    :param word1: str
    :param word2: str
    :param corpus: Corpus
    :param corpus_name: str
    :return: the loaded or freshly computed Dunning results
    """
    if corpus:
        if not corpus_name:
            corpus_name = corpus.corpus_name
    else:
        if not corpus_name:
            corpus_name = "gutenberg"
        corpus = Corpus(corpus_name)

    pickle_filename = f'dunning_{word1}_vs_{word2}_associated_words_{corpus_name}'
    swapped_filename = f'dunning_{word2}_vs_{word1}_associated_words_{corpus_name}'

    for candidate in (pickle_filename, swapped_filename):
        try:
            results = load_pickle(candidate)
        except IOError:
            # Only a failed load triggers the fallback. The previous bare
            # `except:` on the second attempt also hid unrelated errors.
            continue
        break
    else:
        word1_counter = Counter()
        word2_counter = Counter()
        for novel in corpus.novels:
            word1_counter.update(novel.words_associated(word1))
            word2_counter.update(novel.words_associated(word2))
        # Store under the word1_vs_word2 name so the file name matches the
        # argument order. Previously the swapped name leaked out of the
        # fallback attempt and was used here.
        results = dunning_total(word1_counter, word2_counter,
                                filename_to_pickle=pickle_filename)

    # None means "no part-of-speech filter".
    for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
        dunning_result_displayer(results, number_of_terms_to_display=50,
                                 part_of_speech_to_include=group)

    return results
def run_analysis(corpus_name):
    """
    Runs the pronoun/adjective analysis on a corpus and stores the results.

    The raw results of run_adj_analysis are stored with store_raw_results.
    The pickle "pronoun_adj_raw_analysis" + corpus_name is then loaded, merged
    and reduced to the overlapping adjectives. That final result is pickled
    and pretty-printed.

    :param corpus_name: str, name passed to Corpus() and appended to pickle names
    :return: None
    """
    print("loading corpus", corpus_name)
    novels = Corpus(corpus_name).novels

    print("running analysis")
    raw_results = run_adj_analysis(novels)

    print("storing results")
    store_raw_results(raw_results, corpus_name)

    stored_raw = common.load_pickle("pronoun_adj_raw_analysis"+corpus_name)
    merged = merge_raw_results(stored_raw)
    final = get_overlapping_adjectives_raw_results(merged)
    common.store_pickle(final, "pronoun_adj_final_results"+corpus_name)

    # Comment out pprint for large databases where it's not practical to
    # print out results.
    pprint(final)
def stat_analysis(corpus_name='sample_novels'):
    """
    Prints two male-author vs female-author comparisons for a corpus.

    First, the per-book pronoun frequencies are grouped by author gender and
    the 'male_author' and 'female_author' groups are passed to
    get_p_and_ttest_value. Second, the two entries returned by
    subject_vs_object_pronoun_freqs (index 0 treated as the male data, index 1
    as the female data) are converted to lists and compared the same way.

    :param corpus_name: str, name passed to Corpus()
    :return: None
    """
    corpus = Corpus(corpus_name)

    pronoun_freqs = books_pronoun_freq(corpus)
    freqs_by_author_gender = freq_by_author_gender(pronoun_freqs)
    pronoun_stats = get_p_and_ttest_value(freqs_by_author_gender['male_author'],
                                          freqs_by_author_gender["female_author"])
    print("values for gender pronoun stats: ", pronoun_stats)

    subj_obj_freqs = subject_vs_object_pronoun_freqs(corpus)
    male_values = dict_to_list(subj_obj_freqs[0])
    female_values = dict_to_list(subj_obj_freqs[1])
    subj_obj_stats = get_p_and_ttest_value(male_values, female_values)
    print("values for subject vs object pronouns between male and female authors: ",
          subj_obj_stats)
ax.set_ylabel('Median Values') ax.set_title('Distance between Word Instances by Book and Author') ax.set_xticks(index + bar_width / 2) plt.xticks(fontsize=8, rotation=90) ax.set_xticklabels(book) ax.legend() fig.tight_layout() #plt.show() filepng = "visualizations/" + title + ".png" filepdf = "visualizations/" + title + ".pdf" plt.savefig(filepng, bbox_inches='tight') plt.savefig(filepdf, bbox_inches='tight') if __name__ == '__main__': corpus = Corpus('sample_novels') novels = corpus._load_novels() num = 0 #while num <10: medians_he = [] medians_she = [] books = [] for novel in novels[num * 10:num * 10 + 9]: result_he = instance_dist(novel, "he") result_she = instance_dist(novel, "she") try: medians_he.append(median(result_he)) except:
def run_all_analyses():
    '''
    Runs and prints the pronoun frequency analyses on the Gutenberg corpus:

    - female/male pronoun frequency by author gender, publication date and
      publication location
    - subject/object pronoun frequency, overall and by the same three groupings
    - comparative frequency of female subject pronouns, overall and by the
      same three groupings

    :return: None
    '''
    pronoun_freqs = books_pronoun_freq(Corpus('gutenberg'))
    by_gender = freq_by_author_gender(pronoun_freqs)
    by_date = freq_by_date(pronoun_freqs)
    by_location = freq_by_location(pronoun_freqs)

    print('Male/Female pronoun comparison: ')
    print('By author gender: ')
    print(get_mean(by_gender))
    print('\n By date: ')
    print(get_mean(by_date))
    print('\n By location: ')
    print(get_mean(by_location))

    # Index 0 holds the male data and index 1 the female data.
    subj_obj = subject_vs_object_pronoun_freqs(Corpus('gutenberg'))
    female_freqs = subj_obj[1]
    male_freqs = subj_obj[0]

    # All groupings are computed before anything is printed.
    female_sections = (
        ('By author gender: ', freq_by_author_gender(female_freqs)),
        ('By date: ', freq_by_date(female_freqs)),
        ('By location: ', freq_by_location(female_freqs)),
    )
    male_sections = (
        ('By author gender: ', freq_by_author_gender(male_freqs)),
        ('By date:', freq_by_date(male_freqs)),
        ('By location: ', freq_by_location(male_freqs)),
    )
    male_values = dict_to_list(male_freqs)
    female_values = dict_to_list(female_freqs)

    print('Subject/Object comparisons: ')
    print('Male vs Female in the subject: ')
    print('Male: ')
    pprint.pprint(np.mean(male_values))
    print('Female: ')
    pprint.pprint(np.mean(female_values))

    print('\n Female pronouns: ')
    for heading, grouped in female_sections:
        print(heading)
        pprint.pprint(get_mean(grouped))

    print('\n Male pronouns: ')
    for heading, grouped in male_sections:
        print(heading)
        pprint.pprint(get_mean(grouped))

    female_subject_freqs = subject_pronouns_gender_comparison(Corpus('gutenberg'), 'female')
    female_subject_values = dict_to_list(female_subject_freqs)

    print('Overall comparative female freq:')
    pprint.pprint(np.mean(female_subject_values))

    # Here each grouping is computed right before it is printed.
    comparative_sections = (
        ('By author gender:', freq_by_author_gender),
        ('By date: ', freq_by_date),
        ('By location: ', freq_by_location),
    )
    for heading, group_by in comparative_sections:
        print(heading)
        pprint.pprint(get_mean(group_by(female_subject_freqs)))
def test_dunning_total(self):
    """
    Runs dunning_total on the male-author and female-author subsets of the
    sample novels corpus and prints the results from index 10 onwards.

    NOTE(review): the filtered corpora themselves are passed to dunning_total,
    not word counters -- confirm that dunning_total accepts them.
    """
    sample = Corpus('sample_novels')
    male_authored = sample.filter_by_gender('male')
    female_authored = sample.filter_by_gender('female')

    results = dunning_total(male_authored, female_authored)
    print(results[10:])
class Test(unittest.TestCase):
    """Ad-hoc check of dunning_total on the sample novels corpus."""

    def test_dunning_total(self):
        """
        Runs dunning_total on the male-author and female-author subsets of the
        sample novels corpus and prints the results from index 10 onwards.
        """
        sample = Corpus('sample_novels')
        male_authored = sample.filter_by_gender('male')
        female_authored = sample.filter_by_gender('female')

        results = dunning_total(male_authored, female_authored)
        print(results[10:])


if __name__ == '__main__':
    # unittest.main()
    # The string below is disabled exploratory code kept for reference; as a
    # bare string expression it has no effect when the script runs.
    '''
    print("loading corpus")
    corpus = Corpus('sample_novels')
    print("loading novel")
    novel = corpus._load_novels()[15]
    print(novel.author, novel.title, novel.word_count)
    print("running function")
    result = find_male_adj(novel)
    output = []
    for key in result.keys():
        output.append((result[key], key))
    print(sorted(output, reverse=True))
    '''

    # Run the instance-distance and gender-frequency analyses on the sample corpus.
    c = Corpus('sample_novels')
    run_dist_inst(c)
    run_gender_freq(c)
    print("hello")
... 'filename': None, 'text': summary} >>> scarlett = novel.Novel(novel_metadata) >>> find_female_adj(scarlett) {'beautiful': 3, 'sad': 1} :param:novel :return: dictionary of adjectives that appear around female pronouns and the number of occurences """ return find_gender_adj(novel, True) if __name__ == '__main__': test_function() print("loading corpus") corpus = Corpus('sample_novels') print("loading novel") novel = corpus._load_novels()[15] print(novel.author, novel.title, novel.word_count) print("running function") result = find_male_adj(novel) output = [] for key in result.keys(): output.append((result[key], key)) print(sorted(output, reverse=True)) def process_medians(helst, shelst, authlst): """ >>> medians_he = [12, 130, 0, 12, 314, 18, 15, 12, 123] >>> medians_she = [123, 52, 12, 345, 0, 13, 214, 12, 23]
if __name__ == "__main__": ''' Finds the minimum p-value to deem the relationship between metadata variables and analysis results significant Independent variables (metadata) include: author gender year of publication country of publication Dependent variables: distance between 'he' and 'she' the frequency of gendered pronouns used as subjects or objects ''' corp = Corpus('test_corpus') # corp = Corpus('gutenberg') # corp = Corpus('sample_novels') subject_female_pronoun_dict = gender_pronoun_freq_analysis.subject_pronouns_gender_comparison( corp, 'female') # create lists for novels, publication date, etc. with all entries in the same # corresponding order novel_list = [] novel_year_list = [] novel_author_gender_list = [] subject_female_pronoun_list = [] for novel in corp: novel_list.append(novel) novel_year_list.append(novel.date) novel_author_gender_list.append(novel.author_gender)