Esempio n. 1
0
def male_vs_female_authors_analysis_dunning(corpus_name, display_results=False):
    """
    Run a Dunning comparison of word usage between female-authored and male-authored novels.

    A previously pickled result is used when load_pickle can find one; otherwise the corpus
    is loaded, split by author gender, and the two word counters are passed to dunning_total
    (female counter first, male counter second).

    :param corpus_name: str, name handed to Corpus() and embedded in the pickle filename
    :param display_results: bool, if True the top 20 terms are shown overall and for verbs,
        adjectives, pronouns and adverbs
    :return: whatever load_pickle / dunning_total produced (the original docstring calls it a
        dict mapping each term to its dunning data)
    """
    cache_name = f'dunning_male_vs_female_authors_{corpus_name}'

    try:
        dunning_data = load_pickle(cache_name)
    except IOError:
        # Cache miss: compute from scratch.
        full_corpus = Corpus(corpus_name)
        male_authored = full_corpus.filter_by_gender('male')
        female_authored = full_corpus.filter_by_gender('female')
        male_counts = male_authored.get_wordcount_counter()
        female_counts = female_authored.get_wordcount_counter()
        dunning_data = dunning_total(female_counts, male_counts,
                                     filename_to_pickle=cache_name)

    if not display_results:
        return dunning_data

    # None means "no part-of-speech filter", i.e. the overall ranking.
    for pos_group in (None, 'verbs', 'adjectives', 'pronouns', 'adverbs'):
        dunning_result_displayer(dunning_data,
                                 number_of_terms_to_display=20,
                                 corpus1_display_name='Fem Author',
                                 corpus2_display_name='Male Author',
                                 part_of_speech_to_include=pos_group)
    return dunning_data
Esempio n. 2
0
def compare_word_association_between_corpus_analysis_dunning(word, corpus1=None, corpus1_name=None,
                                                             corpus2=None, corpus2_name=None, use_word_window=False, word_window=None):
    """
    Uses Dunning analysis to compare the words associated with `word` between two corpora, then
    displays the top 20 terms overall and for verbs, adjectives, pronouns and adverbs.

    For each of the two corpus slots:
      - if a corpus and a name are both passed in, the corpus is analysed and the name is used
        for the pickle filename and the display labels;
      - if only a corpus is passed in, its `corpus_name` attribute is used as the name;
      - if only a name is passed in, the corpus is created with Corpus(name);
      - if neither is passed in, the analysis falls back to the 'gutenberg' corpus.

    A pickled result is used when load_pickle finds one (an IOError is treated as a cache miss).

    :param word: search term forwarded to the novels' association / word-window methods
    :param corpus1: Corpus or None
    :param corpus1_name: str or None
    :param corpus2: Corpus or None
    :param corpus2_name: str or None
    :param use_word_window: bool, if True count words via word windows instead of
        novel.words_associated
    :param word_window: window size forwarded as `window_size` when use_word_window is True;
        it is also embedded in the pickle filename
    :return: dict
    """

    if corpus1:
        if not corpus1_name:
            corpus1_name = corpus1.corpus_name
    else:
        if not corpus1_name:
            corpus1_name = "gutenberg"
        corpus1 = Corpus(corpus1_name)

    if corpus2:
        if not corpus2_name:
            corpus2_name = corpus2.corpus_name
    else:
        if not corpus2_name:
            corpus2_name = "gutenberg"
        corpus2 = Corpus(corpus2_name)

    # Filename kept byte-for-byte so that previously pickled results are still found.
    pickle_filename = (f'dunning_{word}_associated_words_{corpus1_name}_vs_{corpus2_name}_in_'
                       f'{corpus1.corpus_name}')
    if use_word_window:
        pickle_filename += f'_word_window_{word_window}'

    def _count_associated_words(corpus):
        """Accumulate the per-novel association counts for `word` across one corpus."""
        counter = Counter()
        for novel in corpus.novels:
            if use_word_window:
                # The original called a bare get_word_windows(self, search_terms, ...) with
                # undefined names and threw the result away, so this branch could only raise
                # NameError.
                # NOTE(review): assumes get_word_windows is a method of the novel objects that
                # returns a Counter-compatible mapping -- confirm against the Novel class.
                counter.update(novel.get_word_windows(word, window_size=word_window))
            else:
                counter.update(novel.words_associated(word))
        return counter

    try:
        results = load_pickle(pickle_filename)
    except IOError:
        print("Precalculated result not available. Running analysis now...")
        corpus1_counter = _count_associated_words(corpus1)
        corpus2_counter = _count_associated_words(corpus2)
        results = dunning_total(corpus1_counter, corpus2_counter,
                                filename_to_pickle=pickle_filename)

    # None means "no part-of-speech filter", i.e. the overall ranking.
    for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
        dunning_result_displayer(results, number_of_terms_to_display=20,
                                 corpus1_display_name=f'{corpus1_name}. {word}',
                                 corpus2_display_name=f'{corpus2_name}. {word}',
                                 part_of_speech_to_include=group)

    return results
Esempio n. 3
0
def male_VS_female_analysis_dunning(corpus_name):
    '''
    Tests word distinctiveness of shared words between male and female corpora using dunning.
    Prints out the most distinctive terms overall as well as grouped by verbs, adjectives,
    pronouns and adverbs (top 50 each).

    :param corpus_name: str, name handed to Corpus() and embedded in the pickle filename
    :return: dict -- the dunning results (loaded from the pickle or freshly computed)
    '''

    # By default, try to load precomputed results. Only calculate if no stored results are
    # available.
    pickle_filename = f'dunning_male_vs_female_authors_{corpus_name}'
    try:
        results = load_pickle(pickle_filename)
    except IOError:
        c = Corpus(corpus_name)
        m_corpus = c.filter_by_gender('male')
        f_corpus = c.filter_by_gender('female')
        wordcounter_male = m_corpus.get_wordcount_counter()
        wordcounter_female = f_corpus.get_wordcount_counter()
        results = dunning_total(wordcounter_male,
                                wordcounter_female,
                                filename_to_pickle=pickle_filename)

    # None means "no part-of-speech filter", i.e. the overall ranking.
    for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
        dunning_result_displayer(results,
                                 number_of_terms_to_display=50,
                                 part_of_speech_to_include=group)

    # The docstring always promised a dict, but the function used to fall off the end and
    # return None; callers that ignored the return value are unaffected.
    return results
Esempio n. 4
0
def male_characters_author_gender_differences(corpus_name):
    """
    Compares how male authors versus female authors write male characters by running the
    between-corpus Dunning analysis on the words associated with 'he'.

    :param corpus_name: str, name of the corpus to load and split by author gender
    :return: the results returned by compare_word_association_between_corpus_analysis_dunning
    """
    # Load the corpus once and filter it twice instead of constructing it twice.
    # NOTE(review): assumes filter_by_gender returns a new corpus and leaves the original
    # untouched -- other functions in this project already use this pattern.
    corpus = Corpus(corpus_name)
    male_corpus = corpus.filter_by_gender('male')
    female_corpus = corpus.filter_by_gender('female')
    return compare_word_association_between_corpus_analysis_dunning(
        word='he',
        corpus1=female_corpus, corpus1_name='female aut',
        corpus2=male_corpus, corpus2_name='male aut')
Esempio n. 5
0
def money_author_gender_differences(corpus_name):
    """
    Compares how male authors versus female authors refer to money by running the
    between-corpus Dunning analysis on a list of money-related terms.

    :param corpus_name: str, name of the corpus to load and split by author gender
    :return: the results returned by compare_word_association_between_corpus_analysis_dunning
    """
    # NOTE(review): the callee formats `word` into its pickle filename and forwards it to
    # novel.words_associated(); passing a list here relies on both accepting a list -- confirm.
    money_terms = ['money', 'dollars', 'pounds', 'euros', 'dollar', 'pound', 'euro', 'wealth',
                   'income']

    # Load the corpus once and filter it twice instead of constructing it twice.
    # NOTE(review): assumes filter_by_gender returns a new corpus and leaves the original
    # untouched -- other functions in this project already use this pattern.
    corpus = Corpus(corpus_name)
    male_corpus = corpus.filter_by_gender('male')
    female_corpus = corpus.filter_by_gender('female')
    return compare_word_association_between_corpus_analysis_dunning(
        word=money_terms,
        corpus1=female_corpus, corpus1_name='female aut',
        corpus2=male_corpus, corpus2_name='male aut')
Esempio n. 6
0
def america_author_gender_differences(corpus_name):
    """
    Compares how male authors versus female authors refer to America by running the
    between-corpus Dunning analysis on the words associated with 'America'.

    No filtering by author nationality happens here; only the author-gender split is applied.

    :param corpus_name: str, name of the corpus to load and split by author gender
    :return: the results returned by compare_word_association_between_corpus_analysis_dunning
    """
    # Load the corpus once and filter it twice instead of constructing it twice.
    # NOTE(review): assumes filter_by_gender returns a new corpus and leaves the original
    # untouched -- other functions in this project already use this pattern.
    corpus = Corpus(corpus_name)
    male_corpus = corpus.filter_by_gender('male')
    female_corpus = corpus.filter_by_gender('female')
    # NOTE(review): the search term is capitalised; confirm words_associated matches
    # case-sensitively (or normalises case) as intended.
    return compare_word_association_between_corpus_analysis_dunning(
        word='America',
        corpus1=female_corpus, corpus1_name='female aut',
        corpus2=male_corpus, corpus2_name='male aut')
Esempio n. 7
0
def male_vs_female_authors_analysis_dunning_lesser():
    """
    Run the male-vs-female author Dunning comparison on the small 'test_corpus' and print
    the first ten and (reversed) last ten entries of the result.

    :return: the value produced by dunning_total(male word counts, female word counts)
    """
    test_corpus = Corpus('test_corpus')
    male_authored = test_corpus.filter_by_gender('male')
    female_authored = test_corpus.filter_by_gender('female')
    male_counts = male_authored.get_wordcount_counter()
    female_counts = female_authored.get_wordcount_counter()

    ranking = dunning_total(male_counts, female_counts)

    # NOTE(review): the slicing below requires dunning_total to return a sliceable sequence
    # (presumably sorted by distinctiveness); a plain dict would raise here -- confirm.
    head = ranking[0:10]
    print("women's top 10: ", head)
    tail_reversed = list(reversed(ranking[-10:]))
    print("men's top 10: ", tail_reversed)
    return ranking
Esempio n. 8
0
def he_vs_she_associations_analysis_dunning(corpus_name):
    """
    Uses Dunning analysis to compare words associated with 'she' vs words associated with 'he'
    in the named corpus, then displays the top 20 terms overall and for verbs, adjectives,
    pronouns and adverbs.

    A pickled result is used when load_pickle finds one (an IOError is treated as a cache miss).

    :param corpus_name: str, name handed to Corpus() and embedded in the pickle filename
    :return: the dunning results (loaded from the pickle or freshly computed)
    """

    pickle_filename = f'dunning_he_vs_she_associated_words_{corpus_name}'
    try:
        results = load_pickle(pickle_filename)
    except IOError:
        # Only load the corpus when there is no cached result; it used to be loaded up front
        # even when the pickle made it unnecessary.
        corpus = Corpus(corpus_name)
        he_counter = Counter()
        she_counter = Counter()
        for novel in corpus.novels:
            he_counter.update(novel.words_associated("he"))
            she_counter.update(novel.words_associated("she"))
        # 'she' counts go first, matching corpus1_display_name below.
        results = dunning_total(she_counter, he_counter, filename_to_pickle=pickle_filename)

    # None means "no part-of-speech filter", i.e. the overall ranking.
    for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
        dunning_result_displayer(results, number_of_terms_to_display=20,
                                 corpus1_display_name='she...',
                                 corpus2_display_name='he..',
                                 part_of_speech_to_include=group)

    # Previously nothing was returned; returning the results is harmless for existing callers.
    return results
def test_analysis():
    """
    This function contains all analysis code to be run (previously in main function)
    - First generates a Stanford NLP parser
    - Iterates over the 'sample_novels' corpus and parses each novel with parse_novel
    - Prints each resulting row and writes it to dependency_analysis_results.csv

    The results file is opened once, so it ends up with one row per successfully parsed novel.
    A novel whose processing raises OSError is reported and skipped.  An OSError while opening
    the results file itself is not caught.
    """

    parser = get_parser("assets/stanford-parser.jar",
                        "assets/stanford-parser-3.9.1-models.jar")
    novels = Corpus('sample_novels').novels

    # The file used to be reopened with mode='w' for every novel, which truncated it each time
    # and left only the last row.  newline='' is what the csv module asks for on files it
    # writes to.
    with open('dependency_analysis_results.csv', mode='w', newline='') as results_file:
        writer = csv.writer(results_file,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for novel in novels:
            try:
                row = parse_novel(novel, parser)
                print(row)
                writer.writerow(row)
            except OSError as err:
                # Best-effort: keep going with the remaining novels, but say so.
                print(f"Skipping novel because of OSError: {err}")
                continue
Esempio n. 10
0
def male_VS_female_analysis_dunning(corpus_name, display_data=False):
    '''
    Uses dunning to compare the words that novel.words_associated('he') reports in
    male-authored novels against those in female-authored novels.

    If called with display_data=True, prints out the most distinctive terms overall as well as
    grouped by verbs, adjectives, pronouns and adverbs (top 20 each).

    :param corpus_name: str, name handed to Corpus() and embedded in the pickle filename
    :param display_data: bool, whether to display the results
    :return: dict
    '''

    # By default, try to load precomputed results. Only calculate if no stored results are
    # available.
    pickle_filename = f'dunning_male_vs_female_chars_{corpus_name}'
    try:
        results = load_pickle(pickle_filename)
    except IOError:
        # Kept as a local import, as in the original block.
        from collections import Counter

        c = Corpus(corpus_name)
        m_corpus = c.filter_by_gender('male')
        f_corpus = c.filter_by_gender('female')

        wordcounter_male = Counter()
        wordcounter_female = Counter()

        for novel in m_corpus:
            wordcounter_male += novel.words_associated('he')

        for novel in f_corpus:
            wordcounter_female += novel.words_associated('he')

        results = dunning_total(wordcounter_male, wordcounter_female,
                                filename_to_pickle=pickle_filename)

    if display_data:
        # dunning_total receives the male counter first, so corpus1 is the male-author side.
        # The labels used to be the other way round, attributing each column to the wrong
        # group.  The labels are fixed rather than the argument order, so that existing pickles
        # keep their meaning.
        # NOTE(review): assumes corpus1/corpus2 in dunning_result_displayer follow the argument
        # order of dunning_total, as the other callers in this project do -- confirm.
        for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
            dunning_result_displayer(results, number_of_terms_to_display=20,
                                     corpus1_display_name='Male Author',
                                     corpus2_display_name='Fem Author',
                                     part_of_speech_to_include=group)
    return results
Esempio n. 11
0
def create_corpus_summary_visualizations(corpus_name):
    """
    Produce the three corpus summary plots (author gender, publication year, publication
    country) for the named corpus.

    :param corpus_name: str; underscores are replaced by spaces before the name is handed to
        the plotting functions
    """
    corpus = Corpus(corpus_name)

    # Collect one metadata list per plot.
    years = [book.date for book in corpus.novels]
    genders = [book.author_gender for book in corpus.novels]
    countries = [book.country_publication for book in corpus.novels]

    display_name = corpus_name.replace('_', ' ')

    plt_gender_breakdown(genders, display_name)
    plt_pubyears(years, display_name)
    plt_pubcountries(countries, display_name)
Esempio n. 12
0
def test_analysis():
    """
    This function contains all analysis code to be run (previously in main function).

    Parses the first novel of the 'sample_novels' corpus, prints the parse result and then
    prints how many seconds the parse-and-print step took.
    """

    parser = get_parser("assets/stanford-parser.jar",
                        "assets/stanford-parser-3.9.1-models.jar")

    novels = Corpus('sample_novels').novels
    novel = novels[0]

    # perf_counter is the clock intended for measuring elapsed intervals; time.time() follows
    # the wall clock and can jump if the system time is adjusted.
    start = time.perf_counter()
    print(parse_novel(novel, parser))
    end = time.perf_counter()
    print(end - start)
def run_analysis(corpus_name):
    """
    Run instance distance analyses on a particular corpus and saves results as pickle files.
    Comment out sections of code or analyses that have already been run or are unnecessary.

    Steps: load the corpus, run run_distance_analysis on its novels, store the raw results,
    reload them, aggregate by location / author gender / date using both "mean" and "median",
    store each aggregate, then store p-values and the top-twenty distance lists.

    :param corpus_name: str, name of the corpus; also used as suffix of the pickle names
    :return: None
    """
    print('loading corpus')
    corpus = Corpus(corpus_name)
    novels = corpus.novels

    print('running analysis')
    results = run_distance_analysis(novels)

    print('storing results')
    store_raw_results(results, corpus_name)

    r = common.load_pickle("instance_distance_raw_analysis_" + corpus_name)
    r2 = results_by_location(r, "mean")
    r3 = results_by_author_gender(r, "mean")
    # This used to pass "median", so the pickle named "mean_..._by_date_" held median values.
    r4 = results_by_date(r, "mean")
    r5 = results_by_location(r, "median")
    r6 = results_by_author_gender(r, "median")
    r7 = results_by_date(r, "median")

    common.store_pickle(r2,
                        "mean_instance_distances_by_location_" + corpus_name)
    common.store_pickle(
        r3, "mean_instance_distances_by_author_gender_" + corpus_name)
    common.store_pickle(r4, "mean_instance_distances_by_date_" + corpus_name)

    common.store_pickle(r5,
                        "median_instance_distances_by_location_" + corpus_name)
    common.store_pickle(
        r6, "median_instance_distances_by_author_gender_" + corpus_name)
    common.store_pickle(r7, "median_instance_distances_by_date_" + corpus_name)

    # NOTE(review): the next two calls are hard-coded to "gutenberg" and their pickles carry no
    # corpus suffix, regardless of corpus_name -- confirm whether corpus_name was intended.
    pvals = get_p_vals("gutenberg")
    common.store_pickle(pvals, "instance_distance_comparison_pvals")

    male_top_twenty, female_top_twenty, diff_top_twenty = get_highest_distances(
        "gutenberg", 20)
    top_twenties = {
        'male_pronoun_top_twenty': male_top_twenty,
        'female_pronoun_top_twenty': female_top_twenty,
        "difference_top_twenty": diff_top_twenty
    }
    common.store_pickle(top_twenties, "instance_distance_top_twenties")
Esempio n. 14
0
def test_analysis():
    """
    This function contains all analysis code to be run (previously in main function).

    Parses every novel of the 'sample_novels' corpus with parse_novel, prints each resulting
    row and writes all rows to dependency_analysis_results.csv.
    """

    parser = get_parser("assets/stanford-parser.jar",
                        "assets/stanford-parser-3.9.1-models.jar")
    novels = Corpus('sample_novels').novels

    # The file used to be reopened with mode='w' inside the loop, which truncated it on every
    # novel and left only the last row.  newline='' is what the csv module asks for on files it
    # writes to.
    with open('dependency_analysis_results.csv', mode='w', newline='') as results_file:
        writer = csv.writer(results_file,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
        for novel in novels:
            row = parse_novel(novel, parser)
            print(row)
            writer.writerow(row)
Esempio n. 15
0
def compare_word_association_in_corpus_analysis_dunning(word1, word2, corpus=None,
                                                        corpus_name=None):
    """
    Uses Dunning analysis to compare words associated with word1 vs words associated with word2 in
    the Corpus passed in as the parameter.  If a corpus and corpus_name are passed in, then the
    analysis will use the corpus but name the file after corpus_name.  If no corpus is passed in but
    a corpus_name is, then the method will try to create a Corpus by corpus = Corpus(corpus_name).
    If neither a corpus nor a corpus_name is passed in, analysis is simply done on the Gutenberg
    corpus.

    Cached results are looked up first under the word1-vs-word2 filename and then under the
    word2-vs-word1 filename.  Freshly computed results are stored under the word1-vs-word2 name.

    :param word1: str
    :param word2: str
    :param corpus: Corpus
    :param corpus_name: str
    :return: dict
    """

    if corpus:
        if not corpus_name:
            corpus_name = corpus.corpus_name
    else:
        if not corpus_name:
            corpus_name = "gutenberg"
        corpus = Corpus(corpus_name)

    pickle_filename = f'dunning_{word1}_vs_{word2}_associated_words_{corpus_name}'
    swapped_filename = f'dunning_{word2}_vs_{word1}_associated_words_{corpus_name}'
    try:
        results = load_pickle(pickle_filename)
    except IOError:
        try:
            # NOTE(review): a pickle found under the swapped name was presumably computed with
            # the two words in the opposite order, so its results may be oriented the other way
            # round -- confirm callers can cope with that.
            results = load_pickle(swapped_filename)
        except IOError:
            # This was a bare `except:`, which would also hide KeyboardInterrupt and genuine
            # bugs.  A missing pickle is the only case handled here, as in the outer lookup.
            word1_counter = Counter()
            word2_counter = Counter()
            for novel in corpus.novels:
                word1_counter.update(novel.words_associated(word1))
                word2_counter.update(novel.words_associated(word2))
            # Store under the word1-vs-word2 name.  The filename variable used to be left
            # pointing at the swapped name, so word1-first results were filed as word2-vs-word1.
            results = dunning_total(word1_counter, word2_counter,
                                    filename_to_pickle=pickle_filename)

    # None means "no part-of-speech filter", i.e. the overall ranking.
    for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
        dunning_result_displayer(results, number_of_terms_to_display=50,
                                 part_of_speech_to_include=group)

    return results
def run_analysis(corpus_name):
    """
    Run the pronoun/adjective analysis on the named corpus, store the raw results, then
    reload, merge and reduce them to the overlapping adjectives, store that and pretty-print it.

    :param corpus_name: str, corpus to analyse; also appended to the pickle names
    """
    print("loading corpus", corpus_name)
    corpus_novels = Corpus(corpus_name).novels

    print("running analysis")
    raw = run_adj_analysis(corpus_novels)

    print("storing results")
    store_raw_results(raw, corpus_name)

    # Post-process from the stored pickle rather than from the in-memory value.
    reloaded = common.load_pickle("pronoun_adj_raw_analysis" + corpus_name)
    overlapping = get_overlapping_adjectives_raw_results(merge_raw_results(reloaded))
    common.store_pickle(overlapping, "pronoun_adj_final_results" + corpus_name)

    # Comment out pprint for large databases where it's not practical to print out results
    pprint(overlapping)
def stat_analysis(corpus_name='sample_novels'):
    """
    Print two statistics for the named corpus, both comparing male and female authors via
    get_p_and_ttest_value: one on pronoun frequencies grouped by author gender, one on the
    subject-vs-object pronoun frequencies.

    :param corpus_name: str, corpus to analyse (defaults to 'sample_novels')
    """
    corpus = Corpus(corpus_name)

    # Pronoun frequency per book, grouped by the gender of the author.
    freq_by_gender = freq_by_author_gender(books_pronoun_freq(corpus))
    pronoun_stats = get_p_and_ttest_value(freq_by_gender['male_author'],
                                          freq_by_gender["female_author"])
    print("values for gender pronoun stats: ", pronoun_stats)

    # Subject-vs-object frequencies: element 0 is used as the male data, element 1 as female.
    subject_object_freqs = subject_vs_object_pronoun_freqs(corpus)
    male_values = dict_to_list(subject_object_freqs[0])
    female_values = dict_to_list(subject_object_freqs[1])

    subject_object_stats = get_p_and_ttest_value(male_values, female_values)
    print("values for subject vs object pronouns between male and female authors: ", subject_object_stats)
    ax.set_ylabel('Median Values')
    ax.set_title('Distance between Word Instances by Book and Author')
    ax.set_xticks(index + bar_width / 2)
    plt.xticks(fontsize=8, rotation=90)
    ax.set_xticklabels(book)
    ax.legend()

    fig.tight_layout()
    #plt.show()
    filepng = "visualizations/" + title + ".png"
    filepdf = "visualizations/" + title + ".pdf"
    plt.savefig(filepng, bbox_inches='tight')
    plt.savefig(filepdf, bbox_inches='tight')

if __name__ == '__main__':
    corpus = Corpus('sample_novels')
    novels = corpus._load_novels()

    num = 0


    #while num <10:
    medians_he = []
    medians_she = []
    books = []
    for novel in novels[num * 10:num * 10 + 9]:
        result_he = instance_dist(novel, "he")
        result_she = instance_dist(novel, "she")
        try:
            medians_he.append(median(result_he))
        except:
def run_all_analyses():
    '''
    Runs analyses on the 'gutenberg' corpus for:
        Female and Male pronoun frequency by:
            author gender, publication date, publication location
        Female and Male Subject Object frequency Comparison by:
            author gender, publication date, publication location
        Comparative female subject-pronoun frequency by:
            author gender, publication date, publication location
    Prints results nicely
    :return: None
    '''
    # Build the corpus once and share it between the three analyses; it used to be constructed
    # three separate times.
    # NOTE(review): assumes the analysis functions below do not mutate the corpus -- confirm.
    corpus = Corpus('gutenberg')

    all_data = books_pronoun_freq(corpus)

    gender = freq_by_author_gender(all_data)
    date = freq_by_date(all_data)
    location = freq_by_location(all_data)

    print('Male/Female pronoun comparison: ')
    print('By author gender: ')
    print(get_mean(gender))
    print('\n By date: ')
    print(get_mean(date))
    print('\n By location: ')
    print(get_mean(location))

    # Index 0 is treated as the male data and index 1 as the female data below.
    sub_v_ob = subject_vs_object_pronoun_freqs(corpus)

    female_gender_sub_v_ob = freq_by_author_gender(sub_v_ob[1])
    female_date_sub_v_ob = freq_by_date(sub_v_ob[1])
    female_loc_sub_v_ob = freq_by_location(sub_v_ob[1])

    male_gender_sub_v_ob = freq_by_author_gender(sub_v_ob[0])
    male_date_sub_v_ob = freq_by_date(sub_v_ob[0])
    male_loc_sub_v_ob = freq_by_location(sub_v_ob[0])

    male_tot = dict_to_list(sub_v_ob[0])
    female_tot = dict_to_list(sub_v_ob[1])

    print('Subject/Object comparisons: ')
    print('Male vs Female in the subject: ')
    print('Male: ')
    pprint.pprint(np.mean(male_tot))
    print('Female: ')
    pprint.pprint(np.mean(female_tot))
    print('\n Female pronouns: ')
    print('By author gender: ')
    pprint.pprint(get_mean(female_gender_sub_v_ob))
    print('By date: ')
    pprint.pprint(get_mean(female_date_sub_v_ob))
    print('By location: ')
    pprint.pprint(get_mean(female_loc_sub_v_ob))
    print('\n Male pronouns: ')
    print('By author gender: ')
    pprint.pprint(get_mean(male_gender_sub_v_ob))
    print('By date:')
    pprint.pprint(get_mean(male_date_sub_v_ob))
    print('By location: ')
    pprint.pprint(get_mean(male_loc_sub_v_ob))

    sub_comp_gender = subject_pronouns_gender_comparison(corpus, 'female')
    sub_comp_gender_list = dict_to_list(sub_comp_gender)

    print('Overall comparative female freq:')
    pprint.pprint(np.mean(sub_comp_gender_list))
    print('By author gender:')
    pprint.pprint(get_mean(freq_by_author_gender(sub_comp_gender)))
    print('By date: ')
    pprint.pprint(get_mean(freq_by_date(sub_comp_gender)))
    print('By location: ')
    pprint.pprint(get_mean(freq_by_location(sub_comp_gender)))
Esempio n. 20
0
 def test_dunning_total(self):
     """Smoke-run dunning_total on the gender-split 'sample_novels' corpus and print a slice.

     Nothing is asserted; the test only fails if one of the calls raises.
     """
     sample = Corpus('sample_novels')
     male_part = sample.filter_by_gender('male')
     female_part = sample.filter_by_gender('female')
     # NOTE(review): the corpora themselves (not word counters) are passed in, and the result
     # is sliced -- confirm this matches the dunning_total version under test.
     outcome = dunning_total(male_part, female_part)
     print(outcome[10::])
Esempio n. 21
0
class Test(unittest.TestCase):
    """Smoke test for dunning_total on the 'sample_novels' corpus."""

    def test_dunning_total(self):
        """Run dunning_total on the gender-split corpus and print everything from index 10 on.

        Nothing is asserted; the test only fails if one of the calls raises.
        """
        sample = Corpus('sample_novels')
        male_part = sample.filter_by_gender('male')
        female_part = sample.filter_by_gender('female')
        # NOTE(review): the corpora themselves (not word counters) are passed in, and the
        # result is sliced -- confirm this matches the dunning_total version under test.
        outcome = dunning_total(male_part, female_part)
        print(outcome[10::])


# Script entry point: runs the instance-distance and gender-frequency analyses on the
# 'sample_novels' corpus.
if __name__ == '__main__':
    # unittest.main()
    # The triple-quoted string below is a disabled earlier experiment (adjective counts for a
    # single novel); as a bare string expression it has no effect at runtime.
    '''
    print("loading corpus")
    corpus = Corpus('sample_novels')
    print("loading novel")
    novel = corpus._load_novels()[15]
    print(novel.author, novel.title, novel.word_count)
    print("running function")
    result = find_male_adj(novel)
    output = []
    for key in result.keys():
        output.append((result[key], key))
    print(sorted(output, reverse=True))
    '''
    c = Corpus('sample_novels')
    run_dist_inst(c)
    run_gender_freq(c)
    # Marker printed once both analyses have returned.
    print("hello")
Esempio n. 22
0
       ...                   'filename': None, 'text': summary}
       >>> scarlett = novel.Novel(novel_metadata)
       >>> find_female_adj(scarlett)
       {'beautiful': 3, 'sad': 1}

       :param:novel
       :return: dictionary of adjectives that appear around female pronouns and the number of occurences

       """
    return find_gender_adj(novel, True)


# Script entry point: calls test_function(), then prints the find_male_adj result for one
# novel of the 'sample_novels' corpus, ordered by count (highest first).
if __name__ == '__main__':
    test_function()
    print("loading corpus")
    corpus = Corpus('sample_novels')
    print("loading novel")
    # Uses the private loader directly and picks the novel at index 15.
    novel = corpus._load_novels()[15]
    print(novel.author, novel.title, novel.word_count)
    print("running function")
    result = find_male_adj(novel)
    # Build (value, key) pairs so that sorting orders by the value first.
    output = []
    for key in result.keys():
        output.append((result[key], key))
    print(sorted(output, reverse=True))


def process_medians(helst, shelst, authlst):
    """
    >>> medians_he = [12, 130, 0, 12, 314, 18, 15, 12, 123]
    >>> medians_she = [123, 52, 12, 345, 0,  13, 214, 12, 23]
Esempio n. 23
0
if __name__ == "__main__":
    '''
    Finds the minimum p-value to deem the relationship between metadata variables and analysis 
    results significant
    
    Independent variables (metadata) include:
        author gender
        year of publication
        country of publication
    
    Dependent variables:
        distance between 'he' and 'she'
        the frequency of gendered pronouns used as subjects or objects
    '''

    corp = Corpus('test_corpus')
    # corp = Corpus('gutenberg')
    # corp = Corpus('sample_novels')
    subject_female_pronoun_dict = gender_pronoun_freq_analysis.subject_pronouns_gender_comparison(
        corp, 'female')
    # create lists for novels, publication date, etc. with all entries in the same
    # corresponding order
    novel_list = []
    novel_year_list = []
    novel_author_gender_list = []
    subject_female_pronoun_list = []

    for novel in corp:
        novel_list.append(novel)
        novel_year_list.append(novel.date)
        novel_author_gender_list.append(novel.author_gender)