def male_female_descendant_numbers_by_decade():
    """
    Print, per decade, how many male- and female-authored theses have descendants.

    For each of the decades starting in 1980, 1990 and 2000, a fresh DissertationDataset is
    filtered to that decade. Theses are split by author gender ('male' / 'female') and by
    whether m_descendants is > 0 or == 0. The counts, the percentage of theses with
    descendants, and the p-value of Fisher's exact test on the 2x2 table are printed.

    :return: None
    """
    for decade_start in (1980, 1990, 2000):
        dataset = DissertationDataset()
        dataset.filter(start_year=decade_start, end_year=decade_start + 9)
        theses = dataset.df

        # Boolean row masks, combined pairwise below into the four cells of the 2x2 table
        by_men = theses.m_author_genders == 'male'
        by_women = theses.m_author_genders == 'female'
        with_desc = theses.m_descendants > 0
        without_desc = theses.m_descendants == 0

        male_desc = len(theses[by_men & with_desc])
        male_nodesc = len(theses[by_men & without_desc])
        female_desc = len(theses[by_women & with_desc])
        female_nodesc = len(theses[by_women & without_desc])

        contingency_table = [[male_desc, male_nodesc],
                             [female_desc, female_nodesc]]
        fisher_p = fisher_exact(contingency_table)[1]

        print(f'\n{decade_start}s')

        male_total = male_desc + male_nodesc
        print('Men:   {:5d} theses, {:4d} with descendants. {:2.2f}%'.format(
            male_total, male_desc, male_desc / male_total * 100))

        female_total = female_desc + female_nodesc
        print('Women: {:5d} theses, {:4d} with descendants. {:2.2f}%'.format(
            female_total, female_desc, female_desc / female_total * 100))

        print('Fisher\'s Exact Test p-value: {:0.3f}'.format(fisher_p))
Exemple #2
0
def overall_analyses():
    """
    Run the women-vs-men divergence analysis for every dataset / analysis type combination.

    Iterates over the 'journals' and 'dissertations' datasets and, for each, over the
    'topics' and 'terms' analysis types. Each combination loads a fresh dataset, builds a
    female-author and a male-author sub-corpus from copies of it, and prints the 12 most
    divergent terms or topics sorted by Dunning score.

    :return: None
    """
    for dataset_name in ('journals', 'dissertations'):
        for analysis_type in ('topics', 'terms'):

            print('\n\n\n', dataset_name, analysis_type)

            dataset_cls = (JournalsDataset
                           if dataset_name == 'journals' else DissertationDataset)
            full_corpus = dataset_cls()

            # Sub-corpora are built from copies of the full corpus
            women_corpus = full_corpus.copy().filter(author_gender='female')
            men_corpus = full_corpus.copy().filter(author_gender='male')

            analysis = DivergenceAnalysis(full_corpus,
                                          women_corpus,
                                          men_corpus,
                                          sub_corpus1_name='women',
                                          sub_corpus2_name='men',
                                          analysis_type=analysis_type,
                                          sort_by='dunning')
            analysis.run_divergence_analysis(
                number_of_terms_or_topics_to_print=12)
Exemple #3
0
def draw_journal_and_dissertation_overview():
    """
    Draw the gender frequency scatterplots for the full dissertation and journal datasets.

    Four plots are produced: dissertations (1980-2010) and journals (filtered to 1951-2010),
    each once with absolute weights and once without. The absolute-weight variants get
    '_absolute_weights' appended to the file name and ', Absolute Weights' to the title.

    :return: None
    """
    for use_absolute_weights in (True, False):
        for dataset_name in ('dissertations', 'journals'):

            if dataset_name != 'journals':
                dataset = DissertationDataset()
                dataset.filter(start_year=1980, end_year=2010)
                title = 'Dissertations, 1980-2010'
            else:
                dataset = JournalsDataset()
                dataset.filter(start_year=1951, end_year=2010)
                # NOTE(review): the title says 1950 while the filter starts at 1951 --
                # confirm which of the two is intended.
                title = 'Journals, 1950-2010'

            if use_absolute_weights:
                filename = f'topic_scatter_{dataset_name}_absolute_weights.png'
                title = title + ', Absolute Weights'
            else:
                filename = f'topic_scatter_{dataset_name}.png'

            plot_options = dict(figsize=12,
                                show_labels=True,
                                transparent_image=False,
                                dynamic_y_coords=False,
                                filename=filename,
                                show_plot=True,
                                title=title,
                                use_absolute_weights=use_absolute_weights)
            draw_gender_frequency_scatterplot(dataset, **plot_options)
Exemple #4
0
def get_number_of_male_and_female_authored_articles_by_year(
        start_year, end_year, dataset_name) -> (list, list, list):
    """
    Return the yearly shares of male- and female-authored documents plus the list of years.

    Documents are counted per year over the whole dataset, each series is smoothed with a
    centered 5-year rolling mean (min_periods=1), and the smoothed series are cut down to
    start_year..end_year. Both series are then divided by their sum, so for every year the
    male and female values add up to 1.

    :param start_year: first year to include in the result
    :param end_year: last year to include in the result (inclusive)
    :param dataset_name: 'journals' loads JournalsDataset, anything else DissertationDataset
    :return: (male_data, female_data, years); the first two are numpy arrays of shares, years
             is a list of ints from start_year to end_year
    """
    d = JournalsDataset() if dataset_name == 'journals' else DissertationDataset()

    number_of_years = d.end_year - d.start_year + 1
    male_counts = [0] * number_of_years
    female_counts = [0] * number_of_years

    # Tally documents per year; rows whose gender is neither 'male' nor 'female' are ignored
    for _, row in d.df.iterrows():
        gender = row.m_author_genders
        if gender == 'male':
            male_counts[row.m_year - d.start_year] += 1
        elif gender == 'female':
            female_counts[row.m_year - d.start_year] += 1

    def smoothed(yearly_counts):
        # Centered 5-year rolling mean; min_periods=1 keeps the edge years defined
        window = pd.DataFrame(yearly_counts).rolling(center=True,
                                                     window=5,
                                                     min_periods=1)
        return window.mean()[0].tolist()

    rolling_mean_male = smoothed(male_counts)
    rolling_mean_female = smoothed(female_counts)

    # Offsets of the requested range relative to the first year of the dataset
    first = start_year - d.start_year
    last = end_year + 1 - d.start_year
    male_data = np.array(rolling_mean_male[first:last])
    female_data = np.array(rolling_mean_female[first:last])

    # Normalize so that both series are shares of the combined total
    totals = male_data + female_data
    male_data = male_data / totals
    female_data = female_data / totals

    years = list(range(start_year, end_year + 1))

    assert len(male_data) == len(female_data) == len(years)

    return male_data, female_data, years
def show_male_female_publications_over_time(dataset='journals'):
    """
    Quick visualization of the share of publications written by women over time.

    Male- and female-authored documents are counted per year and both series are smoothed
    with a centered 5-year rolling mean. The plotted line is female / (female + male). The
    figure is saved to BASE_PATH/visualizations/dataset_summaries/male_female_articles.png
    and then shown.

    :param dataset: 'journals' loads JournalsDataset; any other value loads
                    DissertationDataset filtered to start_year=1980
    :return: (rolling_male, rolling_female, x): numpy arrays with the smoothed yearly counts
             and the list of years they refer to
    """

    if dataset == 'journals':
        d = JournalsDataset()
    else:
        d = DissertationDataset()
        d.filter(start_year=1980)

    male_counter = Counter()
    female_counter = Counter()
    for _, row in d.df.iterrows():
        if row.m_author_genders == 'male':
            male_counter[row.m_year] += 1
        elif row.m_author_genders == 'female':
            female_counter[row.m_year] += 1

    all_years = list(range(d.start_year, d.end_year + 1))
    male_arr = [male_counter[year] for year in all_years]
    female_arr = [female_counter[year] for year in all_years]

    def trimmed_rolling_mean(yearly_counts):
        # The centered 5-year window leaves NaN in the first two and last two years.
        # NOTE(review): the slice drops the last five years, three more than the NaN edge --
        # presumably to hide incomplete recent years; confirm.
        smoothed = pd.DataFrame(yearly_counts).rolling(center=True,
                                                       window=5).mean()
        return np.array(smoothed[0].tolist()[2:-5])

    rolling_female = trimmed_rolling_mean(female_arr)
    rolling_male = trimmed_rolling_mean(male_arr)

    # Same trimming as the smoothed series so x and y stay aligned
    x = all_years[2:-5]

    plt.figure(figsize=(6, 6))
    plt.plot(x, rolling_female / (rolling_female + rolling_male), color='blue')

    # The title describes the single line that is drawn: the female share of publications
    plt.title('Share of publications by women (5-year rolling mean)')

    plt.savefig(
        Path(BASE_PATH, 'visualizations', 'dataset_summaries',
             'male_female_articles.png'))
    plt.show()

    return rolling_male, rolling_female, x
def topics_and_descendants_female():
    """
    Compare the topics of female-authored dissertations with and without descendants.

    The dissertation dataset is restricted to 1980-1999 and to female authors. It is then
    split into theses that have descendants and theses that do not, and the 10 most
    divergent topics (sorted by Dunning score) are printed.

    :return: None
    """
    women = DissertationDataset()
    women.filter(start_year=1980, end_year=1999)
    women.filter(author_gender='female')

    # Each sub-corpus starts from its own copy of the filtered dataset
    copy_for_desc = women.copy()
    with_descendants = copy_for_desc.filter(has_descendants=True)
    copy_for_nodesc = women.copy()
    without_descendants = copy_for_nodesc.filter(has_descendants=False)

    analysis = DivergenceAnalysis(women,
                                  with_descendants,
                                  without_descendants,
                                  sub_corpus1_name='has descendants',
                                  sub_corpus2_name='no descendants',
                                  analysis_type='topics',
                                  sort_by='dunning')
    analysis.run_divergence_analysis(number_of_terms_or_topics_to_print=10)
Exemple #7
0
def topics_and_descendants_female_student():
    """
    Compare dissertations by women with a female advisor to those with a male advisor.

    The dissertation dataset is restricted to 1990-2015 and to female authors, then split
    by advisor gender. The divergence analysis uses the 'gen_approach' analysis type and
    prints the 10 most divergent entries sorted by Dunning score.

    Despite the function name, descendants are not used here; the split is by advisor gender.

    :return: None
    """
    women = DissertationDataset()
    women.filter(start_year=1990, end_year=2015)
    women.filter(author_gender='female')

    # Each sub-corpus starts from its own copy of the filtered dataset
    copy_for_female_adv = women.copy()
    female_advised = copy_for_female_adv.filter(advisor_gender='female')
    copy_for_male_adv = women.copy()
    male_advised = copy_for_male_adv.filter(advisor_gender='male')

    analysis = DivergenceAnalysis(women,
                                  female_advised,
                                  male_advised,
                                  sub_corpus1_name='woman with female advisor',
                                  sub_corpus2_name='woman with male advisor',
                                  analysis_type='gen_approach',
                                  sort_by='dunning')
    analysis.run_divergence_analysis(number_of_terms_or_topics_to_print=10)
Exemple #8
0
def male_female_advisor_numbers():
    """
    Print, per decade, how many male and female students had male / female / unknown advisors.

    For each decade starting in 1980, 1990, 2000 and 2010, a fresh DissertationDataset is
    filtered to that decade and split by author gender. For each student gender the number
    of theses per advisor gender is printed together with the percentage of female advisors
    among the advisors of known gender (male + female).

    :return: None
    """
    men_template = 'Men:   {:4d} male advisors, {:4d} female advisors. {:4d} unknown advisors. {:2.2f}% female advisors'
    women_template = 'Women: {:4d} male advisors, {:4d} female advisors. {:4d} unknown advisors. {:2.2f}% female advisors'

    for decade_start in (1980, 1990, 2000, 2010):

        dataset = DissertationDataset()
        dataset.filter(start_year=decade_start, end_year=decade_start + 9)
        theses = dataset.df

        # counts[student gender] = (male advisors, female advisors, unknown advisors)
        counts = {}
        for student_gender in ('male', 'female'):
            students = theses[(theses.m_author_genders == student_gender)]
            counts[student_gender] = tuple(
                len(students[students.m_advisor_gender == advisor_gender])
                for advisor_gender in ('male', 'female', 'unknown'))

        print(f'\n{decade_start}s')

        male_adv, female_adv, unknown_adv = counts['male']
        print(
            men_template.format(male_adv, female_adv, unknown_adv,
                                female_adv / (male_adv + female_adv) * 100))

        male_adv, female_adv, unknown_adv = counts['female']
        print(
            women_template.format(male_adv, female_adv, unknown_adv,
                                  female_adv / (male_adv + female_adv) * 100))
    def get_stanford_name_to_gender_dict():
        """
        Load the first-name-to-gender mapping from its JSON cache, building it if necessary.

        If no cache file exists, the mapping is built from the dissertation dataset: the part
        of AdviseeID before the first ':' is turned into a capitalized, space-separated name,
        parsed with HumanName, and its first name is mapped to the row's 'AdviseeGender.1'
        value. When several rows share a first name, the last row wins. The result is written
        to the cache as JSON.

        :return: dict mapping first names to the 'AdviseeGender.1' value
        """
        # The cache is written to cache_path. legacy_path is checked first so that a file
        # already present at the older location keeps being used.
        legacy_path = Path('data', 'stanford_name_to_gender.json')
        cache_path = Path('data', 'gender_inference', 'stanford_name_to_gender.json')

        for candidate in (legacy_path, cache_path):
            try:
                with open(candidate, 'r') as infile:
                    return json.load(infile)
            except FileNotFoundError:
                continue

        d = DissertationDataset()
        stanford_name_to_gender_dict = {}
        for _, row in d.df.iterrows():
            name = row.AdviseeID.split(':')[0].replace('_', ' ').replace('-', ' ')
            name = " ".join([x.capitalize() for x in name.split()])
            human_name = HumanName(name)
            guess_proquest = row['AdviseeGender.1']
            stanford_name_to_gender_dict[human_name.first] = guess_proquest

        # store dict of stanford names to gender; create the directory if it is missing so
        # the write cannot fail with FileNotFoundError
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_path, 'w') as out:
            json.dump(stanford_name_to_gender_dict, out, sort_keys=True, indent=4)
        return stanford_name_to_gender_dict
Exemple #10
0
def analysis_military_history():
    """
    Run the women-vs-men divergence analysis on articles that score highly for topic 31.

    Currently only the 'journals' dataset and the 'topics' analysis type are iterated. The
    dataset is reduced with topic_score_filter(31, min_percentile_score=90), split into
    female- and male-authored sub-corpora, and the 10 most divergent topics are printed,
    followed by 5 articles for each of the top 10 topics.

    :return: None
    """
    for dataset_name in ('journals',):
        for analysis_type in ('topics',):

            print('\n\n\n', dataset_name, analysis_type)

            dataset_cls = (JournalsDataset
                           if dataset_name == 'journals' else DissertationDataset)
            corpus = dataset_cls()

            # Only the topics analysis is compared against the overall weights
            compare_to_overall_weights = analysis_type == 'topics'

            # retain only the articles scoring in the top 10% for topic 31 (military history)
            corpus.topic_score_filter(31, min_percentile_score=90)

            # Female- and male-author sub-corpora, each built from a copy of the filtered corpus
            women_corpus = corpus.copy().filter(author_gender='female')
            men_corpus = corpus.copy().filter(author_gender='male')

            analysis = DivergenceAnalysis(
                corpus,
                women_corpus,
                men_corpus,
                sub_corpus1_name='women',
                sub_corpus2_name='men',
                analysis_type=analysis_type,
                sort_by='dunning',
                compare_to_overall_weights=compare_to_overall_weights)
            analysis.run_divergence_analysis(
                number_of_terms_or_topics_to_print=10)

            analysis.print_articles_for_top_topics(
                top_terms_or_topics=10, articles_per_term_or_topic=5)
Exemple #11
0
def draw_all_years():
    """
    Draw one gender frequency scatterplot per decade and dataset.

    Decades start in 1960, 1970, ..., 2000. Journals are plotted for every decade;
    dissertations are only plotted for decades starting in 1980 or later. Each plot is
    written to 'topic_scatter_<dataset>_<start>-<end>.png' and shown.

    :return: None
    """
    for start_year in range(1960, 2010, 10):
        end_year = start_year + 9
        for dataset_name in ['dissertations', 'journals']:

            if dataset_name == 'journals':
                dataset = JournalsDataset()
            else:
                # Check before constructing the dataset so that nothing is loaded for
                # decades that are skipped anyway
                if start_year < 1980:
                    continue
                dataset = DissertationDataset()

            dataset.filter(start_year=start_year, end_year=end_year)

            draw_gender_frequency_scatterplot(
                dataset,
                figsize=12,
                show_labels=True,
                transparent_image=False,
                dynamic_y_coords=False,
                filename=
                f'topic_scatter_{dataset_name}_{start_year}-{end_year}.png',
                show_plot=True,
                title=f'{dataset_name.capitalize()}, {start_year}s')