Example #1
def draw_journal_and_dissertation_overview():

    for use_absolute_weights in [True, False]:
        for dataset_name in ['dissertations', 'journals']:

            if dataset_name == 'journals':
                dataset = JournalsDataset()
                dataset.filter(start_year=1951, end_year=2010)
                title = 'Journals, 1951-2010'
            else:
                dataset = DissertationDataset()
                dataset.filter(start_year=1980, end_year=2010)
                title = 'Dissertations, 1980-2010'

            filename = f'topic_scatter_{dataset_name}.png'
            if use_absolute_weights:
                filename = f'topic_scatter_{dataset_name}_absolute_weights.png'
                title += ', Absolute Weights'

            draw_gender_frequency_scatterplot(
                dataset,
                figsize=12,
                show_labels=True,
                transparent_image=False,
                dynamic_y_coords=False,
                filename=filename,
                show_plot=True,
                title=title,
                use_absolute_weights=use_absolute_weights)
Example #2
def draw_heatmap():

    dataset = JournalsDataset()
    dataset.filter(author_gender='male')

    print(len(dataset))

    topic_selector = [f'topic.{i}' for i in range(1, 91)]
    topic_df = dataset.df[topic_selector]
    topic_id_to_name = {
        f'topic.{i}': dataset.topics[i]['name']
        for i in range(1, 91)
    }
    topic_df = topic_df.rename(columns=topic_id_to_name)

    correlations = topic_df.corr()

    # Zero out the self-correlations so the diagonal does not dominate the color scale.
    for i in range(90):
        correlations.iat[i, i] = 0.0

    # The correlation matrix already carries topic names as row and column
    # labels after the rename above, so clustermap can label the axes itself;
    # passing explicit tick-label lists in the original topic order would
    # mislabel the axes once clustering reorders the rows and columns.
    sns.clustermap(
        correlations,
        figsize=(20, 20),
        row_cluster=True,
        col_cluster=True,
        cmap='vlag',
        vmin=-0.25,
        vmax=0.25,
        method='ward')
    plt.show()
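For reference, here is the same correlation-clustermap pattern in a self-contained form, using synthetic topic weights in place of the repo's JournalsDataset (the shapes, column names, and color limits are illustrative assumptions):

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Synthetic document-topic weights: 500 documents x 10 topics (made-up data).
rng = np.random.default_rng(0)
topic_df = pd.DataFrame(rng.dirichlet(np.ones(10), size=500),
                        columns=[f'topic {i}' for i in range(1, 11)])

# Topic-topic correlations; zero the diagonal so the self-correlation of 1.0
# does not dominate the color scale.
correlations = topic_df.corr()
np.fill_diagonal(correlations.values, 0.0)

# Hierarchically clustered heatmap; labels come from the DataFrame itself.
sns.clustermap(correlations, cmap='vlag', vmin=-0.25, vmax=0.25, method='ward')
plt.show()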
Example #3
def show_male_female_publications_over_time(dataset='journals'):
    """
    Quick visualization of number of articles by men and women

    :return:
    """

    if dataset == 'journals':
        d = JournalsDataset()
    else:
        d = DissertationDataset()
        d.filter(start_year=1980)
    male_counter = Counter()
    female_counter = Counter()

    for _, row in d.df.iterrows():

        if row.m_author_genders == 'male':
            male_counter[row.m_year] += 1
        if row.m_author_genders == 'female':
            female_counter[row.m_year] += 1

    male_arr = []
    female_arr = []
    for year in range(d.start_year, d.end_year + 1):
        male_arr.append(male_counter[year])
        female_arr.append(female_counter[year])

    # Smooth the yearly counts with a centered 5-year rolling mean and trim
    # the ends, where the window is incomplete (plus a few trailing years).
    rolling_female = np.array(
        pd.Series(female_arr).rolling(center=True, window=5).mean().tolist()[2:-5])
    rolling_male = np.array(
        pd.Series(male_arr).rolling(center=True, window=5).mean().tolist()[2:-5])

    x = list(range(d.start_year, d.end_year + 1))[2:-5]

    plt.figure(figsize=(6, 6))
    # Plot the share of articles written by women; the absolute male counts
    # remain available but are not plotted.
    plt.plot(x, rolling_female / (rolling_female + rolling_male), color='blue')
    # plt.plot(x, rolling_male, color='red')

    plt.title('Share of articles written by women')

    plt.savefig(
        Path(BASE_PATH, 'visualizations', 'dataset_summaries',
             'male_female_articles.png'))
    plt.show()

    return rolling_male, rolling_female, x
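The smoothing above relies on pandas' centered rolling mean, which leaves NaNs at both ends where the 5-year window is incomplete; a tiny standalone illustration with made-up yearly counts:

import pandas as pd

# Made-up yearly counts smoothed with a centered 5-year window.
counts = pd.Series([4, 7, 6, 9, 12, 10, 14, 13, 15, 18])
smoothed = counts.rolling(window=5, center=True).mean()
# The first and last two values are NaN, which is why the function above
# trims the ends of the smoothed arrays before plotting.
print(smoothed.tolist())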
Example #4
def analysis_term_gender():

    d = JournalsDataset()
    # Keep only articles that use the term "gender" at least 10 times, then
    # split them into male- and female-authored sub-corpora.
    d.filter(term_filter={'term': 'gender', 'min_count': 10})
    c1 = d.copy().filter(author_gender='male')
    c2 = d.copy().filter(author_gender='female')

    print(len(c1), len(c2), len(d))

    # Run the divergence analysis
    div = DivergenceAnalysis(d,
                             c1,
                             c2,
                             sub_corpus1_name='male',
                             sub_corpus2_name='female',
                             analysis_type='terms',
                             sort_by='frequency_score',
                             compare_to_overall_weights=False,
                             use_default_vocabulary=False)
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=20)
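DivergenceAnalysis and frequency_score are repo-specific; the sketch below only illustrates the general idea behind a frequency score (how strongly a term leans toward one sub-corpus), with made-up counts and an assumed formula:

from collections import Counter

# Made-up term counts for two sub-corpora.
male_counts = Counter({'army': 120, 'gender': 15, 'household': 40})
female_counts = Counter({'army': 30, 'gender': 85, 'household': 60})
male_total = sum(male_counts.values())
female_total = sum(female_counts.values())

# Assumed formula: relative frequency in the female corpus divided by the sum
# of relative frequencies in both corpora. 0.5 means a term is used equally
# often; values near 1 mean it leans toward the female-authored sub-corpus.
for term in sorted(male_counts.keys() | female_counts.keys()):
    m = male_counts[term] / male_total
    f = female_counts[term] / female_total
    print(term, round(f / (m + f), 2))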
Example #5
def draw_all_years():

    for start_year in range(1960, 2010, 10):
        for dataset_name in ['dissertations', 'journals']:

            if dataset_name == 'journals':
                dataset = JournalsDataset()
            else:
                dataset = DissertationDataset()
                if start_year < 1980:
                    continue

            dataset.filter(start_year=start_year, end_year=start_year + 9)

            draw_gender_frequency_scatterplot(
                dataset,
                figsize=12,
                show_labels=True,
                transparent_image=False,
                dynamic_y_coords=False,
                filename=f'topic_scatter_{dataset_name}_{start_year}-{start_year + 9}.png',
                show_plot=True,
                title=f'{dataset_name.capitalize()}, {start_year}s')
    # generate_average_overall_freq_scores()

    # Load the journals corpus and keep only articles that use the term
    # "gender" at least 10 times; the commented lines preserve alternative
    # datasets, filters, and sub-corpus splits that were tried.
    # d = DissertationDataset()
    d = JournalsDataset()

    # d.filter(start_year=1990)
    # d.topic_score_filter(31, min_topic_weight=0.1)
    # d.topic_score_filter(21, min_percentile_score=95)
    # d.filter(term_filter={'term': 'wom[ae]n', 'min_count': 10})
    d.filter(term_filter={'term': 'gender', 'min_count': 10})

    # d = d.topic_percentile_score_filter(topic_id=61, min_percentile_score=80)
    # d = d.filter(term_filter='childhood')

    # # Create two sub-datasets, one for female authors and one for male authors
    # c1 = d.copy().filter(term_filter={'term': 'gender', 'min_count': 10})
    # c2 = d.copy().filter(term_filter={'term': 'women', 'min_count': 10})
    #
    # c1 = d.copy().topic_score_filter(71, min_percentile_score=90)
    # c2 = d.copy().topic_score_filter(71, max_percentile_score=89)
    #
    # c1 = d.copy().filter(term_filter='gay')
    # c2 = d.copy().filter(term_filter='not_gay')
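The term_filter calls above belong to the repo's dataset classes; a standalone sketch of the general idea (keep documents in which a regex term appears at least min_count times), with a made-up corpus and a hypothetical helper:

import re
import pandas as pd

# Made-up corpus of three documents.
df = pd.DataFrame({'text': [
    'gender gender gender history of gender',
    'military history with no mention of the term',
    'women and gender, gender roles, gendered language',
]})

def term_filter(df, term, min_count):
    """Keep rows whose text matches the regex `term` at least `min_count` times."""
    counts = df['text'].apply(lambda t: len(re.findall(term, t)))
    return df[counts >= min_count]

print(term_filter(df, r'gender', min_count=3))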
Example #7
def get_co_use_data(topic_id=61):
    """
    For each decade, examine how articles that score highly on the given topic
    distribute their weight across all other topics (correlations, co-use,
    entropy, overlap, and a per-decade pie chart).
    """

    percentile_cutoff = 90

    d_all = JournalsDataset()
    d_all.topic_score_filter(topic_id=topic_id,
                             min_percentile_score=percentile_cutoff)

    output_data = {
        d_all.topics[i]['name']: {}
        for i in range(1, 91)
        if not d_all.topics[i]['name'].startswith('Noise')
    }

    print(len(output_data))

    for start_year in [1960, 1970, 1980, 1990, 2000]:

        print('\n\n', start_year)

        d = JournalsDataset()
        d.filter(start_year=start_year, end_year=start_year + 9)

        # Topic-by-topic co-use matrix: the Gram matrix of the (sparse)
        # document-topic weight matrix, so * is matrix multiplication.
        doc_topic_matrix = d.get_document_topic_matrix()
        co_use_matrix = doc_topic_matrix.T * doc_topic_matrix

        topic_selector = [f'topic.{i}' for i in range(1, 91)]
        topic_df = d.df[topic_selector]

        # Topics most correlated with the target topic in this decade
        # (idx == 0 is the topic's correlation with itself).
        for idx, (topic, correlation) in enumerate(
                topic_df.corr()[f'topic.{topic_id}'].sort_values(
                    ascending=False).items()):

            if idx == 0:
                continue

            if correlation > 0.03:
                tid = int(topic[6:])
                # print(tid, d.topics[tid]['name'], correlation)

        print("\nco use")
        co_use_vector = np.array(
            co_use_matrix[:, topic_id - 1].todense()).flatten()
        for co_use_id in co_use_vector.argsort()[::-1][:10]:
            co_use_topic_id = co_use_id + 1
            # print(co_use_id, d.topics[co_use_topic_id]['name'],
            #       co_use_matrix[topic_id - 1, co_use_id] / sum(co_use_vector))

        from scipy.stats import entropy
        # co_use_vector[topic_id - 1] = 0
        print("\nentropy", entropy(co_use_vector / sum(co_use_vector)))

        number_of_journals_in_decade = len(JournalsDataset().filter(
            start_year=start_year, end_year=start_year + 9))
        d_all_in_years = d_all.copy().filter(start_year=start_year,
                                             end_year=start_year + 9)
        intersections = {}
        for i in range(1, 91):
            overlapping_articles = len(
                d_all_in_years.copy().topic_score_filter(
                    topic_id=i, min_percentile_score=90))
            intersections[i] = overlapping_articles / number_of_journals_in_decade

        # print("\noverlap:")
        # for topic_id, overlap in sorted(intersections.items(), key=lambda x:x[1], reverse=True)[:10]:
        #     print(topic_id, d.topics[topic_id]['name'], overlap)

        print("\nslice")
        topic_weights = {}
        for topic_id in range(1, 91):
            if d.topics[topic_id]['name'].startswith('Noise'):
                continue
            topic_weights[topic_id] = d_all_in_years.df[
                f'topic.{topic_id}'].mean()

        for topic_id, slice in sorted(topic_weights.items(),
                                      key=lambda x: x[1],
                                      reverse=True):
            topic_name = d.topics[topic_id]['name']
            output_data[topic_name][start_year] = slice / sum(
                topic_weights.values())
            # print(topic_id, topic_name, slice / sum(topic_weights.values()))

        # Pie chart of the relative topic weights in this decade.
        s = sorted(topic_weights.items(), key=lambda x: x[1], reverse=True)
        labels = [i[0] for i in s]
        sizes = [i[1] / sum(topic_weights.values()) for i in s]
        fig1, ax1 = plt.subplots()
        ax1.pie(sizes,
                labels=labels,
                autopct='%1.1f%%',
                shadow=True,
                startangle=90)
        # Equal aspect ratio ensures that the pie is drawn as a circle.
        ax1.axis('equal')
        plt.title(f'{start_year}, {percentile_cutoff}th percentile')
        plt.show()

    sorted_output_data = sorted(output_data.items(),
                                key=lambda x: x[1][1990],
                                reverse=True)
    df = pd.DataFrame()
    df['labels'] = [i[0] for i in sorted_output_data]
    for year in [1960, 1970, 1980, 1990, 2000]:
        df[year] = [i[1][year] for i in sorted_output_data]

    # to_csv() without a path returns the CSV as a string; print it so the
    # table is not silently discarded (df also stays available in embed()).
    print(df.to_csv())

    embed()
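The co-use matrix in get_co_use_data is the Gram matrix of the document-topic weights, and the entropy call measures how evenly a topic's co-use spreads over the other topics; a minimal dense sketch with synthetic data (the shapes are assumptions):

import numpy as np
from scipy.stats import entropy

# Synthetic document-topic weights: 200 documents x 5 topics.
rng = np.random.default_rng(1)
doc_topic = rng.dirichlet(np.ones(5), size=200)

# Topic-by-topic co-use: how much weight pairs of topics share across documents.
co_use = doc_topic.T @ doc_topic

# Entropy of topic 0's normalized co-use vector: low values mean the co-use is
# concentrated on a few partner topics, high values mean it is spread out.
co_use_vector = co_use[:, 0]
print(entropy(co_use_vector / co_use_vector.sum()))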