Example #1
0
def overall_analyses():
    """Run gendered divergence analyses for both corpora.

    For the journal and dissertation datasets, and for both topics and
    terms, split the corpus by author gender and print the twelve most
    distinctive results (Dunning log-likelihood).
    """

    for corpus_name in ['journals', 'dissertations']:
        for kind in ['topics', 'terms']:

            print('\n\n\n', corpus_name, kind)

            # Pick the dataset matching the current corpus name.
            if corpus_name == 'journals':
                dataset = JournalsDataset()
            else:
                dataset = DissertationDataset()

            # Split into female-authored and male-authored sub-corpora.
            female_subset = dataset.copy().filter(author_gender='female')
            male_subset = dataset.copy().filter(author_gender='male')

            analysis = DivergenceAnalysis(dataset,
                                          female_subset,
                                          male_subset,
                                          sub_corpus1_name='women',
                                          sub_corpus2_name='men',
                                          analysis_type=kind,
                                          sort_by='dunning')
            analysis.run_divergence_analysis(
                number_of_terms_or_topics_to_print=12)
Example #2
0
def get_distinctive_terms_for_correlated_topics(topic_id,
                                                correlated_topics_list):
    """For each correlated topic, compare its overlap with the base topic.

    Restricts the journal corpus to documents in the top 5% for
    *topic_id*, then for every topic in *correlated_topics_list* prints
    the size of the double-top-5% overlap and runs a term-level
    divergence analysis of that overlap against the base corpus.
    """

    base = JournalsDataset()
    base.topic_score_filter(topic_id=topic_id, min_percentile_score=95)

    for correlated_id in correlated_topics_list:
        overlap = base.copy().topic_score_filter(topic_id=correlated_id,
                                                 min_percentile_score=95)
        print(correlated_id, len(overlap.df))

        # NOTE(review): the base corpus is passed both as the master corpus
        # and as the first sub-corpus — presumably intentional (overlap vs.
        # whole topic corpus); confirm against DivergenceAnalysis semantics.
        analysis = DivergenceAnalysis(base, base, overlap,
                                      analysis_type='terms')
        analysis.run_divergence_analysis()
def topics_and_descendants():
    """Topic divergence between 1980-1999 dissertations whose authors do
    or do not have academic descendants, sorted by frequency score."""

    dissertations = DissertationDataset()
    dissertations.filter(start_year=1980, end_year=1999)

    with_descendants = dissertations.copy().filter(has_descendants=True)
    without_descendants = dissertations.copy().filter(has_descendants=False)

    analysis = DivergenceAnalysis(dissertations,
                                  with_descendants,
                                  without_descendants,
                                  sub_corpus1_name='has descendants',
                                  sub_corpus2_name='no descendants',
                                  analysis_type='topics',
                                  sort_by='frequency_score')
    analysis.run_divergence_analysis(number_of_terms_or_topics_to_print=10)
Example #4
0
def topics_and_descendants_overall():
    """Topic divergence between 1990-2015 dissertations with female vs.
    male advisors, sorted by Dunning log-likelihood.

    NOTE(review): despite the function name, this filters on advisor
    gender, not on descendants — confirm the intended analysis.
    """

    dissertations = DissertationDataset()
    dissertations.filter(start_year=1990, end_year=2015)

    female_advised = dissertations.copy().filter(advisor_gender='female')
    male_advised = dissertations.copy().filter(advisor_gender='male')

    analysis = DivergenceAnalysis(dissertations,
                                  female_advised,
                                  male_advised,
                                  sub_corpus1_name='female advisor',
                                  sub_corpus2_name='male advisor',
                                  analysis_type='topics',
                                  sort_by='dunning')
    analysis.run_divergence_analysis(number_of_terms_or_topics_to_print=10)
Example #5
0
def analysis_term_gender():
    """Term divergence, by author gender, among journal articles that use
    the term 'gender' at least 10 times, sorted by frequency score."""

    corpus = JournalsDataset()
    corpus.filter(term_filter={'term': 'gender', 'min_count': 10})
    male_subset = corpus.copy().filter(author_gender='male')
    female_subset = corpus.copy().filter(author_gender='female')

    print(len(male_subset), len(female_subset), len(corpus))

    # Run the divergence analysis on the full document vocabulary
    # (use_default_vocabulary=False) without overall-weight comparison.
    analysis = DivergenceAnalysis(corpus,
                                  male_subset,
                                  female_subset,
                                  sub_corpus1_name='male',
                                  sub_corpus2_name='female',
                                  analysis_type='terms',
                                  sort_by='frequency_score',
                                  compare_to_overall_weights=False,
                                  use_default_vocabulary=False)
    analysis.run_divergence_analysis(number_of_terms_or_topics_to_print=20)
Example #6
0
def analysis_nazi_history():
    """Gendered topic divergence within Nazi Germany scholarship.

    Retains the journal articles scoring in the top 5% for topic 29
    (Nazi Germany), splits them by author gender, and prints the ten
    most distinctive topics (Dunning log-likelihood, compared against
    overall topic weights) plus five sample articles per topic.
    """
    # (removed unused local `dataset_name` and inlined the trivial
    # `analysis_type` / `compare_to_overall_weights` locals)

    d = JournalsDataset()

    # retain only the articles scoring in the top 5% for topic 29 (Nazi Germany)
    d.topic_score_filter(29, min_percentile_score=95)

    # Create two sub-datasets, one for female authors and one for male authors
    c1 = d.copy().filter(author_gender='female')
    c2 = d.copy().filter(author_gender='male')

    div = DivergenceAnalysis(
        d,
        c1,
        c2,
        sub_corpus1_name='women',
        sub_corpus2_name='men',
        analysis_type='topics',
        sort_by='dunning',
        compare_to_overall_weights=True)
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=10)

    div.print_articles_for_top_topics(top_terms_or_topics=10,
                                      articles_per_term_or_topic=5)
Example #7
0
def get_distinctive_terms_for_intersection(intersection, default_dtm,
                                           n_terms=10):
    """Return the terms most distinctive of *intersection*, ranked by
    Dunning log-likelihood against the full corpus.

    :param intersection: dataset holding the documents of interest
    :param default_dtm: document-term matrix of the full corpus, built
        with the default vocabulary
    :param n_terms: number of top-ranked terms to return (default 10,
        matching the original hard-coded behavior)
    :return: list of the n_terms highest-scoring vocabulary terms
    """

    vocabulary = DivergenceAnalysis.get_default_vocabulary()
    intersection_dtm, _ = intersection.get_vocabulary_and_document_term_matrix(
        vocabulary=vocabulary)

    s = StatisticalAnalysis(None,
                            intersection_dtm,
                            default_dtm,
                            vocabulary,
                            skip_tf_transform=True)
    dunnings, _ = s.dunning_log_likelihood()

    # Highest Dunning scores first.
    return [vocabulary[idx] for idx in np.argsort(dunnings)[::-1][:n_terms]]
Example #8
0
def analysis_term_freud():
    """Term divergence between pre-1990 and post-1989 journal articles
    scoring in the top 10% for topic 71, sorted by Dunning score."""

    corpus = JournalsDataset()
    corpus.topic_score_filter(topic_id=71, min_percentile_score=90)
    early_subset = corpus.copy().filter(end_year=1989)
    late_subset = corpus.copy().filter(start_year=1990)

    print(len(early_subset), len(late_subset), len(corpus))

    # Run the divergence analysis over the full document vocabulary.
    analysis = DivergenceAnalysis(corpus,
                                  early_subset,
                                  late_subset,
                                  sub_corpus1_name='early',
                                  sub_corpus2_name='late',
                                  analysis_type='terms',
                                  sort_by='dunning',
                                  compare_to_overall_weights=False,
                                  use_default_vocabulary=False)
    analysis.run_divergence_analysis(number_of_terms_or_topics_to_print=30)
    analysis.print_articles_for_top_topics(top_terms_or_topics=10,
                                           articles_per_term_or_topic=5)
Example #9
0
def analysis_sexuality_time_and_gender():
    """Term divergence between male- and female-authored journal articles,
    printing up to 500 terms and ten sample articles per top term.

    NOTE(review): the sub-corpus labels say 'men early' / 'women late',
    but only author gender is filtered (the Freud term filter is commented
    out) — confirm the labels are still accurate.
    """

    corpus = JournalsDataset()
    # corpus.filter(term_filter={'term': '[fF]reud', 'min_count': 2})
    male_subset = corpus.copy().filter(author_gender='male')
    female_subset = corpus.copy().filter(author_gender='female')

    print(len(male_subset), len(female_subset), len(corpus))

    # Run the divergence analysis over the default vocabulary.
    analysis = DivergenceAnalysis(corpus,
                                  male_subset,
                                  female_subset,
                                  sub_corpus1_name='men early',
                                  sub_corpus2_name='women late',
                                  analysis_type='terms',
                                  sort_by='dunning',
                                  compare_to_overall_weights=False,
                                  use_default_vocabulary=True)
    analysis.run_divergence_analysis(number_of_terms_or_topics_to_print=500)

    analysis.print_articles_for_top_topics(top_terms_or_topics=10,
                                           articles_per_term_or_topic=10)
Example #10
0
def analysis_military_history():
    """Gendered topic divergence within military history scholarship.

    Retains the journal articles scoring in the top 10% for topic 31
    (military history), splits them by author gender, and prints the ten
    most distinctive topics plus five sample articles per topic.

    The original iterated over single-element dataset/analysis lists with
    unreachable else-branches; the loops are flattened here with identical
    behavior (including the printed header).
    """

    dataset_name = 'journals'
    analysis_type = 'topics'

    print('\n\n\n', dataset_name, analysis_type)

    d = JournalsDataset()

    # Topic analyses are compared against overall topic weights.
    compare_to_overall_weights = True

    # retain only the articles scoring in the top 10% for topic 31 (military history)
    d.topic_score_filter(31, min_percentile_score=90)

    # Create two sub-datasets, one for female authors and one for male authors
    c1 = d.copy().filter(author_gender='female')
    c2 = d.copy().filter(author_gender='male')

    div = DivergenceAnalysis(
        d,
        c1,
        c2,
        sub_corpus1_name='women',
        sub_corpus2_name='men',
        analysis_type=analysis_type,
        sort_by='dunning',
        compare_to_overall_weights=compare_to_overall_weights)
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=10)

    div.print_articles_for_top_topics(top_terms_or_topics=10,
                                      articles_per_term_or_topic=5)
Example #11
0
def draw_gender_frequency_scatterplot(dataset: Dataset,
                                      figsize: int,
                                      filename: str = None,
                                      show_labels: bool = True,
                                      transparent_image: bool = False,
                                      title: str = None,
                                      dynamic_y_coords: bool = False,
                                      show_plot: bool = True,
                                      use_absolute_weights: bool = False):
    """Scatter topics by gender frequency score (x) vs. mean topic weight
    (y, log scale), with a stacked female/male publication-share strip
    underneath.

    dynamic_y_coords: default (False) uses 0.001 to 0.15. With dynamic y_coords, they are adjusted
    by local min/max

    :param dataset: dataset to plot; split internally by author gender
    :param figsize: width and height of the (square) figure in inches
    :param filename: if set, save under visualizations/gender_frequency_scatterplots/
    :param show_labels: annotate each point with its topic name
    :param transparent_image: save the figure with a transparent background
    :param title: optional bold title for the scatter axis
    :param dynamic_y_coords: see above
    :param show_plot: display the figure interactively at the end

    :param use_absolute_weights: normally, this chart uses frequencies for men and women as if
    they had published the same number of articles. With use_absolute_weights, it displays the
    absolute weight contributed by men and women (which skews the chart heavily towards men).

    :return: None
    """

    c1 = dataset.copy().filter(author_gender='female')
    c2 = dataset.copy().filter(author_gender='male')
    div = DivergenceAnalysis(dataset,
                             c1,
                             c2,
                             sub_corpus1_name='female',
                             sub_corpus2_name='male',
                             analysis_type='topics',
                             sort_by='dunning')
    divergence_df = div.run_divergence_analysis()

    fig = plt.figure(figsize=(figsize, figsize))
    # Two stacked panels: the scatter (tall) and the gender-share strip (short).
    gs = gridspec.GridSpec(nrows=2,
                           ncols=1,
                           figure=fig,
                           width_ratios=[1],
                           height_ratios=[1, 0.1],
                           wspace=0.2,
                           hspace=0.1)

    ax = fig.add_subplot(gs[0, 0])

    # Default-colored topics.
    x_coords = []
    y_coords = []

    # Topics highlighted as gender topics (ids 46, 61, 71).
    x_coords_gender = []
    y_coords_gender = []

    # Topics highlighted as female-associated (ids 32, 76).
    x_coords_female_assoc = []
    y_coords_female_assoc = []

    for topic_id in range(1, 91):

        # Skip noise topics.
        gen_approach = dataset.topics[topic_id]['gen_approach']
        if isinstance(gen_approach, str) and gen_approach.find('Noise') > -1:
            continue

        x = divergence_df[divergence_df['topic_id'] ==
                          topic_id]['frequency_score'].values[0]
        y = dataset.df[f'topic.{topic_id}'].mean()

        if use_absolute_weights:
            # Replace the balanced frequency score with the share of the
            # topic's absolute weight contributed by female authors.
            weight_female = divergence_df[divergence_df['topic_id'] ==
                                          topic_id]['f female'].values[0]
            weight_both = divergence_df[divergence_df['topic_id'] ==
                                        topic_id]['freq both'].values[0]
            x = weight_female * len(c1) / (weight_both * (len(c1) + len(c2)))

        # Skip very low-weight topics to reduce clutter.
        if y < 0.0012:
            continue

        if topic_id in {46, 61, 71}:
            x_coords_gender.append(x)
            y_coords_gender.append(y)
        elif topic_id in {32, 76}:
            x_coords_female_assoc.append(x)
            y_coords_female_assoc.append(y)
        else:
            x_coords.append(x)
            y_coords.append(y)

        topic_name = dataset.topics[topic_id]['name']

        if show_labels:
            ax.annotate(topic_name, (x, y + 0.000))

    ax.set_axisbelow(True)
    ax.grid(which='major', axis='both')

    ax.set_xlim(0, 1)

    if dynamic_y_coords:
        ax.set_ylim(min(y_coords) * 0.9, max(y_coords) * 1.1)
    else:
        ax.set_ylim(0.001, 0.15)

    ax.set_yscale('log')
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0, decimals=2))
    ax.set_yticks([0.001, 0.01, 0.1])
    # NOTE(review): matplotlib renamed grid's 'b' parameter to 'visible'
    # (removed in 3.8) — confirm the pinned matplotlib version accepts 'b'.
    ax.grid(b=True, which='minor', color='lightgray', linestyle='--')
    ax.tick_params(labelsize=15)

    prop_cycle = plt.rcParams['axes.prop_cycle']
    colors = prop_cycle.by_key()['color']

    ax.scatter(x_coords, y_coords, s=200, c=colors[7])
    ax.scatter(x_coords_gender, y_coords_gender, s=200, c=colors[1])
    ax.scatter(x_coords_female_assoc,
               y_coords_female_assoc,
               s=200,
               c='#fdbf6f')

    # Bottom strip: share of articles by author gender over time.
    male_data, female_data, years = get_number_of_male_and_female_authored_articles_by_year(
        start_year=dataset.start_year,
        end_year=dataset.end_year + 1,
        dataset_name=dataset.dataset_type)

    gender_ax = fig.add_subplot(gs[1, 0])
    gender_ax.stackplot(years,
                        female_data,
                        male_data,
                        colors=(colors[1], colors[0]))
    gender_ax.margins(0, 0)
    gender_ax.tick_params(labelsize=15)
    gender_ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0))
    gender_ax.set_yticks([np.round(female_data[0], 2)])

    # Tick every fifth year.
    x_ticks = sorted({y for y in years if y % 5 == 0})
    gender_ax.set_xticks(x_ticks)

    if title:
        ax.set_title(label=title, weight='bold', fontsize=24, pad=20)

    # Both branches of the original figsize check set 300; collapsed.
    dpi = 300

    if filename:
        plt.savefig(Path(BASE_PATH, 'visualizations',
                         'gender_frequency_scatterplots', filename),
                    dpi=dpi,
                    transparent=transparent_image)
    if show_plot:
        plt.show()
Example #12
0
def draw_gender_frequency_scatterplot(dataset: Dataset,
                                      figsize: int,
                                      filename: str = None,
                                      show_labels: bool = True,
                                      transparent_image: bool = False,
                                      dynamic_y_coords: bool = False):
    """Scatter topics by gender frequency score (x) vs. mean topic weight
    (y, log scale).

    NOTE(review): this module defines draw_gender_frequency_scatterplot
    twice; this later definition shadows the earlier one at import time —
    confirm which version is intended to be public.

    dynamic_y_coords: default (False) uses 0.002 to 0.1. With dynamic y_coords, they are adjusted
    by local min/max

    :param dataset: dataset to plot; split internally by author gender
    :param figsize: width and height of the (square) figure in inches
    :param filename: if set, save under visualizations/gender_frequency_scatterplots/
    :param show_labels: annotate each point with its topic name
    :param transparent_image: save the figure with a transparent background
    :param dynamic_y_coords: see above
    :return: None
    """

    c1 = dataset.copy().filter(author_gender='female')
    c2 = dataset.copy().filter(author_gender='male')
    div = DivergenceAnalysis(dataset, c1, c2, analysis_type='topics')
    divergence_df = div.run_divergence_analysis(print_results=False)

    df_sorted_by_years = dataset.df.sort_values(by='m_year')

    fig = plt.figure(figsize=(figsize, figsize))
    gs = gridspec.GridSpec(nrows=1,
                           ncols=1,
                           figure=fig,
                           width_ratios=[1],
                           height_ratios=[1],
                           wspace=0.2,
                           hspace=0.05)

    ax = fig.add_subplot(gs[0, 0])

    x_coords = []
    y_coords = []
    # Median publication year per topic; only consumed by the commented-out
    # colormapped scatter below, retained so it can be restored.
    color_codes = []

    for topic_id in range(1, 91):

        # Skip noise topics.
        gen_approach = dataset.topics[topic_id]['gen_approach']
        if isinstance(gen_approach, str) and gen_approach.find('Noise') > -1:
            continue

        x = divergence_df[divergence_df.topic_id ==
                          topic_id].iloc[0].frequency_score
        x_coords.append(x)
        y = dataset.df[f'topic.{topic_id}'].mean()
        y_coords.append(y)

        # Find the year by which half of the topic's total weight has
        # accumulated (a weight-based median year).
        topic_array_sorted_by_year = np.array(
            df_sorted_by_years[f'topic.{topic_id}'])
        total_topic_weight = topic_array_sorted_by_year.sum()
        topic_weight_so_far = 0
        median_year = None
        for idx, article_topic_weight in enumerate(topic_array_sorted_by_year):
            if topic_weight_so_far > total_topic_weight / 2:
                median_year = df_sorted_by_years.iloc[idx]['m_year']
                break
            topic_weight_so_far += article_topic_weight

        color_codes.append(median_year)

        topic_name = dataset.topics[topic_id]['name']

        if show_labels:
            ax.annotate(topic_name, (x, y + 0.000))

    # These limits are overridden below; kept from the original.
    ax.set_ylim(0, 1)
    ax.set_xlim(dataset.start_year + 2, dataset.end_year - 2)
    ax.set_axisbelow(True)
    ax.grid(which='major', axis='both')

    # (removed an unused matplotlib.colors.Normalize over color_codes; it
    # was only needed by the commented-out colormapped scatter below)

    ax.set_xlim(0, 1)

    if dynamic_y_coords:
        ax.set_ylim(min(y_coords) * 0.9, max(y_coords) * 1.1)
    else:
        ax.set_ylim(0.002, 0.1)

    ax.set_yscale('log')
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0))
    # NOTE(review): matplotlib renamed grid's 'b' parameter to 'visible'
    # (removed in 3.8) — confirm the pinned matplotlib version accepts 'b'.
    ax.grid(b=True, which='minor', color='lightgray', linestyle='--')
    ax.tick_params(labelsize=15)

    # ax.scatter(x_coords, y_coords, c=color_codes, s=200, cmap='jet',
    #            norm=plt.Normalize(min(color_codes), max(color_codes)))
    ax.scatter(x_coords,
               y_coords,
               facecolors='#1f77b4',
               edgecolors='black',
               s=200)

    # Both branches of the original figsize check set 300; collapsed.
    dpi = 300

    if filename:
        plt.savefig(Path(BASE_PATH, 'visualizations',
                         'gender_frequency_scatterplots', filename),
                    dpi=dpi,
                    transparent=transparent_image)
    plt.show()
Example #13
0
def get_edge_data():
    """Load (or build and cache) pairwise topic-edge data for all 90 topics.

    If the cached JSON exists, load it, print the 99th/95th percentile of
    the pairwise correlations, and return the (string-keyed) mapping.
    Otherwise compute, for every ordered pair of non-noise topics, the
    size, gender balance, correlation, and distinctive terms of the
    documents scoring in the top 10% for both topics, write the cache,
    and recurse so the returned structure always has JSON string keys.

    :return: dict topic1_id (str) -> topic2_id (str) -> edge-stats dict
    """

    topic_intersections_path = Path(BASE_PATH, 'data', 'react_data',
                                    'topic_edges.json')
    if topic_intersections_path.exists():
        # Cache hit: use the previously computed edge data.
        with open(topic_intersections_path, 'r') as infile:
            topic_intersections = json.load(infile)
    else:

        d = JournalsDataset()
        vocabulary = DivergenceAnalysis.get_default_vocabulary()
        default_dtm, _ = d.get_vocabulary_and_document_term_matrix(
            vocabulary=vocabulary)

        c1 = d.copy().filter(author_gender='male')
        c2 = d.copy().filter(author_gender='female')
        topic_dtm = d.get_document_topic_matrix()

        # Topic-topic correlations computed from the gendered
        # document-topic matrices.
        c1_topic_dtm = c1.get_document_topic_matrix()
        c2_topic_dtm = c2.get_document_topic_matrix()
        stat = StatisticalAnalysis(
            topic_dtm,
            c1_topic_dtm,
            c2_topic_dtm,
            vocabulary=[f'topic.{i}' for i in range(1, 91)])
        correlations = stat.correlation_coefficient(
            return_correlation_matrix=True)

        # adjust female counts by 4.72
        # (scales female counts as if both genders had published equally)
        female_adj = 1 / (len(c2) / (len(c1) + len(c2)))

        # NOTE(review): defaultdict() without a factory behaves like a
        # plain dict here.
        topic_intersections = defaultdict()

        for topic1_id in range(1, 91):
            topic_intersections[topic1_id] = defaultdict()

            print(topic1_id)

            for topic2_id in range(1, 91):
                print(topic1_id, topic2_id)

                # No self-edges.
                if topic1_id == topic2_id: continue

                # Skip pairs involving noise topics.
                topic1_name = d.topics[topic1_id]['name']
                topic2_name = d.topics[topic2_id]['name']
                if topic1_name.startswith('Noise') or topic2_name.startswith(
                        'Noise'):
                    continue

                # Documents in the top 10% for both topics.
                intersection = d.copy()\
                    .topic_score_filter(topic_id=topic1_id, min_percentile_score=90)\
                    .topic_score_filter(topic_id=topic2_id, min_percentile_score=90)

                count_male = len(intersection.df[
                    intersection.df.m_author_genders == 'male'])
                count_female = len(intersection.df[
                    intersection.df.m_author_genders == 'female'])
                adj_count_female = count_female * female_adj

                # correlations is 0-indexed while topic ids start at 1.
                topic_intersections[topic1_id][topic2_id] = {
                    'number':
                    len(intersection),
                    'gender_balance':
                    adj_count_female / (count_male + adj_count_female),
                    'correlation':
                    correlations[topic1_id - 1, topic2_id - 1],
                    'distinctive_terms':
                    get_distinctive_terms_for_intersection(
                        intersection, default_dtm)
                }

        with open(topic_intersections_path, 'w') as outfile:
            json.dump(topic_intersections, outfile, indent=4)

        # Recurse so the returned dict always has JSON string keys,
        # identical to the cache-hit path.
        return get_edge_data()

    # Collect every available pairwise correlation (keys are strings
    # because the data came from JSON; missing pairs raise KeyError).
    all_intersections = []
    for i in range(1, 91):
        for j in range(1, 91):
            try:
                all_intersections.append(
                    topic_intersections[str(i)][str(j)]['correlation'])
            except KeyError:
                pass

    percentile_99 = np.percentile(all_intersections, 99)
    percentile_95 = np.percentile(all_intersections, 95)

    print(f"99%: {percentile_99}. 95%: {percentile_95}")

    return topic_intersections
Example #14
0
def get_topic_data():
    """Load (or build and cache) per-topic summary data.

    On the first run, computes for every non-noise topic its name, Dunning
    score, frequency score, overall frequency, and three top-10 term lists
    (probability-, FREX-, and Dunning-ranked), caches the result as JSON,
    then recurses so the returned structure always has JSON string keys.

    :return: dict mapping topic id (str, from JSON) to a dict of topic stats
    """

    topic_data_path = Path(BASE_PATH, 'data', 'react_data', 'topic_data.json')

    if topic_data_path.exists():
        # Cache hit: return the previously computed topic data.
        with open(topic_data_path, 'r') as infile:
            return json.load(infile)

    else:

        topic_data = {}

        d = JournalsDataset()
        c1 = d.copy().filter(author_gender='male')
        c2 = d.copy().filter(author_gender='female')

        vocabulary = DivergenceAnalysis.get_default_vocabulary()
        default_dtm, _ = d.get_vocabulary_and_document_term_matrix(
            vocabulary=vocabulary)

        div = DivergenceAnalysis(master_corpus=d,
                                 sub_corpus1=c1,
                                 sub_corpus2=c2,
                                 analysis_type='topics')
        div_df = div.run_divergence_analysis(print_results=False)

        for topic_id in range(1, 91):

            print("\n\nTopic", topic_id)

            topic_name = d.topics[topic_id]['name']
            if topic_name.startswith('Noise'):
                continue

            div_row = div_df[div_df.topic_id == topic_id].iloc[0]

            # (removed a stray IPython embed() debugging call that dropped
            # into an interactive shell on every iteration)

            # Documents in the top 5% for this topic, used to compute its
            # most distinctive terms.
            c1t = d.copy().topic_score_filter(topic_id=topic_id,
                                              min_percentile_score=95)

            topic_data[topic_id] = {
                'name': topic_name,
                'topic_id': topic_id,
                'dunning': div_row.dunning,
                'freq_score': div_row.frequency_score,
                'frequency': div_row['freq both'],
                'terms_prob': d.topics[topic_id]['terms_prob'][:10],
                'terms_frex': d.topics[topic_id]['terms_frex'][:10],
                'terms_dunning':
                    get_distinctive_terms_for_intersection(c1t, default_dtm),
            }

        print("\n\n\nFinished with topics")

        with open(topic_data_path, 'w') as outfile:
            json.dump(topic_data, outfile, indent=4)

        # Recurse so callers always get the JSON (string-keyed) form.
        return get_topic_data()