def overall_analyses():
    """Run the gender divergence analysis (topics and terms) for both datasets."""
    for dataset_name in ['journals', 'dissertations']:
        for analysis_type in ['topics', 'terms']:
            print('\n\n\n', dataset_name, analysis_type)
            if dataset_name == 'journals':
                d = JournalsDataset()
            else:
                d = DissertationDataset()

            # Create two sub-datasets, one for female authors and one for male authors
            c1 = d.copy().filter(author_gender='female')
            c2 = d.copy().filter(author_gender='male')

            # Run the divergence analysis
            div = DivergenceAnalysis(d, c1, c2,
                                     sub_corpus1_name='women', sub_corpus2_name='men',
                                     analysis_type=analysis_type, sort_by='dunning')
            div.run_divergence_analysis(number_of_terms_or_topics_to_print=12)
def get_distinctive_terms_for_correlated_topics(topic_id, correlated_topics_list):
    """For each correlated topic, print the terms that distinguish the articles
    in the top 5% for topic_id alone from those also in the top 5% for the
    correlated topic."""
    d = JournalsDataset()
    d.topic_score_filter(topic_id=topic_id, min_percentile_score=95)
    for cor_topic_id in correlated_topics_list:
        c = d.copy().topic_score_filter(topic_id=cor_topic_id,
                                        min_percentile_score=95)
        print(cor_topic_id, len(c.df))
        div = DivergenceAnalysis(d, d, c, analysis_type='terms')
        div.run_divergence_analysis()
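# Example usage with topic ids that appear elsewhere in this module; the
# correlated topics list here is arbitrary and only for illustration:
#
#     get_distinctive_terms_for_correlated_topics(29, [31, 61])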
def topics_and_descendants():
    """Compare topic usage between 1980-1999 dissertation authors with and
    without academic descendants."""
    d = DissertationDataset()
    d.filter(start_year=1980, end_year=1999)
    c1 = d.copy().filter(has_descendants=True)
    c2 = d.copy().filter(has_descendants=False)
    div = DivergenceAnalysis(d, c1, c2,
                             sub_corpus1_name='has descendants',
                             sub_corpus2_name='no descendants',
                             analysis_type='topics', sort_by='frequency_score')
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=10)
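# A minimal sketch of the frequency score used for sorting above, assuming it
# is the share of a term's (or topic's) relative frequency that falls on
# sub-corpus 1: 0.5 means both sub-corpora use it equally often; values near
# 1.0 mean it is characteristic of sub-corpus 1. The repo's DivergenceAnalysis
# presumably computes this internally; the helper below is only illustrative.
def frequency_score_sketch(count_c1: int, total_c1: int,
                           count_c2: int, total_c2: int) -> float:
    freq_c1 = count_c1 / total_c1  # relative frequency in sub-corpus 1
    freq_c2 = count_c2 / total_c2  # relative frequency in sub-corpus 2
    return freq_c1 / (freq_c1 + freq_c2)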
def topics_and_advisor_gender():
    """Compare topic usage between 1990-2015 dissertations with female and
    male advisors."""
    d = DissertationDataset()
    d.filter(start_year=1990, end_year=2015)
    c1 = d.copy().filter(advisor_gender='female')
    c2 = d.copy().filter(advisor_gender='male')
    div = DivergenceAnalysis(d, c1, c2,
                             sub_corpus1_name='female advisor',
                             sub_corpus2_name='male advisor',
                             analysis_type='topics', sort_by='dunning')
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=10)
def analysis_term_gender():
    d = JournalsDataset()
    # retain only articles that use the term "gender" at least 10 times
    d.filter(term_filter={'term': 'gender', 'min_count': 10})
    c1 = d.copy().filter(author_gender='male')
    c2 = d.copy().filter(author_gender='female')
    print(len(c1), len(c2), len(d))

    # Run the divergence analysis
    div = DivergenceAnalysis(d, c1, c2,
                             sub_corpus1_name='male', sub_corpus2_name='female',
                             analysis_type='terms', sort_by='frequency_score',
                             compare_to_overall_weights=False,
                             use_default_vocabulary=False)
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=20)
def analysis_nazi_history():
    d = JournalsDataset()

    # retain only the articles scoring in the top 5% for topic 29 (Nazi Germany)
    d.topic_score_filter(29, min_percentile_score=95)

    # Create two sub-datasets, one for female authors and one for male authors
    c1 = d.copy().filter(author_gender='female')
    c2 = d.copy().filter(author_gender='male')

    div = DivergenceAnalysis(d, c1, c2,
                             sub_corpus1_name='women', sub_corpus2_name='men',
                             analysis_type='topics', sort_by='dunning',
                             compare_to_overall_weights=True)
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=10)
    div.print_articles_for_top_topics(top_terms_or_topics=10,
                                      articles_per_term_or_topic=5)
def get_distinctive_terms_for_intersection(intersection, default_dtm):
    """Return the 10 terms most distinctive (by Dunning log-likelihood) of the
    intersection corpus compared to the overall corpus."""
    vocabulary = DivergenceAnalysis.get_default_vocabulary()
    intersection_dtm, _ = intersection.get_vocabulary_and_document_term_matrix(
        vocabulary=vocabulary)
    s = StatisticalAnalysis(None, intersection_dtm, default_dtm, vocabulary,
                            skip_tf_transform=True)
    dunnings, _ = s.dunning_log_likelihood()

    # the highest Dunning scores mark the most overrepresented terms
    distinctive_terms = []
    for idx in np.argsort(dunnings)[::-1][:10]:
        distinctive_terms.append(vocabulary[idx])
    return distinctive_terms
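# A minimal, self-contained sketch of Dunning's log-likelihood (G2) for a
# single term, using the common two-cell corpus-linguistics approximation.
# The helper name and arguments are hypothetical; the repo's
# StatisticalAnalysis.dunning_log_likelihood presumably vectorizes this
# statistic over the whole vocabulary.
def dunning_g2_sketch(count_a: int, total_a: int,
                      count_b: int, total_b: int) -> float:
    """G2 for a term occurring count_a times in a corpus of total_a tokens
    and count_b times in a corpus of total_b tokens."""
    import math
    # expected counts if the term were spread evenly across both corpora
    expected_a = total_a * (count_a + count_b) / (total_a + total_b)
    expected_b = total_b * (count_a + count_b) / (total_a + total_b)
    g2 = 0.0
    if count_a > 0:
        g2 += count_a * math.log(count_a / expected_a)
    if count_b > 0:
        g2 += count_b * math.log(count_b / expected_b)
    return 2 * g2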
def analysis_term_freud():
    d = JournalsDataset()
    # retain only the articles scoring in the top 10% for topic 71
    d.topic_score_filter(topic_id=71, min_percentile_score=90)
    c1 = d.copy().filter(end_year=1989)
    c2 = d.copy().filter(start_year=1990)
    print(len(c1), len(c2), len(d))

    # Run the divergence analysis
    div = DivergenceAnalysis(d, c1, c2,
                             sub_corpus1_name='early', sub_corpus2_name='late',
                             analysis_type='terms', sort_by='dunning',
                             compare_to_overall_weights=False,
                             use_default_vocabulary=False)
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=30)
    div.print_articles_for_top_topics(top_terms_or_topics=10,
                                      articles_per_term_or_topic=5)
def analysis_sexuality_time_and_gender():
    d = JournalsDataset()
    # d.filter(term_filter={'term': '[fF]reud', 'min_count': 2})
    c1 = d.copy().filter(author_gender='male')
    c2 = d.copy().filter(author_gender='female')
    print(len(c1), len(c2), len(d))

    # Run the divergence analysis
    div = DivergenceAnalysis(d, c1, c2,
                             sub_corpus1_name='men', sub_corpus2_name='women',
                             analysis_type='terms', sort_by='dunning',
                             compare_to_overall_weights=False,
                             use_default_vocabulary=True)
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=500)
    div.print_articles_for_top_topics(top_terms_or_topics=10,
                                      articles_per_term_or_topic=10)
def analysis_military_history():
    d = JournalsDataset()

    # retain only the articles scoring in the top 10% for topic 31 (military history)
    d.topic_score_filter(31, min_percentile_score=90)

    # Create two sub-datasets, one for female authors and one for male authors
    c1 = d.copy().filter(author_gender='female')
    c2 = d.copy().filter(author_gender='male')

    div = DivergenceAnalysis(d, c1, c2,
                             sub_corpus1_name='women', sub_corpus2_name='men',
                             analysis_type='topics', sort_by='dunning',
                             compare_to_overall_weights=True)
    div.run_divergence_analysis(number_of_terms_or_topics_to_print=10)
    div.print_articles_for_top_topics(top_terms_or_topics=10,
                                      articles_per_term_or_topic=5)
def draw_gender_frequency_scatterplot(dataset: Dataset, figsize: int,
                                      filename: str = None,
                                      show_labels: bool = True,
                                      transparent_image: bool = False,
                                      title: str = None,
                                      dynamic_y_coords: bool = False,
                                      show_plot: bool = True,
                                      use_absolute_weights: bool = False):
    """
    Draw a scatterplot of topic gender frequency scores (x) against mean topic
    weights (y, log scale), with a stacked plot of female- and male-authored
    articles per year underneath.

    :param figsize: width and height of the (square) figure in inches
    :param filename: if set, store the figure under
        visualizations/gender_frequency_scatterplots/<filename>
    :param show_labels: annotate each point with its topic name
    :param transparent_image: save the figure with a transparent background
    :param title: optional bold title above the scatterplot
    :param dynamic_y_coords: by default (False), the y-axis runs from 0.001 to
        0.15. With dynamic y-coords, it is adjusted to the local min/max.
    :param show_plot: call plt.show() after drawing
    :param use_absolute_weights: normally, this chart uses frequencies for men
        and women as if they had published the same number of articles. With
        use_absolute_weights, it displays the absolute weight contributed by
        men and women (which skews the chart heavily towards men).
    """
    c1 = dataset.copy().filter(author_gender='female')
    c2 = dataset.copy().filter(author_gender='male')
    div = DivergenceAnalysis(dataset, c1, c2,
                             sub_corpus1_name='female', sub_corpus2_name='male',
                             analysis_type='topics', sort_by='dunning')
    divergence_df = div.run_divergence_analysis()

    fig = plt.figure(figsize=(figsize, figsize))
    gs = gridspec.GridSpec(nrows=2, ncols=1, figure=fig,
                           width_ratios=[1], height_ratios=[1, 0.1],
                           wspace=0.2, hspace=0.1)
    ax = fig.add_subplot(gs[0, 0])

    x_coords = []
    y_coords = []
    x_coords_gender = []
    y_coords_gender = []
    x_coords_female_assoc = []
    y_coords_female_assoc = []

    for topic_id in range(1, 91):
        # skip noise topics
        gen_approach = dataset.topics[topic_id]['gen_approach']
        if isinstance(gen_approach, str) and gen_approach.find('Noise') > -1:
            continue

        x = divergence_df[divergence_df['topic_id'] == topic_id]['frequency_score'].values[0]
        y = dataset.df[f'topic.{topic_id}'].mean()

        if use_absolute_weights:
            weight_female = divergence_df[divergence_df['topic_id'] == topic_id]['f female'].values[0]
            weight_both = divergence_df[divergence_df['topic_id'] == topic_id]['freq both'].values[0]
            x = weight_female * len(c1) / (weight_both * (len(c1) + len(c2)))

        # skip topics with negligible overall weight
        if y < 0.0012:
            continue

        # plot the gender topics and female-associated topics in their own colors
        if topic_id in {46, 61, 71}:
            x_coords_gender.append(x)
            y_coords_gender.append(y)
        elif topic_id in {32, 76}:
            x_coords_female_assoc.append(x)
            y_coords_female_assoc.append(y)
        else:
            x_coords.append(x)
            y_coords.append(y)

        if show_labels:
            topic_name = dataset.topics[topic_id]['name']
            ax.annotate(topic_name, (x, y))

    ax.set_axisbelow(True)
    ax.grid(which='major', axis='both')
    ax.set_xlim(0, 1)

    if dynamic_y_coords:
        ax.set_ylim(min(y_coords) * 0.9, max(y_coords) * 1.1)
    else:
        ax.set_ylim(0.001, 0.15)
    ax.set_yscale('log')
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0, decimals=2))
    ax.set_yticks([0.001, 0.01, 0.1])
    ax.grid(which='minor', color='lightgray', linestyle='--')
    ax.tick_params(labelsize=15)

    prop_cycle = plt.rcParams['axes.prop_cycle']
    colors = prop_cycle.by_key()['color']
    ax.scatter(x_coords, y_coords, s=200, c=colors[7])
    ax.scatter(x_coords_gender, y_coords_gender, s=200, c=colors[1])
    ax.scatter(x_coords_female_assoc, y_coords_female_assoc, s=200, c='#fdbf6f')

    male_data, female_data, years = get_number_of_male_and_female_authored_articles_by_year(
        start_year=dataset.start_year, end_year=dataset.end_year + 1,
        dataset_name=dataset.dataset_type)
    gender_ax = fig.add_subplot(gs[1, 0])
    gender_ax.stackplot(years, female_data, male_data,
                        colors=(colors[1], colors[0]))
    gender_ax.margins(0, 0)
    gender_ax.tick_params(labelsize=15)
    gender_ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0))
    gender_ax.set_yticks([np.round(female_data[0], 2)])
    x_ticks = sorted({y for y in years if y % 5 == 0})
    gender_ax.set_xticks(x_ticks)

    if title:
        ax.set_title(label=title, weight='bold', fontsize=24, pad=20)

    if filename:
        plt.savefig(Path(BASE_PATH, 'visualizations',
                         'gender_frequency_scatterplots', filename),
                    dpi=300, transparent=transparent_image)
    if show_plot:
        plt.show()
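# Example usage, assuming a JournalsDataset; the filename is an arbitrary
# illustration and lands in visualizations/gender_frequency_scatterplots/:
#
#     d = JournalsDataset()
#     draw_gender_frequency_scatterplot(d, figsize=12, filename='scatter.png',
#                                       title='Topic weights by author gender')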
def draw_gender_frequency_scatterplot_by_median_year(dataset: Dataset, figsize: int,
                                                     filename: str = None,
                                                     show_labels: bool = True,
                                                     transparent_image: bool = False,
                                                     dynamic_y_coords: bool = False):
    """
    Variant of draw_gender_frequency_scatterplot without the per-year gender
    stackplot. For each topic, it also computes the median year (the year by
    which half of the topic's total weight had been published), which can be
    used to colormap the points.

    :param figsize: width and height of the (square) figure in inches
    :param filename: if set, store the figure under
        visualizations/gender_frequency_scatterplots/<filename>
    :param show_labels: annotate each point with its topic name
    :param transparent_image: save the figure with a transparent background
    :param dynamic_y_coords: by default (False), the y-axis runs from 0.002 to
        0.1. With dynamic y-coords, it is adjusted to the local min/max.
    """
    c1 = dataset.copy().filter(author_gender='female')
    c2 = dataset.copy().filter(author_gender='male')
    div = DivergenceAnalysis(dataset, c1, c2, analysis_type='topics')
    divergence_df = div.run_divergence_analysis(print_results=False)

    df_sorted_by_years = dataset.df.sort_values(by='m_year')

    fig = plt.figure(figsize=(figsize, figsize))
    gs = gridspec.GridSpec(nrows=1, ncols=1, figure=fig,
                           width_ratios=[1], height_ratios=[1],
                           wspace=0.2, hspace=0.05)
    ax = fig.add_subplot(gs[0, 0])

    x_coords = []
    y_coords = []
    color_codes = []

    for topic_id in range(1, 91):
        # skip noise topics
        gen_approach = dataset.topics[topic_id]['gen_approach']
        if isinstance(gen_approach, str) and gen_approach.find('Noise') > -1:
            continue

        x = divergence_df[divergence_df.topic_id == topic_id].iloc[0].frequency_score
        x_coords.append(x)
        y = dataset.df[f'topic.{topic_id}'].mean()
        y_coords.append(y)

        # find the median year: the year by which half of the topic's
        # total weight had been published
        topic_array_sorted_by_year = np.array(df_sorted_by_years[f'topic.{topic_id}'])
        total_topic_weight = topic_array_sorted_by_year.sum()
        topic_weight_so_far = 0
        median_year = None
        for idx, article_topic_weight in enumerate(topic_array_sorted_by_year):
            if topic_weight_so_far > total_topic_weight / 2:
                median_year = df_sorted_by_years.iloc[idx]['m_year']
                break
            topic_weight_so_far += article_topic_weight
        color_codes.append(median_year)

        if show_labels:
            topic_name = dataset.topics[topic_id]['name']
            ax.annotate(topic_name, (x, y))

    ax.set_axisbelow(True)
    ax.grid(which='major', axis='both')
    ax.set_xlim(0, 1)

    if dynamic_y_coords:
        ax.set_ylim(min(y_coords) * 0.9, max(y_coords) * 1.1)
    else:
        ax.set_ylim(0.002, 0.1)
    ax.set_yscale('log')
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1.0))
    ax.grid(which='minor', color='lightgray', linestyle='--')
    ax.tick_params(labelsize=15)

    # to colormap the points by median year instead of the flat color, use:
    # normalized_cmap = matplotlib.colors.Normalize(vmin=min(color_codes),
    #                                               vmax=max(color_codes))
    # ax.scatter(x_coords, y_coords, c=color_codes, s=200, cmap='jet',
    #            norm=normalized_cmap)
    ax.scatter(x_coords, y_coords, facecolors='#1f77b4', edgecolors='black', s=200)

    if filename:
        plt.savefig(Path(BASE_PATH, 'visualizations',
                         'gender_frequency_scatterplots', filename),
                    dpi=300, transparent=transparent_image)
    plt.show()
def get_edge_data():
    """Load (or compute and cache) intersection data for every pair of
    non-noise topics: article count, gender balance, topic correlation, and
    the terms most distinctive of the intersection."""
    topic_intersections_path = Path(BASE_PATH, 'data', 'react_data', 'topic_edges.json')
    if topic_intersections_path.exists():
        with open(topic_intersections_path, 'r') as infile:
            topic_intersections = json.load(infile)
    else:
        d = JournalsDataset()
        vocabulary = DivergenceAnalysis.get_default_vocabulary()
        default_dtm, _ = d.get_vocabulary_and_document_term_matrix(
            vocabulary=vocabulary)
        c1 = d.copy().filter(author_gender='male')
        c2 = d.copy().filter(author_gender='female')

        topic_dtm = d.get_document_topic_matrix()
        c1_topic_dtm = c1.get_document_topic_matrix()
        c2_topic_dtm = c2.get_document_topic_matrix()
        stat = StatisticalAnalysis(topic_dtm, c1_topic_dtm, c2_topic_dtm,
                                   vocabulary=[f'topic.{i}' for i in range(1, 91)])
        correlations = stat.correlation_coefficient(return_correlation_matrix=True)

        # weight female counts by the inverse of the female share of all
        # articles (about 4.72 in this dataset)
        female_adj = 1 / (len(c2) / (len(c1) + len(c2)))

        topic_intersections = {}
        for topic1_id in range(1, 91):
            topic_intersections[topic1_id] = {}
            for topic2_id in range(1, 91):
                print(topic1_id, topic2_id)
                if topic1_id == topic2_id:
                    continue
                topic1_name = d.topics[topic1_id]['name']
                topic2_name = d.topics[topic2_id]['name']
                if topic1_name.startswith('Noise') or topic2_name.startswith('Noise'):
                    continue

                # articles scoring in the top 10% for both topics
                intersection = d.copy()\
                    .topic_score_filter(topic_id=topic1_id, min_percentile_score=90)\
                    .topic_score_filter(topic_id=topic2_id, min_percentile_score=90)
                count_male = len(intersection.df[
                    intersection.df.m_author_genders == 'male'])
                count_female = len(intersection.df[
                    intersection.df.m_author_genders == 'female'])
                adj_count_female = count_female * female_adj

                topic_intersections[topic1_id][topic2_id] = {
                    'number': len(intersection),
                    'gender_balance': adj_count_female / (count_male + adj_count_female),
                    'correlation': correlations[topic1_id - 1, topic2_id - 1],
                    'distinctive_terms': get_distinctive_terms_for_intersection(
                        intersection, default_dtm)
                }

        with open(topic_intersections_path, 'w') as outfile:
            json.dump(topic_intersections, outfile, indent=4)
        # reload from disk so the keys are JSON strings, as in the cached case
        return get_edge_data()

    all_intersections = []
    for i in range(1, 91):
        for j in range(1, 91):
            try:
                all_intersections.append(
                    topic_intersections[str(i)][str(j)]['correlation'])
            except KeyError:
                pass
    percentile_99 = np.percentile(all_intersections, 99)
    percentile_95 = np.percentile(all_intersections, 95)
    print(f"99%: {percentile_99}. 95%: {percentile_95}")

    return topic_intersections
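# Example usage. The JSON cache under data/react_data/topic_edges.json is
# built on the first call and reused afterwards; topic ids come back as
# string keys after the JSON round-trip:
#
#     edges = get_edge_data()
#     print(edges['1']['2']['gender_balance'], edges['1']['2']['correlation'])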
def get_topic_data():
    """Load (or compute and cache) per-topic summary data for the React
    visualization: gendered divergence statistics and the three term lists
    (prob, frex, dunning) for each non-noise topic."""
    topic_data_path = Path(BASE_PATH, 'data', 'react_data', 'topic_data.json')
    if topic_data_path.exists():
        with open(topic_data_path, 'r') as infile:
            return json.load(infile)
    else:
        topic_data = {}
        d = JournalsDataset()
        c1 = d.copy().filter(author_gender='male')
        c2 = d.copy().filter(author_gender='female')

        vocabulary = DivergenceAnalysis.get_default_vocabulary()
        default_dtm, _ = d.get_vocabulary_and_document_term_matrix(
            vocabulary=vocabulary)

        div = DivergenceAnalysis(master_corpus=d, sub_corpus1=c1, sub_corpus2=c2,
                                 analysis_type='topics')
        div_df = div.run_divergence_analysis(print_results=False)

        for topic_id in range(1, 91):
            print("\n\nTopic", topic_id)
            topic_name = d.topics[topic_id]['name']
            if topic_name.startswith('Noise'):
                continue
            div_row = div_df[div_df.topic_id == topic_id].iloc[0]

            # articles scoring in the top 5% for this topic
            c1t = d.copy().topic_score_filter(topic_id=topic_id,
                                              min_percentile_score=95)
            topic_data[topic_id] = {
                'name': topic_name,
                'topic_id': topic_id,
                'dunning': div_row.dunning,
                'freq_score': div_row.frequency_score,
                'frequency': div_row['freq both'],
                'terms_prob': d.topics[topic_id]['terms_prob'][:10],
                'terms_frex': d.topics[topic_id]['terms_frex'][:10],
                'terms_dunning': get_distinctive_terms_for_intersection(c1t, default_dtm)
            }

        print("\n\n\nFinished with topics")
        with open(topic_data_path, 'w') as outfile:
            json.dump(topic_data, outfile, indent=4)
        return get_topic_data()
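# Example usage. As with get_edge_data, the cache is built on the first call
# and topic ids come back as string keys:
#
#     topic_data = get_topic_data()
#     for topic in topic_data.values():
#         print(topic['name'], round(topic['freq_score'], 2))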