def __init__(self,
             term_doc_matrix,
             left_categories,
             right_categories,
             top_categories,
             bottom_categories,
             left_category_name=None,
             right_category_name=None,
             top_category_name=None,
             bottom_category_name=None,
             x_scorer=RankDifference(),
             y_scorer=RankDifference(),
             term_ranker=AbsoluteFrequencyRanker,
             labels=None):
    '''
    Validate the four category lists, store the configuration on the
    instance, and build the axes and lexicons.

    Parameters
    ----------
    term_doc_matrix : object exposing get_categories()
        Stored as term_doc_matrix_.
    left_categories, right_categories, top_categories, bottom_categories : list
        Each must be exactly a non-empty list whose members all appear in
        term_doc_matrix.get_categories().
    left_category_name, right_category_name, top_category_name,
    bottom_category_name : optional
        Display name for each side.  When None, the first element of the
        corresponding category list is used.
    x_scorer, y_scorer : scorer instances
        Stored as x_scorer_ and y_scorer_.
    term_ranker : ranker
        Stored as term_ranker_ exactly as passed.
    labels : optional
        Stored as _labels.
    '''

    def name_or_first(name, categories):
        # An explicit name wins; otherwise fall back to the first category.
        return categories[0] if name is None else name

    for category_list in (left_categories, right_categories,
                          top_categories, bottom_categories):
        assert type(category_list) == list
        assert set(category_list).issubset(set(term_doc_matrix.get_categories()))
        assert len(category_list) > 0

    self.term_doc_matrix_ = term_doc_matrix
    self._labels = labels

    self.left_category_name_ = name_or_first(left_category_name, left_categories)
    self.right_category_name_ = name_or_first(right_category_name, right_categories)
    self.top_category_name_ = name_or_first(top_category_name, top_categories)
    self.bottom_category_name_ = name_or_first(bottom_category_name, bottom_categories)

    self.x_scorer_ = x_scorer
    self.y_scorer_ = y_scorer
    self.term_ranker_ = term_ranker

    self.left_categories_ = left_categories
    self.right_categories_ = right_categories
    self.top_categories_ = top_categories
    self.bottom_categories_ = bottom_categories

    # Everything above must be in place before the derived structures are
    # computed.
    self.axes = self._build_axes()
    self.lexicons = self._build_lexicons()
def _build_square(self, term_doc_matrix, term_ranker, labels, scorer):
    '''
    Store the inputs on the instance and compute the axes and lexicons.

    Parameters
    ----------
    term_doc_matrix : object
        Stored as term_doc_matrix_ and handed to term_ranker.
    term_ranker : callable
        Called with term_doc_matrix; the result is stored as term_ranker.
    labels : object
        Stored as _labels after the axes and lexicons are built.
    scorer : object or None
        When None, a fresh RankDifference() becomes self.scorer.
    '''
    self.term_doc_matrix_ = term_doc_matrix
    self.term_ranker = term_ranker(term_doc_matrix)
    if scorer is None:
        self.scorer = RankDifference()
    else:
        self.scorer = scorer
    # The caller's (possibly None) scorer is forwarded unchanged.
    self.axes = self._build_axes(scorer)
    self.lexicons = self._build_lexicons()
    self._labels = labels
def get_scores(self, corpus):
    '''
    Score the corpus' terms against the background frequencies.

    Parameters
    ----------
    corpus
        Passed to self.term_ranker to obtain per-term counts.

    Returns
    -------
    float, pd.Series
    float: point on x-axis at even characteristicness
    pd.Series: term -> value between 0 and 1, sorted by score in a
        descending manner
    '''
    corpus_counts = self.term_ranker(corpus).get_ranks().sum(axis=1)
    background_counts = (
        self.background_frequencies.get_background_frequency_df()['background']
    )
    # Only terms present in both the corpus and the background survive.
    bg = pd.DataFrame({'corpus': corpus_counts,
                       'bg': background_counts}).dropna()
    scores = RankDifference().get_scores(bg['corpus'], bg['bg']).sort_values()

    lowest = scores.min()
    highest = scores.max()
    if lowest < 0 and highest > 0:
        # Relative position of a raw score of zero within [lowest, highest].
        zero_marker = -lowest / (highest - lowest)
    elif lowest > 0:
        zero_marker = 0
    else:
        # NOTE(review): a minimum of exactly 0 also lands here -- confirm
        # that this boundary is intended.
        zero_marker = 1

    bg['score'] = scale(scores)
    return zero_marker, bg.sort_values(by='score', ascending=False)['score']
def _get_default_scores(self, category, other_categories, df):
    '''
    Score terms of one category against the summed counts of the others.

    Parameters
    ----------
    category : str
        Category whose '<category> freq' column is the target.
    other_categories : iterable of str
        Categories whose '<name> freq' columns are summed row-wise.
    df : pd.DataFrame
        Frame holding the ' freq' columns.

    Returns
    -------
    Whatever RankDifference().get_scores returns for the two count vectors.
    '''
    other_columns = [name + ' freq' for name in other_categories]
    in_category_counts = df[category + ' freq']
    out_of_category_counts = df[other_columns].sum(axis=1)
    return RankDifference().get_scores(in_category_counts,
                                       out_of_category_counts)
def __init__(self,
             corpus,
             scorer=RankDifference(),
             ranker=AbsoluteFrequencyRanker,
             use_non_text_features=False,
             color_palette=QUALITATIVE_COLORS):
    '''
    Assigns scores to colors for categories

    :param corpus: TermDocMatrix
    :param scorer: scorer
    :param ranker: ranker class, instantiated with the corpus
    :param use_non_text_features: bool, if True use_non_text_features() is
        called on the ranker before ranks are read
    :param color_palette: list of colors [[red, green, blue], ...]
    '''
    self.corpus = corpus
    self.scorer = scorer
    self.color_palette = color_palette

    my_ranker = ranker(corpus)
    if use_non_text_features:
        my_ranker.use_non_text_features()
    tdf = my_ranker.get_ranks()
    tdf_sum = tdf.sum(axis=1)

    # Each column is scored against the sum of all the other columns.  The
    # last five characters of the column name are dropped (presumably the
    # ' freq' suffix) to form the key.
    term_scores = {
        column[:-5]: pd.Series(
            self.scorer.get_scores(tdf[column], tdf_sum - tdf[column]),
            index=tdf.index)
        for column in tdf.columns
    }
    # For every term, the category with the highest score.
    self.term_cat = pd.DataFrame(term_scores).idxmax(axis=1)

    # Categories ordered by how many rows carry them, most frequent first;
    # palette entries are handed out in that order.
    row_categories = pd.Series(corpus.get_category_names_by_row())
    ranked_list_categories = row_categories.value_counts().index
    palette_slice = self.color_palette[:len(ranked_list_categories)]
    self.category_colors = pd.Series(palette_slice,
                                     index=ranked_list_categories)
def get_topics_from_terms(self, terms=None, num_terms_per_topic=10, scorer=RankDifference()):
    '''
    Parameters
    ----------
    terms : list or None
        If terms is list, make these the seed terms for the topics
        If none, use the first 30 terms in get_scaled_f_scores_vs_background
    num_terms_per_topic : int, default 10
        Use this many terms per topic
    scorer : TermScorer
        Implements get_scores, default is RankDifference, which tends to work best

    Returns
    -------
    dict: {term: [term1, ...], ...}
    '''
    if terms is None:
        terms = self.corpus.get_scaled_f_scores_vs_background().index[:30]

    topic_model = {}
    for seed_term in terms:
        seed_idx = self.termidxstore.getidxstrict(seed_term)
        # Boolean mask over the rows of sentX that contain the seed term.
        has_seed = self.sentX[:, seed_idx].astype(bool).todense().A1
        # Per-term number of rows containing the term, with and without
        # the seed term present.
        with_seed_counts = self.sentX[has_seed, :].astype(bool).sum(axis=0).A1
        without_seed_counts = self.sentX[~has_seed, :].astype(bool).sum(axis=0).A1
        scores = scorer.get_scores(with_seed_counts, without_seed_counts)
        # Highest scoring term indices first.
        top_indices = np.argsort(-scores)[:num_terms_per_topic]
        topic_model[seed_term] = [self.termidxstore.getval(idx)
                                  for idx in top_indices]
    return topic_model
def get_category_association(self, ranker=None, scorer=None):
    '''
    Rank every term within every category by its association score.

    Parameters
    ----------
    ranker : optional
        Object exposing get_ranks (and use_non_text_features when
        self.use_metadata is set).  Defaults to
        AbsoluteFrequencyRanker(self.corpus).
    scorer : optional
        Object exposing get_scores(category_counts, other_counts) and
        returning a pd.Series.  Defaults to RankDifference().

    Returns
    -------
    pandas GroupBy
        Rows with columns 'Category', 'Term', 'Rank' and 'Score', grouped
        by 'Rank' (0 is the highest scoring term of a category).
    '''
    if scorer is None:
        scorer = RankDifference()
    if ranker is None:
        ranker = AbsoluteFrequencyRanker(self.corpus)
    if self.use_metadata:
        ranker = ranker.use_non_text_features()
    term_freq_df = ranker.get_ranks('')
    global_freq = term_freq_df.sum(axis=1)
    data = []
    for cat in self.corpus.get_categories():
        cat_freq = term_freq_df[cat]
        # Category counts versus the counts in all other categories.
        ranked_scores = scorer.get_scores(
            cat_freq, global_freq - cat_freq
        ).sort_values(ascending=False)
        # Series.iteritems() was removed in pandas 2.0; items() is the
        # equivalent available in every pandas version.
        for term_rank, (term, score) in enumerate(ranked_scores.items()):
            data.append({'Category': cat,
                         'Term': term,
                         'Rank': term_rank,
                         'Score': score})
    return pd.DataFrame(data).groupby('Rank')
def __init__(self,
             term_doc_matrix,
             category_to_timestep_func,
             is_gap_between_sequences_func,
             timesteps_to_lag=4,
             num_top_terms_each_timestep=10,
             num_terms_to_include=40,
             starting_time_step=None,
             term_ranker=AbsoluteFrequencyRanker,
             term_scorer=RankDifference()):
    '''
    Store the configuration and pick the starting time step.

    Parameters
    ----------
    term_doc_matrix : TermDocMatrix
    category_to_timestep_func : lambda
    is_gap_between_sequences_func : lambda
    timesteps_to_lag : int
    num_top_terms_each_timestep : int
    num_terms_to_include : int
    starting_time_step : object
        When None, taken from the sorted category list at position
        timesteps_to_lag + 1.
    term_ranker : TermRanker
    term_scorer : TermScorer

    Raises
    ------
    Exception
        If the corpus has no more than timesteps_to_lag categories.
    '''
    self.corpus = term_doc_matrix
    self.category_to_timestep_func = category_to_timestep_func
    self.is_gap_between_sequences_func = is_gap_between_sequences_func
    self.timesteps_to_lag = timesteps_to_lag
    self.num_top_terms_each_timestep = num_top_terms_each_timestep
    self.num_terms_to_include = num_terms_to_include
    self.term_ranker = term_ranker
    self.term_scorer = term_scorer

    categories = sorted(self.corpus.get_categories())
    if len(categories) <= timesteps_to_lag:
        raise Exception(
            "The number of categories in the term doc matrix is <= "
            + str(timesteps_to_lag))

    # NOTE(review): with exactly timesteps_to_lag + 1 categories the index
    # below is out of range -- confirm whether the guard above or the
    # index is the intended bound.
    self.starting_time_step = (categories[timesteps_to_lag + 1]
                               if starting_time_step is None
                               else starting_time_step)
def produce_pairplot(corpus,
                     asian_mode=False,
                     category_width_in_pixels=500,
                     category_height_in_pixels=700,
                     term_width_in_pixels=500,
                     term_height_in_pixels=700,
                     terms_to_show=3000,
                     scaler=scale_neg_1_to_1_with_zero_mean,
                     term_ranker=AbsoluteFrequencyRanker,
                     use_metadata=False,
                     category_projector=CategoryProjector(),
                     category_projection=None,
                     topic_model_term_lists=None,
                     topic_model_preview_size=10,
                     metadata_descriptions=None,
                     initial_category=None,
                     x_dim=0,
                     y_dim=1,
                     show_halo=True,
                     num_terms_in_halo=5,
                     category_color_func='(function(x) {return "#5555FF"})',
                     protocol='https',
                     d3_url_struct=D3URLs(),
                     **kwargs):
    '''
    Build the HTML for a pair of linked plots: a scatterplot of projected
    categories and a term scatterplot contrasting one category with the rest.

    Parameters
    ----------
    corpus : corpus object
        Source of categories, terms and term frequencies.
    asian_mode : bool
        Forwarded to both ScatterplotStructure instances.
    category_width_in_pixels, category_height_in_pixels : int
        Size of the category plot.
    term_width_in_pixels, term_height_in_pixels : int
        Size of the term plot.
    terms_to_show : int
        Passed to AssociationCompactor; terms it drops are hidden in the
        term plot.
    scaler : callable
        Applied to the projected x and y coordinates of the categories.
    term_ranker : TermRanker class
        Forwarded to both ScatterChartExplorer instances.
    use_metadata : bool
        Use metadata rather than term features for the projection, the
        frequency table and the term plot.
    category_projector : projector
        Used to compute category_projection when none is given.
    category_projection : optional
        Precomputed projection; skips the projector when provided.
    topic_model_term_lists, metadata_descriptions : optional
        Injected into the term chart explorer when not None.
    topic_model_preview_size : int
        Forwarded to the term ScatterplotStructure.
    initial_category : optional
        Category shown first; defaults to the first corpus category.
    x_dim, y_dim : int
        Projection dimensions used for the two axes.
    show_halo : bool
    num_terms_in_halo : int
    category_color_func : str
        JavaScript function source used as the category plot color_func.
    protocol : str
    d3_url_struct : D3URLs
    kwargs
        Forwarded to the term chart explorer's to_dict.

    Returns
    -------
    The result of PairPlotFromScatterplotStructure(...).to_html()
    '''
    if category_projection is None:
        if use_metadata:
            category_projection = category_projector.project_with_metadata(
                corpus, x_dim=x_dim, y_dim=y_dim)
        else:
            category_projection = category_projector.project(
                corpus, x_dim=x_dim, y_dim=y_dim)

    if initial_category is None:
        initial_category = corpus.get_categories()[0]

    # --- category plot -----------------------------------------------------
    category_scatter_chart_explorer = ScatterChartExplorer(
        category_projection.category_corpus,
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        filter_unigrams=False,
        jitter=0,
        max_terms=None,
        term_ranker=term_ranker,
        use_non_text_features=True,
        term_significance=None,
        terms_to_include=None)
    proj_df = category_projection.get_pandas_projection()
    category_scatter_chart_explorer.inject_coordinates(
        x_coords=scaler(proj_df['x']),
        y_coords=scaler(proj_df['y']),
        original_x=proj_df['x'],
        original_y=proj_df['y'])
    category_scatter_chart_data = category_scatter_chart_explorer.to_dict(
        category=initial_category,
        max_docs_per_category=0,
    )
    category_tooltip_func = '(function(d) {return d.term})'
    category_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(category_scatter_chart_data),
        width_in_pixels=category_width_in_pixels,
        height_in_pixels=category_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=True,
        show_top_terms=False,
        show_characteristic=False,
        get_tooltip_content=category_tooltip_func,
        color_func=category_color_func,
        show_axes=False,
        unified_context=True,
        show_category_headings=False,
        show_cross_axes=True,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        y_label='',
        x_label='',
        full_data='getCategoryDataAndInfo()',
        alternative_term_func=
        '(function (termInfo) {termPlotInterface.drawCategoryAssociation(termInfo.i); return false;})',
        div_name='cat-plot')

    # --- term plot ---------------------------------------------------------
    compacted_corpus = AssociationCompactor(terms_to_show).compact(corpus)
    terms_to_hide = set(corpus.get_terms()) - set(compacted_corpus.get_terms())
    print('num terms to hide', len(terms_to_hide))
    print('num terms to show', compacted_corpus.get_num_terms())
    term_scatter_chart_explorer = ScatterChartExplorer(
        corpus,
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        term_ranker=term_ranker,
        use_non_text_features=use_metadata,
        score_transform=stretch_0_to_1,
    ).hide_terms(terms_to_hide)
    if topic_model_term_lists is not None:
        term_scatter_chart_explorer.inject_metadata_term_lists(
            topic_model_term_lists)
    if metadata_descriptions is not None:
        term_scatter_chart_explorer.inject_metadata_descriptions(
            metadata_descriptions)

    if use_metadata:
        tdf = corpus.get_metadata_freq_df('')
    else:
        tdf = corpus.get_term_freq_df('')
    # Initial category versus the sum of every other category.
    scores = RankDifference().get_scores(
        tdf[initial_category],
        tdf[[c for c in corpus.get_categories()
             if c != initial_category]].sum(axis=1))
    term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
        category=initial_category,
        scores=scores,
        include_term_category_counts=True,
        transform=dense_rank,
        **kwargs)
    term_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(term_scatter_chart_data),
        width_in_pixels=term_width_in_pixels,
        height_in_pixels=term_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=use_metadata,
        show_top_terms=True,
        show_characteristic=False,
        get_tooltip_content=None,
        show_category_headings=False,
        use_full_doc=use_metadata,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        topic_model_preview_size=topic_model_preview_size,
        y_label=initial_category,
        x_label='Not ' + initial_category,
        full_data='getTermDataAndInfo()',
        div_name='d3-div-1',
    )

    return PairPlotFromScatterplotStructure(
        category_scatterplot_structure,
        term_scatterplot_structure,
        category_projection,
        category_width_in_pixels,
        category_height_in_pixels,
        num_terms=num_terms_in_halo,
        show_halo=show_halo,
        d3_url_struct=d3_url_struct,
        x_dim=x_dim,
        y_dim=y_dim,
        protocol=protocol).to_html()
def produce_pairplot(
        corpus,
        asian_mode=False,
        category_width_in_pixels=500,
        category_height_in_pixels=700,
        term_width_in_pixels=500,
        term_height_in_pixels=700,
        terms_to_show=3000,
        scaler=scale_neg_1_to_1_with_zero_mean,
        term_ranker=AbsoluteFrequencyRanker,
        use_metadata=False,
        category_projector=CategoryProjector(),
        category_projection=None,
        topic_model_term_lists=None,
        topic_model_preview_size=10,
        metadata_descriptions=None,
        initial_category=None,
        x_dim=0,
        y_dim=1,
        show_halo=True,
        num_terms_in_halo=5,
        category_color_func='(function(x) {return "#5555FF"})',
        protocol='https',
        d3_url_struct=D3URLs(),
        category_focused=False,
        verbose=False,
        use_full_doc=True,
        default_to_term_comparison=True,
        category_x_label='',
        category_y_label='',
        category_show_axes_and_cross_hairs=False,
        highlight_selected_category=True,
        term_x_label=None,  # used only when default_to_term_comparison is False
        term_y_label=None,  # used only when default_to_term_comparison is False
        wordfish_style=False,
        **kwargs):
    '''
    Build the HTML for a pair of linked plots: a scatterplot of projected
    categories and a term scatterplot.

    Parameters
    ----------
    corpus : corpus object
        Source of categories, terms and term frequencies.
    asian_mode : bool
        Forwarded to both ScatterplotStructure instances.
    category_width_in_pixels, category_height_in_pixels : int
        Size of the category plot.
    term_width_in_pixels, term_height_in_pixels : int
        Size of the term plot.
    terms_to_show : int
        Passed to AssociationCompactor; terms it drops are hidden in the
        term plot.
    scaler : callable
        Applied to projected coordinates.
    term_ranker : TermRanker class
    use_metadata : bool
        Use metadata rather than term features.
    category_projector : projector
        Used to compute category_projection when none is given.
    category_projection : optional
        Precomputed projection; skips the projector when provided.
    topic_model_term_lists, metadata_descriptions : optional
        Injected into the term chart explorer when not None.
    topic_model_preview_size : int
    initial_category : optional
        Category shown first; defaults to the first corpus category.
    x_dim, y_dim : int
        Projection dimensions used for the two axes.
    show_halo : bool
    num_terms_in_halo : int
    category_color_func : str
        JavaScript function source used as the category plot color_func.
    protocol : str
    d3_url_struct : D3URLs
    category_focused, wordfish_style : bool
        Forwarded to _get_term_plot_change_js_func.
    verbose : bool
        Print how many terms are hidden/shown and forward to the explorers.
    use_full_doc : bool
        The term plot uses full documents when this or use_metadata is set.
    default_to_term_comparison : bool
        True: the term plot contrasts initial_category with all other
        categories, labelled with the category name and 'Not ' + name.
        False: the term plot shows the scaled term projection, labelled
        with term_x_label / term_y_label.
    category_x_label, category_y_label : str
        Axis labels of the category plot.
    category_show_axes_and_cross_hairs : bool
    highlight_selected_category : bool
    term_x_label, term_y_label : str, optional
        Axis labels of the term plot when default_to_term_comparison is
        False; None is rendered as an empty label.
    kwargs
        Forwarded to the term chart explorer's to_dict in the term
        comparison case only.

    Returns
    -------
    The result of PairPlotFromScatterplotStructure(...).to_html()
    '''
    if category_projection is None:
        if use_metadata:
            category_projection = category_projector.project_with_metadata(
                corpus, x_dim=x_dim, y_dim=y_dim)
        else:
            category_projection = category_projector.project(
                corpus, x_dim=x_dim, y_dim=y_dim)

    if initial_category is None:
        initial_category = corpus.get_categories()[0]

    # --- category plot -----------------------------------------------------
    category_scatter_chart_explorer = _get_category_scatter_chart_explorer(
        category_projection, scaler, term_ranker, verbose)
    category_scatter_chart_data = category_scatter_chart_explorer.to_dict(
        category=initial_category,
        max_docs_per_category=0,
    )
    category_tooltip_func = '(function(d) {return d.term})'
    initial_category_idx = corpus.get_categories().index(initial_category)
    term_plot_change_func = _get_term_plot_change_js_func(
        wordfish_style, category_focused, initial_category_idx)
    category_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(category_scatter_chart_data),
        width_in_pixels=category_width_in_pixels,
        height_in_pixels=category_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=True,
        show_characteristic=False,
        x_label=category_x_label,
        y_label=category_y_label,
        show_axes_and_cross_hairs=category_show_axes_and_cross_hairs,
        full_data='getCategoryDataAndInfo()',
        show_top_terms=False,
        get_tooltip_content=category_tooltip_func,
        color_func=category_color_func,
        show_axes=False,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        unified_context=True,
        show_category_headings=False,
        show_cross_axes=True,
        div_name='cat-plot',
        alternative_term_func=term_plot_change_func,
        highlight_selected_category=highlight_selected_category)

    # --- term plot ---------------------------------------------------------
    compacted_corpus = AssociationCompactor(
        terms_to_show, use_non_text_features=use_metadata).compact(corpus)
    terms_to_hide = set(corpus.get_terms()) - set(compacted_corpus.get_terms())
    if verbose:
        print('num terms to hide', len(terms_to_hide))
        print('num terms to show', compacted_corpus.get_num_terms())
    term_scatter_chart_explorer = ScatterChartExplorer(
        category_projection.get_corpus(),
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        term_ranker=term_ranker,
        use_non_text_features=use_metadata,
        score_transform=stretch_0_to_1,
        verbose=verbose).hide_terms(terms_to_hide)

    if default_to_term_comparison:
        if topic_model_term_lists is not None:
            term_scatter_chart_explorer.inject_metadata_term_lists(
                topic_model_term_lists)
        if metadata_descriptions is not None:
            term_scatter_chart_explorer.inject_metadata_descriptions(
                metadata_descriptions)
        if use_metadata:
            tdf = corpus.get_metadata_freq_df('')
        else:
            tdf = corpus.get_term_freq_df('')
        # Initial category versus the sum of every other category.
        scores = RankDifference().get_scores(
            tdf[initial_category],
            tdf[[c for c in corpus.get_categories()
                 if c != initial_category]].sum(axis=1))
        term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
            category=initial_category,
            scores=scores,
            include_term_category_counts=True,
            transform=dense_rank,
            **kwargs)
        # Plain strings: a trailing comma here would turn each label into a
        # one-element tuple.
        y_label = initial_category
        x_label = 'Not ' + initial_category
        color_func = None
        show_top_terms = True
        show_axes = False
    else:
        term_projection = category_projection.get_term_projection()
        original_x = term_projection['x']
        original_y = term_projection['y']
        x_coords = scaler(term_projection['x'])
        y_coords = scaler(term_projection['y'])
        x_label = term_x_label if term_x_label is not None else ''
        y_label = term_y_label if term_y_label is not None else ''
        show_axes = True
        term_scatter_chart_explorer.inject_coordinates(
            x_coords, y_coords, original_x=original_x, original_y=original_y)
        if topic_model_term_lists is not None:
            term_scatter_chart_explorer.inject_metadata_term_lists(
                topic_model_term_lists)
        if metadata_descriptions is not None:
            term_scatter_chart_explorer.inject_metadata_descriptions(
                metadata_descriptions)
        term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
            category=initial_category,
            category_name=initial_category,
            include_term_category_counts=True,
        )
        color_func = '(function(x) {return "#5555FF"})'
        show_top_terms = False

    term_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(term_scatter_chart_data),
        width_in_pixels=term_width_in_pixels,
        height_in_pixels=term_height_in_pixels,
        use_full_doc=use_metadata or use_full_doc,
        asian_mode=asian_mode,
        use_non_text_features=use_metadata,
        show_characteristic=False,
        x_label=x_label,
        y_label=y_label,
        full_data='getTermDataAndInfo()',
        show_top_terms=show_top_terms,
        get_tooltip_content=None,
        color_func=color_func,
        show_axes=show_axes,
        topic_model_preview_size=topic_model_preview_size,
        show_category_headings=False,
        div_name='d3-div-1',
        unified_context=True,
        highlight_selected_category=highlight_selected_category)

    return PairPlotFromScatterplotStructure(
        category_scatterplot_structure,
        term_scatterplot_structure,
        category_projection,
        category_width_in_pixels,
        category_height_in_pixels,
        num_terms=num_terms_in_halo,
        show_halo=show_halo,
        d3_url_struct=d3_url_struct,
        x_dim=x_dim,
        y_dim=y_dim,
        protocol=protocol).to_html()
from scattertext.Scalers import dense_rank
from scattertext.termscoring.RankDifference import RankDifference
from scattertext.termcompaction.AssociationCompactor import AssociationCompactor
from scattertext import SampleCorpora, whitespace_nlp_with_sentences, produce_scattertext_explorer
from scattertext.CorpusFromPandas import CorpusFromPandas

# Demo: build a unigram corpus from the 2012 convention speeches, compact it
# with AssociationCompactor(4000), and write an explorer page that uses
# RankDifference scores with the dense_rank transform.
convention_df = SampleCorpora.ConventionData2012.get_data()
corpus = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=whitespace_nlp_with_sentences
).build().get_unigram_corpus().compact(AssociationCompactor(4000))

html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    minimum_term_frequency=0,
                                    pmi_threshold_coefficient=0,
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'],
                                    term_scorer=RankDifference(),
                                    transform=dense_rank)

# The context manager guarantees the file is flushed and closed even if the
# write fails; binary mode keeps the UTF-8 bytes exactly as encoded.
with open('./demo_dense_rank.html', 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('Open ./demo_dense_rank.html in Chrome or Firefox.')
class SemioticSquare(SemioticSquareBase):
    '''
    Create a visualization of a semiotic square.  Requires Corpus to have
    at least three categories.

    >>> newsgroups_train = fetch_20newsgroups(subset='train',
    ...   remove=('headers', 'footers', 'quotes'))
    >>> vectorizer = CountVectorizer()
    >>> X = vectorizer.fit_transform(newsgroups_train.data)
    >>> corpus = st.CorpusFromScikit(
    ... 	X=X,
    ... 	y=newsgroups_train.target,
    ... 	feature_vocabulary=vectorizer.vocabulary_,
    ... 	category_names=newsgroups_train.target_names,
    ... 	raw_texts=newsgroups_train.data
    ... 	).build()
    >>> semseq = SemioticSquare(corpus,
    ... 	category_a = 'alt.atheism',
    ... 	category_b = 'soc.religion.christian',
    ... 	neutral_categories = ['talk.religion.misc']
    ... )
    >>> # A simple HTML table
    >>> html = SemioticSquareViz(semseq).to_html()
    >>> # The table with an interactive scatterplot below it
    >>> html = st.produce_semiotic_square_explorer(semseq,
    ...                                            x_label='More Atheism, Less Xtnity',
    ...                                            y_label='General Religious Talk')
    '''

    def __init__(self,
                 term_doc_matrix,
                 category_a,
                 category_b,
                 neutral_categories,
                 labels=None,
                 term_ranker=AbsoluteFrequencyRanker,
                 scorer=None):
        '''
        Parameters
        ----------
        term_doc_matrix : TermDocMatrix
            TermDocMatrix (or descendant) which will be used in constructing square.
        category_a : str
            Category name for term A
        category_b : str
            Category name for term B (in opposition to A)
        neutral_categories : list[str]
            List of category names that A and B will be contrasted to.  Should be in same domain.
        labels : dict
            None by default. Labels are dictionary of {'a_and_b': 'A and B', ...} to be shown
            above each category.
        term_ranker : TermRanker
            Class for returning a term-frequency data frame
        scorer : termscoring class, optional
            Term scoring class for lexicon mining. Default: `RankDifference`

        Raises
        ------
        AssertionError
            If any given category is not a category of term_doc_matrix.
        EmptyNeutralCategoriesError
            If neutral_categories is empty.
        '''
        assert category_a in term_doc_matrix.get_categories()
        assert category_b in term_doc_matrix.get_categories()
        for category in neutral_categories:
            assert category in term_doc_matrix.get_categories()
        if len(neutral_categories) == 0:
            raise EmptyNeutralCategoriesError()
        self.category_a_ = category_a
        self.category_b_ = category_b
        self.neutral_categories_ = neutral_categories
        self._build_square(term_doc_matrix, term_ranker, labels, scorer)

    def _build_square(self, term_doc_matrix, term_ranker, labels, scorer):
        '''
        Store the inputs and compute the axes and lexicons.  A None scorer
        is replaced by RankDifference().
        '''
        self.term_doc_matrix_ = term_doc_matrix
        self.term_ranker = term_ranker(term_doc_matrix)
        self.scorer = RankDifference() \
            if scorer is None else scorer
        self.axes = self._build_axes(scorer)
        self.lexicons = self._build_lexicons()
        self._labels = labels

    def get_axes(self, scorer=None):
        '''
        Parameters
        ----------
        scorer : optional
            When given (and truthy), axes are recomputed with it instead of
            returning the axes built at construction time.

        Returns
        -------
        pd.DataFrame
        '''
        if scorer:
            return self._build_axes(scorer)
        return self.axes

    def get_lexicons(self, num_terms=10):
        '''
        Parameters
        ----------
        num_terms, int

        Returns
        -------
        dict
            Lexicon name -> index of its first num_terms terms.
        '''
        return {k: v.index[:num_terms]
                for k, v in self.lexicons.items()}

    def get_labels(self):
        '''
        Returns
        -------
        dict
            '<name>_label' -> label text, for each of the eight corners and
            sides.  User supplied labels override the defaults derived from
            the category names.
        '''
        a = self._get_default_a_label()
        b = self._get_default_b_label()
        default_labels = {'a': a,
                          'not_a': 'Not ' + a,
                          'b': b,
                          'not_b': 'Not ' + b,
                          'a_and_b': a + ' + ' + b,
                          'not_a_and_not_b': 'Not ' + a + ' + Not ' + b,
                          'a_and_not_b': a + ' + Not ' + b,
                          'b_and_not_a': 'Not ' + a + ' + ' + b}
        labels = self._labels
        if labels is None:
            labels = {}
        return {name + '_label': labels.get(name, default_labels[name])
                for name in default_labels}

    def _get_default_b_label(self):
        return self.category_b_

    def _get_default_a_label(self):
        return self.category_a_

    def _build_axes(self, scorer):
        '''
        Compute the x and y score of every term plus its total count.

        Parameters
        ----------
        scorer : scorer or None
            None falls back to self.scorer.

        Returns
        -------
        pd.DataFrame with columns 'x', 'y' and 'counts'.
        '''
        if scorer is None:
            scorer = self.scorer
        tdf = self._get_term_doc_count_df()
        # Taken before the score columns are added so that only the
        # frequency columns are summed.
        counts = tdf.sum(axis=1)
        # Missing scores are replaced by self.scorer's default score.
        # .loc writes into tdf itself; chained indexing
        # (tdf['x'][mask] = ...) may write to a temporary copy instead.
        tdf['x'] = self._get_x_axis(scorer, tdf)
        tdf.loc[np.isnan(tdf['x']), 'x'] = self.scorer.get_default_score()
        tdf['y'] = self._get_y_axis(scorer, tdf)
        tdf.loc[np.isnan(tdf['y']), 'y'] = self.scorer.get_default_score()
        tdf['counts'] = counts
        return tdf[['x', 'y', 'counts']]

    def _get_x_axis(self, scorer, tdf):
        # Category A scored against category B.
        return scorer.get_scores(
            tdf[self.category_a_ + ' freq'],
            tdf[self.category_b_ + ' freq']
        )

    def _get_y_axis(self, scorer, tdf):
        # A and B together scored against the neutral categories.
        return scorer.get_scores(
            tdf[[t + ' freq' for t in [self.category_a_, self.category_b_]]].sum(axis=1),
            tdf[[t + ' freq' for t in self.neutral_categories_]].sum(axis=1)
        )

    def _get_term_doc_count_df(self):
        return (self.term_ranker.get_ranks()
                [[t + ' freq' for t in self._get_all_categories()]])

    def _get_all_categories(self):
        return [self.category_a_, self.category_b_] + self.neutral_categories_

    def _build_lexicons(self):
        '''
        Rank the terms of each region of the square by squared distance to
        the region's extreme point (closest first).

        Returns
        -------
        dict
            Lexicon name -> pd.Series of squared distances, ascending.
        '''
        self.lexicons = {}
        ax = self.axes
        x_max = ax['x'].max()
        y_max = ax['y'].max()
        x_min = ax['x'].min()
        y_min = ax['y'].min()
        x_baseline = self._get_x_baseline()
        y_baseline = self._get_y_baseline()

        def dist(candidates, x_bound, y_bound):
            return ((x_bound - candidates['x']) ** 2
                    + (y_bound - candidates['y']) ** 2).sort_values()

        # Corners: terms in one quadrant, measured from that quadrant's corner.
        self.lexicons['a'] = dist(
            ax[(ax['x'] > x_baseline) & (ax['y'] > y_baseline)],
            x_max, y_max)
        self.lexicons['not_a'] = dist(
            ax[(ax['x'] < x_baseline) & (ax['y'] < y_baseline)],
            x_min, y_min)
        self.lexicons['b'] = dist(
            ax[(ax['x'] < x_baseline) & (ax['y'] > y_baseline)],
            x_min, y_max)
        self.lexicons['not_b'] = dist(
            ax[(ax['x'] > x_baseline) & (ax['y'] < y_baseline)],
            x_max, y_min)

        # Sides: terms in one half-plane, measured from the middle of the edge.
        self.lexicons['a_and_b'] = dist(
            ax[(ax['y'] > y_baseline)],
            x_baseline, y_max)
        self.lexicons['not_a_and_not_b'] = dist(
            ax[(ax['y'] < y_baseline)],
            x_baseline, y_min)
        self.lexicons['a_and_not_b'] = dist(
            ax[(ax['x'] > x_baseline)],
            x_max, y_baseline)
        self.lexicons['b_and_not_a'] = dist(
            ax[(ax['x'] < x_baseline)],
            x_min, y_baseline)

        return self.lexicons

    def _get_y_baseline(self):
        return self.scorer.get_default_score()

    def _get_x_baseline(self):
        return self.scorer.get_default_score()