def test_alternative_text(self):
    """Check that ``alternative_text_field`` changes the exported texts.

    When ``to_dict`` is called with ``alternative_text_field='alt'`` the
    first exported text must equal its own upper-cased form; when the
    argument is omitted the first exported text must not.
    """
    corpus = build_hamlet_jz_corpus_with_alt_text()

    # Each export gets its own explorer so the two calls share no state.
    alt_export = ScatterChartExplorer(
        corpus, minimum_term_frequency=0
    ).to_dict('hamlet', alternative_text_field='alt')
    first_alt_text = alt_export['docs']['texts'][0]
    self.assertEqual(first_alt_text, first_alt_text.upper())

    plain_export = ScatterChartExplorer(
        corpus, minimum_term_frequency=0
    ).to_dict('hamlet')
    first_plain_text = plain_export['docs']['texts'][0]
    self.assertNotEqual(first_plain_text, first_plain_text.upper())
def test_multi_categories(self):
    """Compare a 'hamlet vs. rest' export with a 'hamlet vs. swift' export.

    The two exports must report different sets of not-category internal
    names, while their document labels and category lists must be equal.
    """
    corpus = get_test_corpus()

    versus_all = ScatterChartExplorer(
        corpus=corpus, minimum_term_frequency=0
    ).to_dict('hamlet')
    versus_swift = ScatterChartExplorer(
        corpus=corpus, minimum_term_frequency=0
    ).to_dict('hamlet', not_categories=['swift'])

    all_not_names = set(versus_all['info']['not_category_internal_names'])
    swift_not_names = set(versus_swift['info']['not_category_internal_names'])
    self.assertNotEqual(all_not_names, swift_not_names)

    # Restricting the not-categories must leave the document section alone.
    for docs_key in ('labels', 'categories'):
        self.assertEqual(list(versus_all['docs'][docs_key]),
                         list(versus_swift['docs'][docs_key]))
def test_metadata(self):
    """Check that per-document metadata is exported under ``docs['meta']``.

    The full ``docs`` section of the export is compared against a literal
    dictionary containing labels, categories, the metadata strings, and
    the document texts.
    """
    corpus = build_hamlet_jz_corpus()
    meta = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight']

    explorer = ScatterChartExplorer(corpus, minimum_term_frequency=0)
    export = explorer.to_dict('hamlet', metadata=meta)

    # Show the complete diff if the dictionaries differ.
    self.maxDiff = None

    # Labels are converted to a plain list so they compare equal to the
    # literal list below.
    export['docs']['labels'] = list(export['docs']['labels'])

    expected_docs = {
        'labels': [0, 0, 0, 0, 1, 1, 1, 1],
        'categories': ['hamlet', 'jay-z/r. kelly'],
        'meta': [
            'one', 'two', 'three', 'four',
            'five', 'six', 'seven', 'eight',
        ],
        'texts': [
            "what art thou that usurp'st this time of night,",
            'together with that fair and warlike form',
            'in which the majesty of buried denmark',
            'did sometimes march? by heaven i charge thee, speak!',
            'halt! who goes there?',
            'it is i sire tone from brooklyn.',
            'well, speak up man what is it?',
            'news from the east sire! the best of both worlds has returned!',
        ],
    }
    self.assertEqual(export['docs'], expected_docs)
def test_to_dict(self):
    """Check the overall structure of ``ScatterChartExplorer.to_dict``.

    Pins the top-level keys, the keys of the ``info`` section, the document
    labels and texts, and the set of fields present on the entry for the
    term ``'art'``.
    """
    # Seed both random number generators before building the export.
    np.random.seed(0)
    random.seed(0)

    corpus = build_hamlet_jz_corpus()
    j = ScatterChartExplorer(corpus, minimum_term_frequency=0).to_dict('hamlet')

    self.assertEqual(set(j.keys()), {'info', 'data', 'docs'})
    self.assertEqual(
        set(j['info'].keys()),
        {
            'not_category_name',
            'category_name',
            'category_terms',
            'not_category_internal_names',
            'not_category_terms',
            'category_internal_name',
            'categories',
            'neutral_category_name',
            'extra_category_name',
            'neutral_category_internal_names',
            'extra_category_internal_names',
        })

    self.assertEqual(list(j['docs']['labels']), [0, 0, 0, 0, 1, 1, 1, 1])
    self.assertEqual(list(j['docs']['texts']), [
        "what art thou that usurp'st this time of night,",
        'together with that fair and warlike form',
        'in which the majesty of buried denmark',
        'did sometimes march? by heaven i charge thee, speak!',
        'halt! who goes there?',
        'it is i sire tone from brooklyn.',
        'well, speak up man what is it?',
        'news from the east sire! the best of both worlds has returned!',
    ])

    expected = {
        'y': 0.5,
        'ncat': 0,
        'ncat25k': 0,
        'bg': 5,
        'cat': 1,
        's': 0.5,
        'term': 'art',
        'os': 0.5192,
        'extra': 0,
        'extra25k': 0,
        'cat25k': 758,
        'x': 0.06,
        'neut': 0,
        'neut25k': 0,
        'ox': 5,
        'oy': 3,
    }
    actual = [t for t in j['data'] if t['term'] == 'art'][0]

    # NOTE(review): only the field names and the term itself are compared.
    # A per-field numeric comparison of ``expected`` against ``actual`` was
    # disabled in this test; the numeric values above are therefore not
    # verified. Confirm them before re-enabling such a check.
    self.assertEqual(set(expected.keys()), set(actual.keys()))
    self.assertEqual(expected['term'], actual['term'])

    self.assertEqual(set(j['docs'].keys()), {'texts', 'labels', 'categories'})
def test_include_term_category_counts(self):
    """Check the ``termCounts`` section added by ``include_term_category_counts``.

    The export must contain one ``termCounts`` entry per corpus category,
    every ``(freq, docs)`` pair must satisfy ``freq >= docs``, and the union
    of keys across all entries must have as many members as the corpus has
    terms.
    """
    corpus = build_hamlet_jz_corpus().get_unigram_corpus()
    explorer = ScatterChartExplorer(corpus, minimum_term_frequency=0)
    export = explorer.to_dict('hamlet', include_term_category_counts=True)

    self.assertEqual(set(export.keys()),
                     set(['info', 'data', 'docs', 'termCounts']))
    self.assertEqual(len(export['termCounts']), corpus.get_num_categories())

    seen_term_indices = set()
    for category_counts in export['termCounts']:
        seen_term_indices.update(category_counts.keys())
        freq_covers_docs = [
            freq >= docs for freq, docs in category_counts.values()
        ]
        self.assertTrue(all(freq_covers_docs))

    self.assertEqual(len(seen_term_indices), corpus.get_num_terms())
def _get_category_scatter_chart_explorer(category_projection, scaler, term_ranker, verbose):
    """Build the explorer for the category plot and inject its coordinates.

    Parameters
    ----------
    category_projection : object
        Must provide ``get_corpus()`` and ``get_pandas_projection()``; the
        latter is indexed with ``'x'`` and ``'y'``.
    scaler : callable
        Applied separately to the ``'x'`` and ``'y'`` projection columns to
        obtain the plotted coordinates.
    term_ranker : object
        Accepted but not forwarded to the explorer (the corresponding
        keyword is disabled below).
    verbose : bool
        Forwarded to ``ScatterChartExplorer``.

    Returns
    -------
    ScatterChartExplorer
        The explorer with scaled and original coordinates injected.
    """
    explorer_settings = dict(
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        filter_unigrams=False,
        jitter=0,
        max_terms=None,
        # term_ranker=term_ranker,
        use_non_text_features=True,
        term_significance=None,
        terms_to_include=None,
        verbose=verbose,
    )
    explorer = ScatterChartExplorer(category_projection.get_corpus(),
                                    **explorer_settings)

    projection = category_projection.get_pandas_projection()
    scaled_x = scaler(projection['x'])
    scaled_y = scaler(projection['y'])
    explorer.inject_coordinates(x_coords=scaled_x,
                                y_coords=scaled_y,
                                original_x=projection['x'],
                                original_y=projection['y'])
    return explorer
def test_hide_terms(self):
    """Check that ``hide_terms`` marks exactly the requested terms as hidden.

    ``hide_terms`` must return a ``ScatterChartExplorer``. In the resulting
    export, every hidden term's entry must carry ``display == False`` and no
    other entry may contain a ``'display'`` key at all.
    """
    corpus = build_hamlet_jz_corpus().get_unigram_corpus()
    terms_to_hide = ['thou', 'heaven']

    sc = ScatterChartExplorer(
        corpus, minimum_term_frequency=0
    ).hide_terms(terms_to_hide)

    # ``assertEquals`` is a deprecated alias that no longer exists on
    # Python 3.12+; an exact-type identity check states the same intent.
    self.assertIs(type(sc), ScatterChartExplorer)

    j = sc.to_dict('hamlet', include_term_category_counts=True)

    # ``== False`` rather than ``is False`` is kept deliberately: the flag's
    # concrete type is not visible from this test, so equality is the
    # safer comparison.
    self.assertTrue(
        all([
            'display' in t and t['display'] == False  # noqa: E712
            for t in j['data']
            if t['term'] in terms_to_hide
        ]))
    self.assertTrue(
        all([
            'display' not in t
            for t in j['data']
            if t['term'] not in terms_to_hide
        ]))
def produce_pairplot(corpus,
                     asian_mode=False,
                     category_width_in_pixels=500,
                     category_height_in_pixels=700,
                     term_width_in_pixels=500,
                     term_height_in_pixels=700,
                     terms_to_show=3000,
                     scaler=scale_neg_1_to_1_with_zero_mean,
                     term_ranker=AbsoluteFrequencyRanker,
                     use_metadata=False,
                     category_projector=CategoryProjector(),
                     category_projection=None,
                     topic_model_term_lists=None,
                     topic_model_preview_size=10,
                     metadata_descriptions=None,
                     initial_category=None,
                     x_dim=0,
                     y_dim=1,
                     show_halo=True,
                     num_terms_in_halo=5,
                     category_color_func='(function(x) {return "#5555FF"})',
                     protocol='https',
                     d3_url_struct=D3URLs(),
                     **kwargs):
    '''Returns html code of a paired category plot and term plot.

    A category scatterplot (div ``cat-plot``) is built from the corpus held
    by the category projection, with coordinates taken from the projection
    and passed through ``scaler``. A term scatterplot (div ``d3-div-1``) is
    built from ``corpus`` with ``RankDifference`` scores of
    ``initial_category`` against all other categories.

    Parameters
    ----------
    corpus : Corpus
        Corpus to visualize.
    asian_mode : bool, optional
        Forwarded to both scatterplot structures.
    category_width_in_pixels, category_height_in_pixels : int, optional
        Size of the category plot.
    term_width_in_pixels, term_height_in_pixels : int, optional
        Size of the term plot.
    terms_to_show : int, optional
        Passed to ``AssociationCompactor``; terms absent from the compacted
        corpus are hidden in the term plot.
    scaler : callable, optional
        Applied to the projection's ``'x'`` and ``'y'`` columns.
    term_ranker : TermRanker class, optional
        Forwarded to both ``ScatterChartExplorer`` instances.
    use_metadata : bool, optional
        If True, project with metadata, use the metadata frequency table for
        scoring, and show non-text features and full documents in the term
        plot.
    category_projector : optional
        Used to compute ``category_projection`` when none is supplied.
        NOTE(review): the default instance is created once at definition
        time and shared by all calls.
    category_projection : optional
        Precomputed projection; computed from ``category_projector`` if None.
    topic_model_term_lists : optional
        If not None, injected into the term explorer as metadata term lists.
    topic_model_preview_size : int, optional
        Forwarded to the term scatterplot structure.
    metadata_descriptions : optional
        If not None, injected into the term explorer.
    initial_category : str, optional
        Category initially compared; defaults to the corpus's first category.
    x_dim, y_dim : int, optional
        Projection dimensions, forwarded to the projector and the pair plot.
    show_halo : bool, optional
        Forwarded to ``PairPlotFromScatterplotStructure``.
    num_terms_in_halo : int, optional
        Forwarded as ``num_terms`` to ``PairPlotFromScatterplotStructure``.
    category_color_func : str, optional
        Javascript colour function for the category plot.
    protocol : str, optional
        Forwarded to ``PairPlotFromScatterplotStructure``.
    d3_url_struct : D3URLs, optional
        Forwarded to ``PairPlotFromScatterplotStructure``.
        NOTE(review): the default instance is created once at definition
        time and shared by all calls.
    **kwargs
        Forwarded to the term explorer's ``to_dict``.

    Returns
    -------
    str
        Result of ``PairPlotFromScatterplotStructure(...).to_html()``.
    '''
    if category_projection is None:
        if use_metadata:
            category_projection = category_projector.project_with_metadata(
                corpus, x_dim=x_dim, y_dim=y_dim)
        else:
            category_projection = category_projector.project(
                corpus, x_dim=x_dim, y_dim=y_dim)

    if initial_category is None:
        initial_category = corpus.get_categories()[0]

    # --- category plot -----------------------------------------------------
    category_scatter_chart_explorer = ScatterChartExplorer(
        category_projection.category_corpus,
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        filter_unigrams=False,
        jitter=0,
        max_terms=None,
        term_ranker=term_ranker,
        use_non_text_features=True,
        term_significance=None,
        terms_to_include=None)
    proj_df = category_projection.get_pandas_projection()
    category_scatter_chart_explorer.inject_coordinates(
        x_coords=scaler(proj_df['x']),
        y_coords=scaler(proj_df['y']),
        original_x=proj_df['x'],
        original_y=proj_df['y'])
    category_scatter_chart_data = category_scatter_chart_explorer.to_dict(
        category=initial_category,
        max_docs_per_category=0,
    )

    category_tooltip_func = '(function(d) {return d.term})'
    category_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(category_scatter_chart_data),
        width_in_pixels=category_width_in_pixels,
        height_in_pixels=category_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=True,
        show_top_terms=False,
        show_characteristic=False,
        get_tooltip_content=category_tooltip_func,
        color_func=category_color_func,
        show_axes=False,
        unified_context=True,
        show_category_headings=False,
        show_cross_axes=True,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        y_label='',
        x_label='',
        full_data='getCategoryDataAndInfo()',
        alternative_term_func=
        '(function (termInfo) {termPlotInterface.drawCategoryAssociation(termInfo.i); return false;})',
        div_name='cat-plot')

    # --- term plot ---------------------------------------------------------
    # Terms that do not survive compaction stay in the data but are hidden.
    compacted_corpus = AssociationCompactor(terms_to_show).compact(corpus)
    terms_to_hide = set(corpus.get_terms()) - set(compacted_corpus.get_terms())
    print('num terms to hide', len(terms_to_hide))
    print('num terms to show', compacted_corpus.get_num_terms())

    term_scatter_chart_explorer = ScatterChartExplorer(
        corpus,
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        term_ranker=term_ranker,
        use_non_text_features=use_metadata,
        score_transform=stretch_0_to_1,
    ).hide_terms(terms_to_hide)
    if topic_model_term_lists is not None:
        term_scatter_chart_explorer.inject_metadata_term_lists(
            topic_model_term_lists)
    if metadata_descriptions is not None:
        term_scatter_chart_explorer.inject_metadata_descriptions(
            metadata_descriptions)

    if use_metadata:
        tdf = corpus.get_metadata_freq_df('')
    else:
        tdf = corpus.get_term_freq_df('')
    other_categories = [c for c in corpus.get_categories()
                        if c != initial_category]
    scores = RankDifference().get_scores(
        tdf[initial_category],
        tdf[other_categories].sum(axis=1))

    term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
        category=initial_category,
        scores=scores,
        include_term_category_counts=True,
        transform=dense_rank,
        **kwargs)

    term_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(term_scatter_chart_data),
        width_in_pixels=term_width_in_pixels,
        height_in_pixels=term_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=use_metadata,
        show_top_terms=True,
        show_characteristic=False,
        get_tooltip_content=None,
        show_category_headings=False,
        use_full_doc=use_metadata,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        topic_model_preview_size=topic_model_preview_size,
        y_label=initial_category,
        x_label='Not ' + initial_category,
        full_data='getTermDataAndInfo()',
        div_name='d3-div-1',
    )

    return PairPlotFromScatterplotStructure(category_scatterplot_structure,
                                            term_scatterplot_structure,
                                            category_projection,
                                            category_width_in_pixels,
                                            category_height_in_pixels,
                                            num_terms=num_terms_in_halo,
                                            show_halo=show_halo,
                                            d3_url_struct=d3_url_struct,
                                            x_dim=x_dim,
                                            y_dim=y_dim,
                                            protocol=protocol).to_html()
def produce_scattertext_explorer(corpus,
                                 category,
                                 category_name=None,
                                 not_category_name=None,
                                 protocol='https',
                                 pmi_threshold_coefficient=DEFAULT_MINIMUM_TERM_FREQUENCY,
                                 minimum_term_frequency=DEFAULT_PMI_THRESHOLD_COEFFICIENT,
                                 minimum_not_category_term_frequency=0,
                                 max_terms=None,
                                 filter_unigrams=False,
                                 height_in_pixels=None,
                                 width_in_pixels=None,
                                 max_snippets=None,
                                 max_docs_per_category=None,
                                 metadata=None,
                                 scores=None,
                                 x_coords=None,
                                 y_coords=None,
                                 original_x=None,
                                 original_y=None,
                                 rescale_x=None,
                                 rescale_y=None,
                                 singleScoreMode=False,
                                 sort_by_dist=True,
                                 reverse_sort_scores_for_not_category=True,
                                 use_full_doc=False,
                                 transform=percentile_alphabetical,
                                 jitter=0,
                                 gray_zero_scores=False,
                                 term_ranker=None,
                                 asian_mode=False,
                                 use_non_text_features=False,
                                 show_top_terms=True,
                                 show_characteristic=True,
                                 word_vec_use_p_vals=False,
                                 max_p_val=0.1,
                                 p_value_colors=False,
                                 term_significance=None,
                                 save_svg_button=False,
                                 x_label=None,
                                 y_label=None,
                                 d3_url=None,
                                 d3_scale_chromatic_url=None,
                                 pmi_filter_thresold=None,
                                 alternative_text_field=None,
                                 terms_to_include=None,
                                 semiotic_square=None,
                                 num_terms_semiotic_square=None,
                                 not_categories=None,
                                 show_neutral=False,
                                 neutral_category_name=None,
                                 get_tooltip_content=None,
                                 x_axis_values=None,
                                 y_axis_values=None,
                                 color_func=None,
                                 term_scorer=None,
                                 show_axes=True):
    '''Returns html code of visualization.

    Parameters
    ----------
    corpus : Corpus
        Corpus to use.
    category : str
        Name of category column as it appears in original data frame.
    category_name : str
        Name of category to use.  E.g., "5-star reviews."
        Optional, defaults to category name.
    not_category_name : str
        Name of everything that isn't in category.  E.g., "Below 5-star reviews".
        Optional. Defaults to the single entry of not_categories when exactly
        one is given; otherwise to "N(n)ot " + category_name, with the case
        of the 'n' dependent on the case of the first letter in category_name.
    protocol : str, optional
        Protocol to use.  Either http or https.  Default is https.
    pmi_threshold_coefficient : int, optional
        Filter out bigrams with a PMI of < 2 * pmi_threshold_coefficient.
        NOTE(review): the default is bound to DEFAULT_MINIMUM_TERM_FREQUENCY
        and minimum_term_frequency's default to
        DEFAULT_PMI_THRESHOLD_COEFFICIENT; the two look swapped. Left as-is
        because changing defaults alters behavior for existing callers --
        confirm intent.
    minimum_term_frequency : int, optional
        Minimum number of times word needs to appear to make it into visualization.
    minimum_not_category_term_frequency : int, optional
        If an n-gram does not occur in the category, minimum times it
        must been seen to be included. Default is 0.
    max_terms : int, optional
        Maximum number of terms to include in visualization.
    filter_unigrams : bool, optional
        Default False, do we filter out unigrams that only occur in one bigram
    width_in_pixels : int, optional
        Width of viz in pixels, if None, default to JS's choice
    height_in_pixels : int, optional
        Height of viz in pixels, if None, default to JS's choice
    max_snippets : int, optional
        Maximum number of snippets to show when term is clicked.  If None, all are shown.
    max_docs_per_category: int, optional
        Maximum number of documents to store per category.  If None, by default, all are stored.
    metadata : list, optional
        list of meta data strings that will be included for each document
    scores : np.array, optional
        Array of term scores or None.  Overridden when term_scorer is given.
    x_coords : np.array, optional
        Array of term x-axis positions or None.  Must be in [0,1].
        If present, y_coords must also be present.
    y_coords : np.array, optional
        Array of term y-axis positions or None.  Must be in [0,1].
        If present, x_coords must also be present.
    original_x : array-like
        Original, unscaled x-values.  Defaults to x_coords
    original_y : array-like
        Original, unscaled y-values.  Defaults to y_coords
    rescale_x : lambda list[0,1]: list[0,1], optional
        Rescales x-axis after filtering
    rescale_y : lambda list[0,1]: list[0,1], optional
        Rescales y-axis after filtering
    singleScoreMode : bool, optional
        Label terms based on score vs distance from corner.  Good for topic scores. Show only one color.
    sort_by_dist: bool, optional
        Label terms based distance from corner. True by default.  Negated by singleScoreMode.
    reverse_sort_scores_for_not_category: bool, optional
        If using a custom score, score the not-category class by
        lowest-score-as-most-predictive. Turn this off for word vector
        or topic similarity. Default True.
    use_full_doc : bool, optional
        Use the full document in snippets.  False by default.
    transform : function, optional
        not recommended for editing.  change the way terms are ranked.
        default is percentile_alphabetical
    jitter : float, optional
        percentage of axis to jitter each point.  default is 0.
    gray_zero_scores : bool, optional
        If True, color points with zero-scores a light shade of grey.  False by default.
    term_ranker : TermRanker, optional
        TermRanker class for determining term frequency ranks.
        Defaults to termranking.AbsoluteFrequencyRanker.
    asian_mode : bool, optional
        Use a special Javascript regular expression that's specific to chinese or japanese
    use_non_text_features : bool, optional
        Show non-bag-of-words features (e.g., Empath) instead of text.  False by default.
    show_top_terms : bool, default True
        Show top terms on the left-hand side of the visualization
    show_characteristic: bool, default True
        Show characteristic terms on the far left-hand side of the visualization
    word_vec_use_p_vals: bool, default False
        Sort by harmonic mean of score and distance.
    max_p_val : float, default 0.1
        If word_vec_use_p_vals, the minimum p val to use.
    p_value_colors : bool, default False
        Color points differently if p val is above 1-max_p_val, below max_p_val, or
        in between.
    term_significance : TermSignificance instance or None
        Way of getting significance scores.  If None, p values will not be added.
    save_svg_button : bool, default False
        Add a save as SVG button to the page.
    x_label : str, default None
        Custom x-axis label
    y_label : str, default None
        Custom y-axis label
    d3_url : str, None by default.
        The url (or path) of d3, to be inserted into <script src="..."/>.
        Overrides `protocol`.
        By default, this is `DEFAULT_D3_URL` declared in `HTMLVisualizationAssembly`.
    d3_scale_chromatic_url : str, None by default.
        URL of d3 scale chromatic, to be inserted into <script src="..."/>.
        Overrides `protocol`.
        By default, this is `DEFAULT_D3_SCALE_CHROMATIC` declared in `HTMLVisualizationAssembly`.
    pmi_filter_thresold : (DEPRECATED) int, None by default
        DEPRECATED.  Use pmi_threshold_coefficient instead.  If given, it
        overrides pmi_threshold_coefficient and a DeprecationWarning is issued.
    alternative_text_field : str or None, optional
        Field in from dataframe used to make corpus to display in place of parsed text. Only
        can be used if corpus is a ParsedCorpus instance.
    terms_to_include : list or None, optional
        Whitelist of terms to include in visualization.
    semiotic_square : SemioticSquare
        None by default.  SemioticSquare based on corpus.  Includes square above visualization.
    num_terms_semiotic_square : int
        None by default.  Number of terms to show in semiotic square;
        forwarded to get_semiotic_square_html.
        Only active if semiotic square is present.
    not_categories : list
        All categories other than category by default.  Documents labeled
        with remaining category.
    show_neutral : bool
        False by default.  Show a third column listing contexts in the
        neutral categories.
    neutral_category_name : str
        "Neutral" by default. Only active if show_neutral is True.  Name of the neutral
        column.
    get_tooltip_content : str
        Javascript function to control content of tooltip.  Function takes a parameter
        which is a dictionary entry produced by `ScatterChartExplorer.to_dict` and
        returns a string.
    x_axis_values : list, default None
        Value-labels to show on x-axis. Low, medium, high are defaults.
    y_axis_values : list, default None
        Value-labels to show on y-axis. Low, medium, high are defaults.
    color_func : str, default None
        Javascript function to control color of a point.  Function takes a parameter
        which is a dictionary entry produced by `ScatterChartExplorer.to_dict` and
        returns a string.
    term_scorer : Object, default None
        In lieu of scores, object with a get_scores(a,b) function that returns a set of scores,
        where a and b are term counts.  Scorer optionally has a get_term_freqs function.
    show_axes : bool, default True
        Show the ticked axes on the plot.  If false, show inner axes as a crosshair.

    Returns
    -------
    str
        html of visualization

    Raises
    ------
    Exception
        If exactly one of x_coords and y_coords is given.
    '''
    color = None
    if singleScoreMode or word_vec_use_p_vals:
        color = 'd3.interpolatePurples'

    # singleScoreMode always disables distance-based labelling.
    sort_by_dist = not (singleScoreMode or not sort_by_dist)

    if term_ranker is None:
        term_ranker = termranking.AbsoluteFrequencyRanker

    if category_name is None:
        category_name = category
    if not_category_name is None:
        if not_categories is not None and len(not_categories) == 1:
            not_category_name = not_categories[0]
        else:
            # Slicing instead of indexing so that an empty name does not
            # raise IndexError.
            prefix = 'Not' if category_name[:1].isupper() else 'not'
            not_category_name = prefix + ' ' + category_name

    if term_scorer:
        tdf = term_ranker(corpus).get_ranks()
        cat_freqs = tdf[category + ' freq']
        if not_categories:
            not_cat_freqs = tdf[[c + ' freq' for c in not_categories]].sum(axis=1)
        else:
            # Columns are named '<category> freq'; subtracting tdf[category]
            # would look up a column that does not follow that naming.
            not_cat_freqs = tdf.sum(axis=1) - cat_freqs
        scores = term_scorer.get_scores(cat_freqs, not_cat_freqs)

    if pmi_filter_thresold is not None:
        pmi_threshold_coefficient = pmi_filter_thresold
        warnings.warn(
            "The argument name 'pmi_filter_thresold' has been deprecated. "
            "Use 'pmi_threshold_coefficient' in its place",
            DeprecationWarning)

    scatter_chart_explorer = ScatterChartExplorer(
        corpus,
        minimum_term_frequency=minimum_term_frequency,
        minimum_not_category_term_frequency=minimum_not_category_term_frequency,
        pmi_threshold_coefficient=pmi_threshold_coefficient,
        filter_unigrams=filter_unigrams,
        jitter=jitter,
        max_terms=max_terms,
        term_ranker=term_ranker,
        use_non_text_features=use_non_text_features,
        term_significance=term_significance,
        terms_to_include=terms_to_include)

    if ((x_coords is None and y_coords is not None)
            or (y_coords is None and x_coords is not None)):
        raise Exception(
            "Both x_coords and y_coords need to be passed or both left blank")
    if x_coords is not None:
        scatter_chart_explorer.inject_coordinates(x_coords,
                                                  y_coords,
                                                  rescale_x=rescale_x,
                                                  rescale_y=rescale_y,
                                                  original_x=original_x,
                                                  original_y=original_y)

    html_base = None
    if semiotic_square:
        html_base = get_semiotic_square_html(num_terms_semiotic_square,
                                             semiotic_square)

    scatter_chart_data = scatter_chart_explorer.to_dict(
        category=category,
        category_name=category_name,
        not_category_name=not_category_name,
        not_categories=not_categories,
        transform=transform,
        scores=scores,
        max_docs_per_category=max_docs_per_category,
        metadata=metadata,
        alternative_text_field=alternative_text_field,
        neutral_category_name=neutral_category_name)

    return HTMLVisualizationAssembly(
        VizDataAdapter(scatter_chart_data),
        width_in_pixels=width_in_pixels,
        height_in_pixels=height_in_pixels,
        max_snippets=max_snippets,
        color=color,
        grey_zero_scores=gray_zero_scores,
        sort_by_dist=sort_by_dist,
        reverse_sort_scores_for_not_category=reverse_sort_scores_for_not_category,
        use_full_doc=use_full_doc,
        asian_mode=asian_mode,
        use_non_text_features=use_non_text_features,
        show_characteristic=show_characteristic,
        show_top_terms=show_top_terms,
        word_vec_use_p_vals=word_vec_use_p_vals,
        max_p_val=max_p_val,
        save_svg_button=save_svg_button,
        p_value_colors=p_value_colors,
        x_label=x_label,
        y_label=y_label,
        show_neutral=show_neutral,
        get_tooltip_content=get_tooltip_content,
        x_axis_values=x_axis_values,
        y_axis_values=y_axis_values,
        color_func=color_func,
        show_axes=show_axes) \
        .to_html(protocol=protocol,
                 d3_url=d3_url,
                 d3_scale_chromatic_url=d3_scale_chromatic_url,
                 html_base=html_base)
def produce_pairplot(
        corpus,
        asian_mode=False,
        category_width_in_pixels=500,
        category_height_in_pixels=700,
        term_width_in_pixels=500,
        term_height_in_pixels=700,
        terms_to_show=3000,
        scaler=scale_neg_1_to_1_with_zero_mean,
        term_ranker=AbsoluteFrequencyRanker,
        use_metadata=False,
        category_projector=CategoryProjector(),
        category_projection=None,
        topic_model_term_lists=None,
        topic_model_preview_size=10,
        metadata_descriptions=None,
        initial_category=None,
        x_dim=0,
        y_dim=1,
        show_halo=True,
        num_terms_in_halo=5,
        category_color_func='(function(x) {return "#5555FF"})',
        protocol='https',
        d3_url_struct=D3URLs(),
        category_focused=False,
        verbose=False,
        use_full_doc=True,
        default_to_term_comparison=True,
        category_x_label='',
        category_y_label='',
        category_show_axes_and_cross_hairs=False,
        highlight_selected_category=True,
        term_x_label=None,  # used if default_to_term_comparison
        term_y_label=None,  # used if default_to_term_comparison
        wordfish_style=False,
        **kwargs):
    '''Returns html code of a paired category plot and term plot.

    The category plot (div ``cat-plot``) is built by
    ``_get_category_scatter_chart_explorer`` from the category projection.
    The term plot (div ``d3-div-1``) is built in one of two modes:

    * ``default_to_term_comparison=True``: terms are scored with
      ``RankDifference`` for ``initial_category`` against all other
      categories, and the axes are labelled with the category name and
      ``'Not ' + category name``.
    * ``default_to_term_comparison=False``: term coordinates come from
      ``category_projection.get_term_projection()`` passed through
      ``scaler``; ``term_x_label`` / ``term_y_label`` label the axes.

    Parameters
    ----------
    corpus : Corpus
        Corpus to visualize.
    asian_mode : bool, optional
        Forwarded to both scatterplot structures.
    category_width_in_pixels, category_height_in_pixels : int, optional
        Size of the category plot.
    term_width_in_pixels, term_height_in_pixels : int, optional
        Size of the term plot.
    terms_to_show : int, optional
        Passed to ``AssociationCompactor``; terms absent from the compacted
        corpus are hidden in the term plot.
    scaler : callable, optional
        Applied to projection coordinates before plotting.
    term_ranker : TermRanker class, optional
        Forwarded to the term explorer and to the category-explorer helper.
    use_metadata : bool, optional
        If True, project with metadata, compact and score on non-text
        features, and force full documents in the term plot.
    category_projector : optional
        Used to compute ``category_projection`` when none is supplied.
        NOTE(review): the default instance is created once at definition
        time and shared by all calls.
    category_projection : optional
        Precomputed projection; computed from ``category_projector`` if None.
    topic_model_term_lists : optional
        If not None, injected into the term explorer as metadata term lists.
    topic_model_preview_size : int, optional
        Forwarded to the term scatterplot structure.
    metadata_descriptions : optional
        If not None, injected into the term explorer.
    initial_category : str, optional
        Category initially selected; defaults to the corpus's first category.
    x_dim, y_dim : int, optional
        Projection dimensions, forwarded to the projector and the pair plot.
    show_halo : bool, optional
        Forwarded to ``PairPlotFromScatterplotStructure``.
    num_terms_in_halo : int, optional
        Forwarded as ``num_terms`` to ``PairPlotFromScatterplotStructure``.
    category_color_func : str, optional
        Javascript colour function for the category plot.
    protocol : str, optional
        Forwarded to ``PairPlotFromScatterplotStructure``.
    d3_url_struct : D3URLs, optional
        Forwarded to ``PairPlotFromScatterplotStructure``.
        NOTE(review): the default instance is created once at definition
        time and shared by all calls.
    category_focused : bool, optional
        Forwarded to ``_get_term_plot_change_js_func``.
    verbose : bool, optional
        If True, print how many terms are hidden and shown; also forwarded
        to the explorers.
    use_full_doc : bool, optional
        The term plot uses full documents if this or ``use_metadata`` is
        true.
    default_to_term_comparison : bool, optional
        Selects the term-plot mode described above.
    category_x_label, category_y_label : str, optional
        Axis labels of the category plot.
    category_show_axes_and_cross_hairs : bool, optional
        Forwarded to the category scatterplot structure.
    highlight_selected_category : bool, optional
        Forwarded to both scatterplot structures.
    term_x_label, term_y_label : str, optional
        Axis labels of the term plot; read only when
        ``default_to_term_comparison`` is False (``None`` becomes ``''``).
    wordfish_style : bool, optional
        Forwarded to ``_get_term_plot_change_js_func``.
    **kwargs
        Forwarded to the term explorer's ``to_dict`` in term-comparison
        mode only.

    Returns
    -------
    str
        Result of ``PairPlotFromScatterplotStructure(...).to_html()``.
    '''
    if category_projection is None:
        if use_metadata:
            category_projection = category_projector.project_with_metadata(
                corpus, x_dim=x_dim, y_dim=y_dim)
        else:
            category_projection = category_projector.project(
                corpus, x_dim=x_dim, y_dim=y_dim)

    if initial_category is None:
        initial_category = corpus.get_categories()[0]

    # --- category plot -----------------------------------------------------
    category_scatter_chart_explorer = _get_category_scatter_chart_explorer(
        category_projection, scaler, term_ranker, verbose)
    category_scatter_chart_data = category_scatter_chart_explorer.to_dict(
        category=initial_category,
        max_docs_per_category=0,
    )

    category_tooltip_func = '(function(d) {return d.term})'

    initial_category_idx = corpus.get_categories().index(initial_category)
    term_plot_change_func = _get_term_plot_change_js_func(
        wordfish_style, category_focused, initial_category_idx)

    category_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(category_scatter_chart_data),
        width_in_pixels=category_width_in_pixels,
        height_in_pixels=category_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=True,
        show_characteristic=False,
        x_label=category_x_label,
        y_label=category_y_label,
        show_axes_and_cross_hairs=category_show_axes_and_cross_hairs,
        full_data='getCategoryDataAndInfo()',
        show_top_terms=False,
        get_tooltip_content=category_tooltip_func,
        color_func=category_color_func,
        show_axes=False,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        unified_context=True,
        show_category_headings=False,
        show_cross_axes=True,
        div_name='cat-plot',
        alternative_term_func=term_plot_change_func,
        highlight_selected_category=highlight_selected_category)

    # --- term plot ---------------------------------------------------------
    # Terms that do not survive compaction stay in the data but are hidden.
    compacted_corpus = AssociationCompactor(
        terms_to_show, use_non_text_features=use_metadata).compact(corpus)
    terms_to_hide = set(corpus.get_terms()) - set(compacted_corpus.get_terms())
    if verbose:
        print('num terms to hide', len(terms_to_hide))
        print('num terms to show', compacted_corpus.get_num_terms())

    term_scatter_chart_explorer = ScatterChartExplorer(
        category_projection.get_corpus(),
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        term_ranker=term_ranker,
        use_non_text_features=use_metadata,
        score_transform=stretch_0_to_1,
        verbose=verbose).hide_terms(terms_to_hide)

    if default_to_term_comparison:
        if topic_model_term_lists is not None:
            term_scatter_chart_explorer.inject_metadata_term_lists(
                topic_model_term_lists)
        if metadata_descriptions is not None:
            term_scatter_chart_explorer.inject_metadata_descriptions(
                metadata_descriptions)

        if use_metadata:
            tdf = corpus.get_metadata_freq_df('')
        else:
            tdf = corpus.get_term_freq_df('')
        other_categories = [c for c in corpus.get_categories()
                            if c != initial_category]
        scores = RankDifference().get_scores(
            tdf[initial_category],
            tdf[other_categories].sum(axis=1))

        term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
            category=initial_category,
            scores=scores,
            include_term_category_counts=True,
            transform=dense_rank,
            **kwargs)
        # Plain strings: a trailing comma here would silently turn each
        # label into a one-element tuple.
        y_label = initial_category
        x_label = 'Not ' + initial_category
        color_func = None
        show_top_terms = True
        show_axes = False
    else:
        term_projection = category_projection.get_term_projection()
        original_x = term_projection['x']
        original_y = term_projection['y']
        x_coords = scaler(term_projection['x'])
        y_coords = scaler(term_projection['y'])
        x_label = term_x_label if term_x_label is not None else ''
        y_label = term_y_label if term_y_label is not None else ''
        show_axes = True
        term_scatter_chart_explorer.inject_coordinates(x_coords,
                                                       y_coords,
                                                       original_x=original_x,
                                                       original_y=original_y)
        if topic_model_term_lists is not None:
            term_scatter_chart_explorer.inject_metadata_term_lists(
                topic_model_term_lists)
        if metadata_descriptions is not None:
            term_scatter_chart_explorer.inject_metadata_descriptions(
                metadata_descriptions)
        term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
            category=initial_category,
            category_name=initial_category,
            include_term_category_counts=True,
            # transform=dense_rank,
        )
        color_func = '(function(x) {return "#5555FF"})'
        show_top_terms = False

    term_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(term_scatter_chart_data),
        width_in_pixels=term_width_in_pixels,
        height_in_pixels=term_height_in_pixels,
        use_full_doc=use_metadata or use_full_doc,
        asian_mode=asian_mode,
        use_non_text_features=use_metadata,
        show_characteristic=False,
        x_label=x_label,
        y_label=y_label,
        full_data='getTermDataAndInfo()',
        show_top_terms=show_top_terms,
        get_tooltip_content=None,
        color_func=color_func,
        # horizontal_line_y_position=0,
        # vertical_line_x_position=0,
        show_axes=show_axes,
        topic_model_preview_size=topic_model_preview_size,
        show_category_headings=False,
        div_name='d3-div-1',
        unified_context=True,
        highlight_selected_category=highlight_selected_category)

    return PairPlotFromScatterplotStructure(category_scatterplot_structure,
                                            term_scatterplot_structure,
                                            category_projection,
                                            category_width_in_pixels,
                                            category_height_in_pixels,
                                            num_terms=num_terms_in_halo,
                                            show_halo=show_halo,
                                            d3_url_struct=d3_url_struct,
                                            x_dim=x_dim,
                                            y_dim=y_dim,
                                            protocol=protocol).to_html()
def produce_scattertext_explorer(corpus,
                                 category,
                                 category_name,
                                 not_category_name,
                                 protocol='https',
                                 pmi_filter_thresold=2,
                                 minimum_term_frequency=3,
                                 minimum_not_category_term_frequency=0,
                                 max_terms=None,
                                 filter_unigrams=False,
                                 height_in_pixels=None,
                                 width_in_pixels=None,
                                 max_snippets=None,
                                 max_docs_per_category=None,
                                 metadata=None,
                                 scores=None,
                                 singleScoreMode=False,
                                 sort_by_dist=True,
                                 reverse_sort_scores_for_not_category=True,
                                 use_full_doc=False,
                                 transform=percentile_alphabetical,
                                 jitter=0,
                                 grey_zero_scores=False,
                                 term_ranker=None,
                                 chinese_mode=False,
                                 use_non_text_features=False,
                                 show_characteristic=True,
                                 word_vec_use_p_vals=False,
                                 max_p_val=0.05,
                                 p_value_colors=False,
                                 term_significance=None,
                                 save_svg_button=False):
    '''Returns html code of visualization.

    Parameters
    ----------
    corpus : Corpus
        Corpus to use.
    category : str
        Name of category column as it appears in original data frame.
    category_name : str
        Name of category to use.  E.g., "5-star reviews."
    not_category_name : str
        Name of everything that isn't in category.  E.g., "Below 5-star reviews".
    protocol : str, optional
        Protocol to use.  Either http or https.  Default is https.
    pmi_filter_thresold : int, optional
        Forwarded to ScatterChartExplorer as pmi_threshold_coefficient.
        Default is 2.
    minimum_term_frequency : int, optional
        Minimum number of times word needs to appear to make it into visualization.
    minimum_not_category_term_frequency : int, optional
        If an n-gram does not occur in the category, minimum times it
        must been seen to be included. Default is 0.
    max_terms : int, optional
        Maximum number of terms to include in visualization.
    filter_unigrams : bool, optional
        Default False, do we filter out unigrams that only occur in one bigram
    width_in_pixels : int, optional
        Width of viz in pixels, if None, default to JS's choice
    height_in_pixels : int, optional
        Height of viz in pixels, if None, default to JS's choice
    max_snippets : int, optional
        Maximum number of snippets to show when term is clicked.  If None, all are shown.
    max_docs_per_category: int, optional
        Maximum number of documents to store per category.  If None, by default, all are stored.
    metadata : list, optional
        list of meta data strings that will be included for each document
    scores : np.array, optional
        Array of term scores or None.
    singleScoreMode : bool, optional
        Label terms based on score vs distance from corner.  Good for topic scores. Show only one color.
    sort_by_dist: bool, optional
        Label terms based distance from corner. True by default.  Negated by singleScoreMode.
    reverse_sort_scores_for_not_category: bool, optional
        If using a custom score, score the not-category class by
        lowest-score-as-most-predictive. Turn this off for word vector
        or topic similarity. Default True.
    use_full_doc : bool, optional
        Use the full document in snippets.  False by default.
    transform : function, optional
        not recommended for editing.  change the way terms are ranked.
        default is percentile_alphabetical
    jitter : float, optional
        percentage of axis to jitter each point.  default is 0.
    grey_zero_scores : bool, optional
        If True, color points with zero-scores a light shade of grey.  False by default.
    term_ranker : TermRanker, optional
        TermRanker class for determining term frequency ranks.
        Defaults to termranking.AbsoluteFrequencyRanker.
    chinese_mode : bool, optional
        Use a special Javascript regular expression that's specific to chinese
    use_non_text_features : bool, optional
        Show non-bag-of-words features (e.g., Empath) instead of text.  False by default.
    show_characteristic: bool, default True
        Show characteristic terms on the far left-hand side of the visualization
    word_vec_use_p_vals: bool, default False
        Sort by harmonic mean of score and distance.
    max_p_val : float, default 0.05
        If word_vec_use_p_vals, the minimum p val to use.
    p_value_colors : bool, default False
        Color points differently if p val is above 1-max_p_val, below max_p_val, or
        in between.
    term_significance : TermSignificance instance or None
        Way of getting significance scores.  If None, p values will not be added.
    save_svg_button : bool, default False
        Add a save as SVG button to the page.

    Returns
    -------
    str
        html of visualization
    '''
    # A single purple colour scale is used whenever points are coloured by
    # one score rather than by category contrast.
    use_single_scale = singleScoreMode or word_vec_use_p_vals
    color = 'd3.interpolatePurples' if use_single_scale else None

    # singleScoreMode always disables distance-based labelling.
    sort_by_dist = not (singleScoreMode or not sort_by_dist)

    if term_ranker is None:
        term_ranker = termranking.AbsoluteFrequencyRanker

    explorer_options = dict(
        minimum_term_frequency=minimum_term_frequency,
        minimum_not_category_term_frequency=minimum_not_category_term_frequency,
        pmi_threshold_coefficient=pmi_filter_thresold,
        filter_unigrams=filter_unigrams,
        jitter=jitter,
        max_terms=max_terms,
        term_ranker=term_ranker,
        use_non_text_features=use_non_text_features,
        term_significance=term_significance,
    )
    scatter_chart_explorer = ScatterChartExplorer(corpus, **explorer_options)

    scatter_chart_data = scatter_chart_explorer.to_dict(
        category=category,
        category_name=category_name,
        not_category_name=not_category_name,
        transform=transform,
        scores=scores,
        max_docs_per_category=max_docs_per_category,
        metadata=metadata)

    assembly = HTMLVisualizationAssembly(
        VizDataAdapter(scatter_chart_data),
        width_in_pixels=width_in_pixels,
        height_in_pixels=height_in_pixels,
        max_snippets=max_snippets,
        color=color,
        grey_zero_scores=grey_zero_scores,
        sort_by_dist=sort_by_dist,
        reverse_sort_scores_for_not_category=reverse_sort_scores_for_not_category,
        use_full_doc=use_full_doc,
        chinese_mode=chinese_mode,
        use_non_text_features=use_non_text_features,
        show_characteristic=show_characteristic,
        word_vec_use_p_vals=word_vec_use_p_vals,
        max_p_val=max_p_val,
        save_svg_button=save_svg_button,
        p_value_colors=p_value_colors)
    return assembly.to_html(protocol=protocol)
def produce_scattertext_explorer(corpus,
                                 category,
                                 category_name,
                                 not_category_name,
                                 protocol='https',
                                 pmi_threshold_coefficient=6,
                                 minimum_term_frequency=3,
                                 minimum_not_category_term_frequency=0,
                                 max_terms=None,
                                 filter_unigrams=False,
                                 height_in_pixels=None,
                                 width_in_pixels=None,
                                 max_snippets=None,
                                 max_docs_per_category=None,
                                 metadata=None,
                                 scores=None,
                                 x_coords=None,
                                 y_coords=None,
                                 singleScoreMode=False,
                                 sort_by_dist=True,
                                 reverse_sort_scores_for_not_category=True,
                                 use_full_doc=False,
                                 transform=percentile_alphabetical,
                                 jitter=0,
                                 grey_zero_scores=False,
                                 term_ranker=None,
                                 asian_mode=False,
                                 use_non_text_features=False,
                                 show_characteristic=True,
                                 word_vec_use_p_vals=False,
                                 max_p_val=0.1,
                                 p_value_colors=False,
                                 term_significance=None,
                                 save_svg_button=False,
                                 x_label=None,
                                 y_label=None,
                                 d3_url=None,
                                 d3_scale_chromatic_url=None,
                                 pmi_filter_thresold=None,
                                 alternative_text_field=None):
    '''Returns html code of visualization.

    Parameters
    ----------
    corpus : Corpus
        Corpus to use.
    category : str
        Name of category column as it appears in original data frame.
    category_name : str
        Name of category to use.  E.g., "5-star reviews."
    not_category_name : str
        Name of everything that isn't in category.  E.g., "Below 5-star reviews".
    protocol : str, optional
        Protocol to use.  Either http or https.  Default is https.
    pmi_threshold_coefficient : int, optional
        Filter out bigrams with a PMI of < 2 * pmi_threshold_coefficient.
        Default is 6.
    minimum_term_frequency : int, optional
        Minimum number of times word needs to appear to make it into
        visualization.
    minimum_not_category_term_frequency : int, optional
        If an n-gram does not occur in the category, minimum times it
        must be seen to be included.  Default is 0.
    max_terms : int, optional
        Maximum number of terms to include in visualization.
    filter_unigrams : bool, optional
        Default False, do we filter out unigrams that only occur in one bigram.
    height_in_pixels : int, optional
        Height of viz in pixels, if None, default to JS's choice.
    width_in_pixels : int, optional
        Width of viz in pixels, if None, default to JS's choice.
    max_snippets : int, optional
        Maximum number of snippets to show when term is clicked.
        If None, all are shown.
    max_docs_per_category : int, optional
        Maximum number of documents to store per category.
        If None, by default, all are stored.
    metadata : list, optional
        List of meta data strings that will be included for each document.
    scores : np.array, optional
        Array of term scores or None.
    x_coords : np.array, optional
        Array of term x-axis positions or None.  Must be in [0,1].
        If present, y_coords must also be present.
    y_coords : np.array, optional
        Array of term y-axis positions or None.  Must be in [0,1].
        If present, x_coords must also be present.
    singleScoreMode : bool, optional
        Label terms based on score vs distance from corner.
        Good for topic scores.  Show only one color.
    sort_by_dist : bool, optional
        Label terms based on distance from corner.  True by default.
        Negated by singleScoreMode.
    reverse_sort_scores_for_not_category : bool, optional
        If using a custom score, score the not-category class by
        lowest-score-as-most-predictive.  Turn this off for word vector
        or topic similarity.  Default True.
    use_full_doc : bool, optional
        Use the full document in snippets.  False by default.
    transform : function, optional
        Not recommended for editing.  Changes the way terms are ranked.
        Default is percentile_alphabetical.
    jitter : float, optional
        Percentage of axis to jitter each point.  Default is 0.
    grey_zero_scores : bool, optional
        If True, color points with zero-scores a light shade of grey.
        False by default.
    term_ranker : TermRanker, optional
        TermRanker class for determining term frequency ranks.
        If None, termranking.AbsoluteFrequencyRanker is used.
    asian_mode : bool, optional
        Use a special Javascript regular expression that's specific to
        chinese or japanese.
    use_non_text_features : bool, optional
        Show non-bag-of-words features (e.g., Empath) instead of text.
        False by default.
    show_characteristic : bool, default True
        Show characteristic terms on the far left-hand side of the
        visualization.
    word_vec_use_p_vals : bool, default False
        Sort by harmonic mean of score and distance.
    max_p_val : float, default 0.1
        If word_vec_use_p_vals, the minimum p val to use.
    p_value_colors : bool, default False
        Color points differently if p val is above 1-max_p_val,
        below max_p_val, or in between.
    term_significance : TermSignificance instance or None
        Way of getting significance scores.  If None, p values will not
        be added.
    save_svg_button : bool, default False
        Add a save as SVG button to the page.
    x_label : str, default None
        Custom x-axis label.
    y_label : str, default None
        Custom y-axis label.
    d3_url : str, None by default
        URL (or path) of d3, to be inserted into <script src="..."/>.
        Overrides `protocol`.  By default, this is `DEFAULT_D3_URL`
        declared in `HTMLVisualizationAssembly`.
    d3_scale_chromatic_url : str, None by default
        URL of d3 scale chromatic, to be inserted into <script src="..."/>.
        Overrides `protocol`.  By default, this is
        `DEFAULT_D3_SCALE_CHROMATIC` declared in `HTMLVisualizationAssembly`.
    pmi_filter_thresold : (DEPRECATED) int, None by default
        DEPRECATED.  Use pmi_threshold_coefficient instead.  When given,
        it overrides pmi_threshold_coefficient and a DeprecationWarning
        is issued.
    alternative_text_field : str or None, optional
        Field in from dataframe used to make corpus to display in place
        of parsed text.  Only can be used if corpus is a ParsedCorpus
        instance.

    Returns
    -------
    str, html of visualization

    Raises
    ------
    ValueError
        If exactly one of x_coords and y_coords is supplied.
    '''
    # Validate the coordinate pair before doing any work on the corpus,
    # so a bad call fails immediately.
    if (x_coords is None) != (y_coords is None):
        raise ValueError(
            "Both x_coords and y_coords need to be passed or both left blank")

    # Score-driven modes use a single purple color scale.
    color = None
    if singleScoreMode or word_vec_use_p_vals:
        color = 'd3.interpolatePurples'

    # singleScoreMode always disables distance-based label sorting.
    sort_by_dist = bool(sort_by_dist) and not singleScoreMode

    if term_ranker is None:
        term_ranker = termranking.AbsoluteFrequencyRanker

    if pmi_filter_thresold is not None:
        # The misspelled legacy argument still wins when supplied.
        pmi_threshold_coefficient = pmi_filter_thresold
        # stacklevel=2 attributes the warning to the caller's line rather
        # than to this module.
        warnings.warn(
            "The argument name 'pmi_filter_thresold' has been deprecated. Use 'pmi_threshold_coefficient' in its place",
            DeprecationWarning,
            stacklevel=2)

    scatter_chart_explorer = ScatterChartExplorer(
        corpus,
        minimum_term_frequency=minimum_term_frequency,
        minimum_not_category_term_frequency=minimum_not_category_term_frequency,
        pmi_threshold_coefficient=pmi_threshold_coefficient,
        filter_unigrams=filter_unigrams,
        jitter=jitter,
        max_terms=max_terms,
        term_ranker=term_ranker,
        use_non_text_features=use_non_text_features,
        term_significance=term_significance)

    if x_coords is not None:
        scatter_chart_explorer.inject_coordinates(x_coords, y_coords)

    scatter_chart_data = scatter_chart_explorer.to_dict(
        category=category,
        category_name=category_name,
        not_category_name=not_category_name,
        transform=transform,
        scores=scores,
        max_docs_per_category=max_docs_per_category,
        metadata=metadata,
        alternative_text_field=alternative_text_field)

    html_assembly = HTMLVisualizationAssembly(
        VizDataAdapter(scatter_chart_data),
        width_in_pixels=width_in_pixels,
        height_in_pixels=height_in_pixels,
        max_snippets=max_snippets,
        color=color,
        grey_zero_scores=grey_zero_scores,
        sort_by_dist=sort_by_dist,
        reverse_sort_scores_for_not_category=reverse_sort_scores_for_not_category,
        use_full_doc=use_full_doc,
        asian_mode=asian_mode,
        use_non_text_features=use_non_text_features,
        show_characteristic=show_characteristic,
        word_vec_use_p_vals=word_vec_use_p_vals,
        max_p_val=max_p_val,
        save_svg_button=save_svg_button,
        p_value_colors=p_value_colors,
        x_label=x_label,
        y_label=y_label)
    return html_assembly.to_html(protocol=protocol,
                                 d3_url=d3_url,
                                 d3_scale_chromatic_url=d3_scale_chromatic_url)


def produce_pairplot(corpus,
                     asian_mode=False,
                     category_width_in_pixels=500,
                     category_height_in_pixels=700,
                     term_width_in_pixels=500,
                     term_height_in_pixels=700,
                     terms_to_show=3000,
                     scaler=scale_neg_1_to_1_with_zero_mean,
                     term_ranker=AbsoluteFrequencyRanker,
                     use_metadata=False,
                     category_projector=CategoryProjector(),
                     category_projection=None,
                     topic_model_term_lists=None,
                     topic_model_preview_size=10,
                     metadata_descriptions=None,
                     initial_category=None,
                     x_dim=0,
                     y_dim=1,
                     show_halo=True,
                     num_terms_in_halo=5,
                     category_color_func='(function(x) {return "#5555FF"})',
                     protocol='https',
                     d3_url_struct=D3URLs(),
                     **kwargs):
    '''Builds a paired category plot and term plot and renders them together.

    A category scatterplot is built from a projection of the corpus's
    categories, and a term scatterplot is built from the corpus itself,
    scored for `initial_category` against all other categories.  Both
    structures are handed to `PairPlotFromScatterplotStructure`.

    Parameters
    ----------
    corpus : Corpus
        Corpus to visualize.
    asian_mode : bool, default False
        Passed to both scatterplot structures.
    category_width_in_pixels : int, default 500
        Width of the category plot.
    category_height_in_pixels : int, default 700
        Height of the category plot.
    term_width_in_pixels : int, default 500
        Width of the term plot.
    term_height_in_pixels : int, default 700
        Height of the term plot.
    terms_to_show : int, default 3000
        Passed to `AssociationCompactor`; corpus terms missing from the
        compacted corpus are hidden in the term plot.
    scaler : function, default scale_neg_1_to_1_with_zero_mean
        Applied to the projection's 'x' and 'y' columns to obtain the
        plotted category coordinates.
    term_ranker : TermRanker, default AbsoluteFrequencyRanker
        Passed to both scatter chart explorers.
    use_metadata : bool, default False
        If True, project with metadata, plot non-text features in the term
        plot, score from the metadata frequency table and show full
        documents.
    category_projector : CategoryProjector
        Used to compute the projection when `category_projection` is None.
        Note the default instance is created once, when the function is
        defined.
    category_projection : optional
        Precomputed projection.  If None, it is computed from
        `category_projector`.
    topic_model_term_lists : optional
        If not None, injected into the term explorer via
        `inject_metadata_term_lists`.
    topic_model_preview_size : int, default 10
        Passed to the term scatterplot structure.
    metadata_descriptions : optional
        If not None, injected into the term explorer via
        `inject_metadata_descriptions`.
    initial_category : str, optional
        Category scored against the rest.  Defaults to the first entry of
        `corpus.get_categories()`.
    x_dim : int, default 0
        Projection dimension used for the x-axis.
    y_dim : int, default 1
        Projection dimension used for the y-axis.
    show_halo : bool, default True
        Passed to `PairPlotFromScatterplotStructure`.
    num_terms_in_halo : int, default 5
        Passed to `PairPlotFromScatterplotStructure` as `num_terms`.
    category_color_func : str
        Javascript function source used as the category plot's color
        function.
    protocol : str, default 'https'
        Passed to `PairPlotFromScatterplotStructure`.
    d3_url_struct : D3URLs
        Passed to `PairPlotFromScatterplotStructure`.  Note the default
        instance is created once, when the function is defined.
    **kwargs
        Forwarded to the term explorer's `to_dict` call.

    Returns
    -------
    Result of `PairPlotFromScatterplotStructure(...).to_html()`
    (presumably the html of the visualization, as a str).
    '''
    if category_projection is None:
        if use_metadata:
            category_projection = category_projector.project_with_metadata(
                corpus, x_dim=x_dim, y_dim=y_dim)
        else:
            category_projection = category_projector.project(
                corpus, x_dim=x_dim, y_dim=y_dim)

    if initial_category is None:
        initial_category = corpus.get_categories()[0]

    # --- Category plot: one point per category, placed by the projection.
    category_scatter_chart_explorer = ScatterChartExplorer(
        category_projection.category_corpus,
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        filter_unigrams=False,
        jitter=0,
        max_terms=None,
        term_ranker=term_ranker,
        use_non_text_features=True,
        term_significance=None,
        terms_to_include=None)
    proj_df = category_projection.get_pandas_projection()
    # Scaled values position the points; the raw projection values are
    # passed alongside as the original coordinates.
    category_scatter_chart_explorer.inject_coordinates(
        x_coords=scaler(proj_df['x']),
        y_coords=scaler(proj_df['y']),
        original_x=proj_df['x'],
        original_y=proj_df['y'])
    category_scatter_chart_data = category_scatter_chart_explorer.to_dict(
        category=initial_category,
        max_docs_per_category=0,
    )
    category_tooltip_func = '(function(d) {return d.term})'
    category_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(category_scatter_chart_data),
        width_in_pixels=category_width_in_pixels,
        height_in_pixels=category_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=True,
        show_top_terms=False,
        show_characteristic=False,
        get_tooltip_content=category_tooltip_func,
        color_func=category_color_func,
        show_axes=False,
        unified_context=True,
        show_category_headings=False,
        show_cross_axes=True,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        y_label='',
        x_label='',
        full_data='getCategoryDataAndInfo()',
        alternative_term_func='(function (termInfo) {termPlotInterface.drawCategoryAssociation(termInfo.i); return false;})',
        div_name='cat-plot'
    )

    # --- Term plot: hide every term the compactor dropped.
    compacted_corpus = AssociationCompactor(terms_to_show).compact(corpus)
    terms_to_hide = set(corpus.get_terms()) - set(compacted_corpus.get_terms())
    print('num terms to hide', len(terms_to_hide))
    print('num terms to show', compacted_corpus.get_num_terms())

    term_scatter_chart_explorer = ScatterChartExplorer(
        corpus,
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        term_ranker=term_ranker,
        use_non_text_features=use_metadata,
        score_transform=stretch_0_to_1,
    ).hide_terms(terms_to_hide)
    if topic_model_term_lists is not None:
        term_scatter_chart_explorer.inject_metadata_term_lists(topic_model_term_lists)
    if metadata_descriptions is not None:
        term_scatter_chart_explorer.inject_metadata_descriptions(metadata_descriptions)

    # Frequency table with an empty label suffix, so columns are indexed
    # directly by category name below.
    if use_metadata:
        tdf = corpus.get_metadata_freq_df('')
    else:
        tdf = corpus.get_term_freq_df('')
    other_categories = [c for c in corpus.get_categories() if c != initial_category]
    scores = RankDifference().get_scores(
        tdf[initial_category],
        tdf[other_categories].sum(axis=1)
    )

    term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
        category=initial_category,
        scores=scores,
        include_term_category_counts=True,
        transform=dense_rank,
        **kwargs
    )
    term_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(term_scatter_chart_data),
        width_in_pixels=term_width_in_pixels,
        height_in_pixels=term_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=use_metadata,
        show_top_terms=True,
        show_characteristic=False,
        get_tooltip_content=None,
        show_category_headings=False,
        use_full_doc=use_metadata,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        topic_model_preview_size=topic_model_preview_size,
        y_label=initial_category,
        x_label='Not ' + initial_category,
        full_data='getTermDataAndInfo()',
        div_name='d3-div-1',
    )

    return PairPlotFromScatterplotStructure(
        category_scatterplot_structure,
        term_scatterplot_structure,
        category_projection,
        category_width_in_pixels,
        category_height_in_pixels,
        num_terms=num_terms_in_halo,
        show_halo=show_halo,
        d3_url_struct=d3_url_struct,
        x_dim=x_dim,
        y_dim=y_dim,
        protocol=protocol
    ).to_html()