コード例 #1
0
    def test_alternative_text(self):
        """Check that document texts change when an alternative text field is used.

        With ``alternative_text_field='alt'`` the first exported text must
        equal its own upper-cased form; without it, it must not.
        """
        alt_corpus = build_hamlet_jz_corpus_with_alt_text()

        # Export using the alternative text column.
        explorer_with_alt = ScatterChartExplorer(alt_corpus,
                                                 minimum_term_frequency=0)
        with_alt = explorer_with_alt.to_dict('hamlet',
                                             alternative_text_field='alt')
        first_alt_text = with_alt['docs']['texts'][0]
        self.assertEqual(first_alt_text, first_alt_text.upper())

        # Export using the default (parsed) text.
        default_explorer = ScatterChartExplorer(alt_corpus,
                                                minimum_term_frequency=0)
        without_alt = default_explorer.to_dict('hamlet')
        first_default_text = without_alt['docs']['texts'][0]
        self.assertNotEqual(first_default_text, first_default_text.upper())
コード例 #2
0
 def test_multi_categories(self):
     """Compare a category-vs-rest export with a category-vs-'swift' export.

     The not-category internal names must differ between the two exports,
     while document labels and categories must be identical.
     """
     corpus = get_test_corpus()
     versus_all = ScatterChartExplorer(
         corpus=corpus, minimum_term_frequency=0).to_dict('hamlet')
     versus_swift = ScatterChartExplorer(
         corpus=corpus, minimum_term_frequency=0).to_dict(
             'hamlet', not_categories=['swift'])

     all_not_names = set(versus_all['info']['not_category_internal_names'])
     swift_not_names = set(
         versus_swift['info']['not_category_internal_names'])
     self.assertNotEqual(all_not_names, swift_not_names)

     # Restricting the not-categories must leave the document section alone.
     for docs_key in ('labels', 'categories'):
         self.assertEqual(list(versus_all['docs'][docs_key]),
                          list(versus_swift['docs'][docs_key]))
コード例 #3
0
 def test_metadata(self):
     """Check that metadata passed to ``to_dict`` appears in the docs section.

     The whole ``docs`` dict (labels, categories, meta and texts) is
     compared against a literal expectation.
     """
     corpus = build_hamlet_jz_corpus()
     meta = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight']
     explorer = ScatterChartExplorer(corpus, minimum_term_frequency=0)
     j = explorer.to_dict('hamlet', metadata=meta)

     # Do not truncate the diff if the large dict comparison fails.
     self.maxDiff = None
     # Convert labels to a plain list so they compare equal to the literal.
     j['docs']['labels'] = list(j['docs']['labels'])

     expected_texts = [
         "what art thou that usurp'st this time of night,",
         'together with that fair and warlike form',
         'in which the majesty of buried denmark',
         'did sometimes march? by heaven i charge thee, speak!',
         'halt! who goes there?',
         'it is i sire tone from brooklyn.',
         'well, speak up man what is it?',
         'news from the east sire! the best of both worlds has returned!',
     ]
     expected_docs = {
         'labels': [0, 0, 0, 0, 1, 1, 1, 1],
         'categories': ['hamlet', 'jay-z/r. kelly'],
         'meta': [
             'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight'
         ],
         'texts': expected_texts,
     }
     self.assertEqual(j['docs'], expected_docs)
コード例 #4
0
    def test_to_dict(self):
        """Check the overall structure of ``ScatterChartExplorer.to_dict``.

        Verifies the top-level keys, the keys of the ``info`` section, the
        document labels and texts, and that the data entry for the term
        'art' exposes the expected set of fields.
        """
        # Seed both RNGs before building the export.
        np.random.seed(0)
        random.seed(0)
        corpus = build_hamlet_jz_corpus()
        explorer = ScatterChartExplorer(corpus, minimum_term_frequency=0)
        j = explorer.to_dict('hamlet')

        self.assertEqual(set(j.keys()), {'info', 'data', 'docs'})

        expected_info_keys = {
            'not_category_name', 'category_name', 'category_terms',
            'not_category_internal_names', 'not_category_terms',
            'category_internal_name', 'categories',
            'neutral_category_name', 'extra_category_name',
            'neutral_category_internal_names',
            'extra_category_internal_names',
        }
        self.assertEqual(set(j['info'].keys()), expected_info_keys)

        self.assertEqual(list(j['docs']['labels']), [0, 0, 0, 0, 1, 1, 1, 1])
        expected_texts = [
            "what art thou that usurp'st this time of night,",
            'together with that fair and warlike form',
            'in which the majesty of buried denmark',
            'did sometimes march? by heaven i charge thee, speak!',
            'halt! who goes there?',
            'it is i sire tone from brooklyn.',
            'well, speak up man what is it?',
            'news from the east sire! the best of both worlds has returned!',
        ]
        self.assertEqual(list(j['docs']['texts']), expected_texts)

        expected = {
            'y': 0.5,
            'ncat': 0,
            'ncat25k': 0,
            'bg': 5,
            'cat': 1,
            's': 0.5,
            'term': 'art',
            'os': 0.5192,
            'extra': 0,
            'extra25k': 0,
            'cat25k': 758,
            'x': 0.06,
            'neut': 0,
            'neut25k': 0,
            'ox': 5,
            'oy': 3,
        }

        art_rows = [row for row in j['data'] if row['term'] == 'art']
        actual = art_rows[0]
        # Only the field names and the term itself are verified; a per-field
        # almost-equal comparison of the numeric values used to sit here but
        # was disabled.
        self.assertEqual(set(expected.keys()), set(actual.keys()))
        self.assertEqual(expected['term'], actual['term'])
        self.assertEqual(j['docs'].keys(), {'texts', 'labels', 'categories'})
コード例 #5
0
 def test_include_term_category_counts(self):
     """Check the ``termCounts`` section added by include_term_category_counts.

     There must be one counts mapping per corpus category, every value pair
     must have its first element >= its second (named ``freq`` and ``docs``
     here; presumably term frequency and document count), and the union of
     keys across categories must cover every term in the corpus.
     """
     unigram_corpus = build_hamlet_jz_corpus().get_unigram_corpus()
     explorer = ScatterChartExplorer(unigram_corpus,
                                     minimum_term_frequency=0)
     j = explorer.to_dict('hamlet', include_term_category_counts=True)

     self.assertEqual(set(j.keys()),
                      {'info', 'data', 'docs', 'termCounts'})
     self.assertEqual(len(j['termCounts']),
                      unigram_corpus.get_num_categories())

     seen_term_indices = set()
     for per_category_counts in j['termCounts']:
         seen_term_indices.update(per_category_counts.keys())
         self.assertTrue(
             all(freq >= docs
                 for freq, docs in per_category_counts.values()))
     self.assertEqual(len(seen_term_indices),
                      unigram_corpus.get_num_terms())
コード例 #6
0
 def test_hide_terms(self):
     """Check that ``hide_terms`` flags hidden terms in the exported data.

     Entries for hidden terms must carry ``display == False``; all other
     entries must not contain a ``display`` key at all.
     """
     corpus = build_hamlet_jz_corpus().get_unigram_corpus()
     terms_to_hide = ['thou', 'heaven']
     sc = (ScatterChartExplorer(
         corpus, minimum_term_frequency=0).hide_terms(terms_to_hide))
     # hide_terms must hand back the explorer so that calls can be chained.
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed in Python 3.12.
     self.assertEqual(type(sc), ScatterChartExplorer)
     j = sc.to_dict('hamlet', include_term_category_counts=True)

     hidden_entries = [t for t in j['data'] if t['term'] in terms_to_hide]
     visible_entries = [
         t for t in j['data'] if t['term'] not in terms_to_hide
     ]
     # Equality (not identity) is intentional: it keeps the original
     # comparison semantics for whatever falsy flag type is exported.
     self.assertTrue(
         all('display' in t and t['display'] == False  # noqa: E712
             for t in hidden_entries))
     self.assertTrue(all('display' not in t for t in visible_entries))
コード例 #7
0
def _get_category_scatter_chart_explorer(category_projection, scaler,
                                         term_ranker, verbose):
    """Build the explorer used for the category-level scatter plot.

    Parameters
    ----------
    category_projection
        Object providing ``get_corpus()`` and ``get_pandas_projection()``;
        the projection must expose ``'x'`` and ``'y'`` columns.
    scaler : callable
        Applied to the projected x and y columns to obtain plot coordinates.
    term_ranker
        Accepted for signature compatibility; it is currently NOT forwarded
        to ``ScatterChartExplorer``.
    verbose : bool
        Forwarded to ``ScatterChartExplorer``.

    Returns
    -------
    ScatterChartExplorer
        Explorer with scaled and original coordinates injected.
    """
    # All filtering is switched off so that nothing is dropped from the plot.
    explorer_settings = dict(
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        filter_unigrams=False,
        jitter=0,
        max_terms=None,
        use_non_text_features=True,
        term_significance=None,
        terms_to_include=None,
        verbose=verbose,
    )
    explorer = ScatterChartExplorer(category_projection.get_corpus(),
                                    **explorer_settings)

    projection = category_projection.get_pandas_projection()
    explorer.inject_coordinates(x_coords=scaler(projection['x']),
                                y_coords=scaler(projection['y']),
                                original_x=projection['x'],
                                original_y=projection['y'])
    return explorer
コード例 #8
0
ファイル: pairplot.py プロジェクト: xcgfth/scattertext
def produce_pairplot(corpus,
                     asian_mode=False,
                     category_width_in_pixels=500,
                     category_height_in_pixels=700,
                     term_width_in_pixels=500,
                     term_height_in_pixels=700,
                     terms_to_show=3000,
                     scaler=scale_neg_1_to_1_with_zero_mean,
                     term_ranker=AbsoluteFrequencyRanker,
                     use_metadata=False,
                     category_projector=CategoryProjector(),
                     category_projection=None,
                     topic_model_term_lists=None,
                     topic_model_preview_size=10,
                     metadata_descriptions=None,
                     initial_category=None,
                     x_dim=0,
                     y_dim=1,
                     show_halo=True,
                     num_terms_in_halo=5,
                     category_color_func='(function(x) {return "#5555FF"})',
                     protocol='https',
                     d3_url_struct=D3URLs(),
                     **kwargs):
    """Return the HTML of a linked category plot and term plot.

    A category scatter plot is built from a projection of the categories and
    a term scatter plot is built from ``corpus``; both are combined through
    ``PairPlotFromScatterplotStructure``.

    Parameters
    ----------
    corpus
        Corpus to visualize.
    asian_mode : bool
        Forwarded to both ``ScatterplotStructure`` instances.
    category_width_in_pixels, category_height_in_pixels : int
        Size of the category plot.
    term_width_in_pixels, term_height_in_pixels : int
        Size of the term plot.
    terms_to_show : int
        Passed to ``AssociationCompactor``; terms of ``corpus`` that the
        compacted corpus does not retain are hidden in the term plot.
    scaler : callable
        Applied to the projected x and y values of the categories.
    term_ranker
        Forwarded to both ``ScatterChartExplorer`` instances.
    use_metadata : bool
        If True, categories are projected with ``project_with_metadata`` and
        metadata frequencies are scored instead of term frequencies.
    category_projector
        Used to compute ``category_projection`` when none is supplied.
    category_projection
        Precomputed projection; computed from ``corpus`` when None.
    topic_model_term_lists
        If not None, injected via ``inject_metadata_term_lists``.
    topic_model_preview_size : int
        Forwarded to the term ``ScatterplotStructure``.
    metadata_descriptions
        If not None, injected via ``inject_metadata_descriptions``.
    initial_category : str
        Category selected initially; defaults to the first entry of
        ``corpus.get_categories()``.
    x_dim, y_dim : int
        Projection dimensions used for the two axes.
    show_halo : bool
        Forwarded to ``PairPlotFromScatterplotStructure``.
    num_terms_in_halo : int
        Forwarded as ``num_terms`` to ``PairPlotFromScatterplotStructure``.
    category_color_func : str
        Javascript function source used to color category points.
    protocol : str
        Forwarded to ``PairPlotFromScatterplotStructure``.
    d3_url_struct
        Forwarded to ``PairPlotFromScatterplotStructure``.
    **kwargs
        Forwarded to the term explorer's ``to_dict`` call.

    Returns
    -------
    Whatever ``PairPlotFromScatterplotStructure(...).to_html()`` returns
    (the HTML of the visualization).
    """
    if category_projection is None:
        if use_metadata:
            category_projection = category_projector.project_with_metadata(
                corpus, x_dim=x_dim, y_dim=y_dim)
        else:
            category_projection = category_projector.project(corpus,
                                                             x_dim=x_dim,
                                                             y_dim=y_dim)

    if initial_category is None:
        initial_category = corpus.get_categories()[0]

    # --- Category plot: every filter is disabled so nothing is dropped. ---
    category_scatter_chart_explorer = ScatterChartExplorer(
        category_projection.category_corpus,
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        filter_unigrams=False,
        jitter=0,
        max_terms=None,
        term_ranker=term_ranker,
        use_non_text_features=True,
        term_significance=None,
        terms_to_include=None)
    proj_df = category_projection.get_pandas_projection()
    category_scatter_chart_explorer.inject_coordinates(
        x_coords=scaler(proj_df['x']),
        y_coords=scaler(proj_df['y']),
        original_x=proj_df['x'],
        original_y=proj_df['y'])
    category_scatter_chart_data = category_scatter_chart_explorer.to_dict(
        category=initial_category,
        max_docs_per_category=0,
    )

    category_tooltip_func = '(function(d) {return d.term})'

    category_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(category_scatter_chart_data),
        width_in_pixels=category_width_in_pixels,
        height_in_pixels=category_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=True,
        show_top_terms=False,
        show_characteristic=False,
        get_tooltip_content=category_tooltip_func,
        color_func=category_color_func,
        show_axes=False,
        unified_context=True,
        show_category_headings=False,
        show_cross_axes=True,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        y_label='',
        x_label='',
        full_data='getCategoryDataAndInfo()',
        alternative_term_func=
        '(function (termInfo) {termPlotInterface.drawCategoryAssociation(termInfo.i); return false;})',
        div_name='cat-plot')

    # --- Term plot: hide every term the compactor did not retain. ---
    compacted_corpus = AssociationCompactor(terms_to_show).compact(corpus)
    terms_to_hide = set(corpus.get_terms()) - set(compacted_corpus.get_terms())
    print('num terms to hide', len(terms_to_hide))
    print('num terms to show', compacted_corpus.get_num_terms())

    term_scatter_chart_explorer = ScatterChartExplorer(
        corpus,
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        term_ranker=term_ranker,
        use_non_text_features=use_metadata,
        score_transform=stretch_0_to_1,
    ).hide_terms(terms_to_hide)

    if topic_model_term_lists is not None:
        term_scatter_chart_explorer.inject_metadata_term_lists(
            topic_model_term_lists)
    if metadata_descriptions is not None:
        term_scatter_chart_explorer.inject_metadata_descriptions(
            metadata_descriptions)

    if use_metadata:
        tdf = corpus.get_metadata_freq_df('')
    else:
        tdf = corpus.get_term_freq_df('')
    # Score the initial category against the summed counts of all others.
    other_categories = [
        c for c in corpus.get_categories() if c != initial_category
    ]
    scores = RankDifference().get_scores(
        tdf[initial_category],
        tdf[other_categories].sum(axis=1))

    term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
        category=initial_category,
        scores=scores,
        include_term_category_counts=True,
        transform=dense_rank,
        **kwargs)

    term_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(term_scatter_chart_data),
        width_in_pixels=term_width_in_pixels,
        height_in_pixels=term_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=use_metadata,
        show_top_terms=True,
        show_characteristic=False,
        get_tooltip_content=None,
        show_category_headings=False,
        use_full_doc=use_metadata,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        topic_model_preview_size=topic_model_preview_size,
        y_label=initial_category,
        x_label='Not ' + initial_category,
        full_data='getTermDataAndInfo()',
        div_name='d3-div-1',
    )

    return PairPlotFromScatterplotStructure(category_scatterplot_structure,
                                            term_scatterplot_structure,
                                            category_projection,
                                            category_width_in_pixels,
                                            category_height_in_pixels,
                                            num_terms=num_terms_in_halo,
                                            show_halo=show_halo,
                                            d3_url_struct=d3_url_struct,
                                            x_dim=x_dim,
                                            y_dim=y_dim,
                                            protocol=protocol).to_html()
コード例 #9
0
def produce_scattertext_explorer(
        corpus,
        category,
        category_name=None,
        not_category_name=None,
        protocol='https',
        pmi_threshold_coefficient=DEFAULT_MINIMUM_TERM_FREQUENCY,
        minimum_term_frequency=DEFAULT_PMI_THRESHOLD_COEFFICIENT,
        minimum_not_category_term_frequency=0,
        max_terms=None,
        filter_unigrams=False,
        height_in_pixels=None,
        width_in_pixels=None,
        max_snippets=None,
        max_docs_per_category=None,
        metadata=None,
        scores=None,
        x_coords=None,
        y_coords=None,
        original_x=None,
        original_y=None,
        rescale_x=None,
        rescale_y=None,
        singleScoreMode=False,
        sort_by_dist=True,
        reverse_sort_scores_for_not_category=True,
        use_full_doc=False,
        transform=percentile_alphabetical,
        jitter=0,
        gray_zero_scores=False,
        term_ranker=None,
        asian_mode=False,
        use_non_text_features=False,
        show_top_terms=True,
        show_characteristic=True,
        word_vec_use_p_vals=False,
        max_p_val=0.1,
        p_value_colors=False,
        term_significance=None,
        save_svg_button=False,
        x_label=None,
        y_label=None,
        d3_url=None,
        d3_scale_chromatic_url=None,
        pmi_filter_thresold=None,
        alternative_text_field=None,
        terms_to_include=None,
        semiotic_square=None,
        num_terms_semiotic_square=None,
        not_categories=None,
        show_neutral=False,
        neutral_category_name=None,
        get_tooltip_content=None,
        x_axis_values=None,
        y_axis_values=None,
        color_func=None,
        term_scorer=None,
        show_axes=True):
    """Returns html code of visualization.

    Parameters
    ----------
    corpus : Corpus
        Corpus to use.
    category : str
        Name of category column as it appears in original data frame.
    category_name : str
        Name of category to use.  E.g., "5-star reviews."
        Optional, defaults to category name.
    not_category_name : str
        Name of everything that isn't in category.  E.g., "Below 5-star reviews".
        Optional.  If exactly one entry is given in not_categories, defaults
        to that entry; otherwise defaults to "N(n)ot " + category_name, with
        the case of the 'n' dependent on the case of the first letter in
        category_name.
    protocol : str, optional
        Protocol to use.  Either http or https.  Default is https.
    pmi_threshold_coefficient : int, optional
        Filter out bigrams with a PMI of < 2 * pmi_threshold_coefficient.
        NOTE(review): the signature defaults this to
        DEFAULT_MINIMUM_TERM_FREQUENCY while minimum_term_frequency defaults
        to DEFAULT_PMI_THRESHOLD_COEFFICIENT; the two constants look swapped.
        Left unchanged to preserve existing defaults -- confirm.
    minimum_term_frequency : int, optional
        Minimum number of times word needs to appear to make it into
        visualization.  See the note on pmi_threshold_coefficient regarding
        its default.
    minimum_not_category_term_frequency : int, optional
        If an n-gram does not occur in the category, minimum times it
        must been seen to be included. Default is 0.
    max_terms : int, optional
        Maximum number of terms to include in visualization.
    filter_unigrams : bool, optional
        Default False, do we filter out unigrams that only occur in one bigram
    width_in_pixels : int, optional
        Width of viz in pixels, if None, default to JS's choice
    height_in_pixels : int, optional
        Height of viz in pixels, if None, default to JS's choice
    max_snippets : int, optional
        Maximum number of snippets to show when term is clicked.  If None, all are shown.
    max_docs_per_category: int, optional
        Maximum number of documents to store per category.  If None, by default, all are stored.
    metadata : list, optional
        list of meta data strings that will be included for each document
    scores : np.array, optional
        Array of term scores or None.  Overwritten by the output of
        term_scorer when a term_scorer is given.
    x_coords : np.array, optional
        Array of term x-axis positions or None.  Must be in [0,1].
        If present, y_coords must also be present.
    y_coords : np.array, optional
        Array of term y-axis positions or None.  Must be in [0,1].
        If present, x_coords must also be present.
    original_x : array-like
        Original, unscaled x-values.  Defaults to x_coords
    original_y : array-like
        Original, unscaled y-values.  Defaults to y_coords
    rescale_x : lambda list[0,1]: list[0,1], optional
        Rescales x-axis after filtering.  Only used when coordinates are
        injected.
    rescale_y : lambda list[0,1]: list[0,1], optional
        Rescales y-axis after filtering.  Only used when coordinates are
        injected.
    singleScoreMode : bool, optional
        Label terms based on score vs distance from corner.  Good for topic scores. Show only one color.
    sort_by_dist: bool, optional
        Label terms based distance from corner. True by default.  Negated by singleScoreMode.
    reverse_sort_scores_for_not_category: bool, optional
        If using a custom score, score the not-category class by
        lowest-score-as-most-predictive. Turn this off for word vector
        or topic similarity. Default True.
    use_full_doc : bool, optional
        Use the full document in snippets.  False by default.
    transform : function, optional
        Not recommended for editing.  Changes the way terms are ranked.
        Defaults to percentile_alphabetical.
    jitter : float, optional
        percentage of axis to jitter each point.  default is 0.
    gray_zero_scores : bool, optional
        If True, color points with zero-scores a light shade of grey.  False by default.
    term_ranker : TermRanker, optional
        TermRanker class for determining term frequency ranks.  Defaults to
        termranking.AbsoluteFrequencyRanker.
    asian_mode : bool, optional
        Use a special Javascript regular expression that's specific to chinese or japanese
    use_non_text_features : bool, optional
        Show non-bag-of-words features (e.g., Empath) instead of text.  False by default.
    show_top_terms : bool, default True
        Show top terms on the left-hand side of the visualization
    show_characteristic: bool, default True
        Show characteristic terms on the far left-hand side of the visualization
    word_vec_use_p_vals: bool, default False
        Sort by harmonic mean of score and distance.
    max_p_val : float, default 0.1
        If word_vec_use_p_vals, the minimum p val to use.
    p_value_colors : bool, default False
        Color points differently if p val is above 1-max_p_val, below max_p_val, or
        in between.
    term_significance : TermSignificance instance or None
        Way of getting significance scores.  If None, p values will not be added.
    save_svg_button : bool, default False
        Add a save as SVG button to the page.
    x_label : str, default None
        Custom x-axis label
    y_label : str, default None
        Custom y-axis label
    d3_url : str, None by default
        The url (or path) of d3, to be inserted into <script src="..."/>.
        Overrides `protocol`.
        By default, this is `DEFAULT_D3_URL` declared in `HTMLVisualizationAssembly`.
    d3_scale_chromatic_url : str, None by default
        URL of d3 scale chromatic, to be inserted into <script src="..."/>.
        Overrides `protocol`.
        By default, this is `DEFAULT_D3_SCALE_CHROMATIC` declared in `HTMLVisualizationAssembly`.
    pmi_filter_thresold : (DEPRECATED) int, None by default
        DEPRECATED.  Use pmi_threshold_coefficient instead.  When given, it
        overrides pmi_threshold_coefficient and a DeprecationWarning is issued.
    alternative_text_field : str or None, optional
        Field in from dataframe used to make corpus to display in place of parsed text. Only
        can be used if corpus is a ParsedCorpus instance.
    terms_to_include : list or None, optional
        Whitelist of terms to include in visualization.
    semiotic_square : SemioticSquare
        None by default.  SemioticSquare based on corpus.  Includes square above visualization.
    num_terms_semiotic_square : int
        None by default; passed to get_semiotic_square_html.  Number of
        terms to show in semiotic square.
        Only active if semiotic square is present.
    not_categories : list
        All categories other than category by default.  Documents labeled
        with remaining category.
    show_neutral : bool
        False by default.  Show a third column listing contexts in the
        neutral categories.
    neutral_category_name : str
        "Neutral" by default. Only active if show_neutral is True.  Name of the neutral
        column.
    get_tooltip_content : str
        Javascript function to control content of tooltip.  Function takes a parameter
        which is a dictionary entry produced by `ScatterChartExplorer.to_dict` and
        returns a string.
    x_axis_values : list, default None
        Value-labels to show on x-axis. Low, medium, high are defaults.
    y_axis_values : list, default None
        Value-labels to show on y-axis. Low, medium, high are defaults.
    color_func : str, default None
        Javascript function to control color of a point.  Function takes a parameter
        which is a dictionary entry produced by `ScatterChartExplorer.to_dict` and
        returns a string.
    term_scorer : Object, default None
        In lieu of scores, object with a get_scores(a,b) function that returns a set of scores,
        where a and b are term counts.  Scorer optionally has a get_term_freqs function.
    show_axes : bool, default True
        Show the ticked axes on the plot.  If false, show inner axes as a crosshair.

    Returns
    -------
    str
        html of visualization

    Raises
    ------
    Exception
        If exactly one of x_coords and y_coords is given.
    """
    color = None
    if singleScoreMode or word_vec_use_p_vals:
        color = 'd3.interpolatePurples'
    # singleScoreMode overrides distance-based labelling; normalize to a bool.
    sort_by_dist = not (singleScoreMode or not sort_by_dist)
    if term_ranker is None:
        term_ranker = termranking.AbsoluteFrequencyRanker

    if category_name is None:
        category_name = category

    if not_category_name is None:
        if not_categories is not None and len(not_categories) == 1:
            not_category_name = not_categories[0]
        else:
            # Mirror the capitalization of the category name.
            not_category_name = ('Not' if category_name[0].isupper() else
                                 'not') + ' ' + category_name

    if term_scorer:
        tdf = term_ranker(corpus).get_ranks()
        cat_freqs = tdf[category + ' freq']
        if not_categories:
            not_cat_freqs = tdf[[c + ' freq'
                                 for c in not_categories]].sum(axis=1)
        else:
            # Rank columns are named '<category> freq' (see cat_freqs above),
            # so subtract that column; indexing tdf[category] raised KeyError.
            not_cat_freqs = tdf.sum(axis=1) - cat_freqs
        scores = term_scorer.get_scores(cat_freqs, not_cat_freqs)

    if pmi_filter_thresold is not None:
        pmi_threshold_coefficient = pmi_filter_thresold
        warnings.warn(
            "The argument name 'pmi_filter_thresold' has been deprecated. Use 'pmi_threshold_coefficient' in its place",
            DeprecationWarning)

    scatter_chart_explorer = ScatterChartExplorer(
        corpus,
        minimum_term_frequency=minimum_term_frequency,
        minimum_not_category_term_frequency=minimum_not_category_term_frequency,
        pmi_threshold_coefficient=pmi_threshold_coefficient,
        filter_unigrams=filter_unigrams,
        jitter=jitter,
        max_terms=max_terms,
        term_ranker=term_ranker,
        use_non_text_features=use_non_text_features,
        term_significance=term_significance,
        terms_to_include=terms_to_include)
    # Coordinates only make sense as a pair.
    if (x_coords is None) != (y_coords is None):
        raise Exception(
            "Both x_coords and y_coords need to be passed or both left blank")
    if x_coords is not None:
        scatter_chart_explorer.inject_coordinates(x_coords,
                                                  y_coords,
                                                  rescale_x=rescale_x,
                                                  rescale_y=rescale_y,
                                                  original_x=original_x,
                                                  original_y=original_y)
    html_base = None
    if semiotic_square:
        html_base = get_semiotic_square_html(num_terms_semiotic_square,
                                             semiotic_square)
    scatter_chart_data = scatter_chart_explorer.to_dict(
        category=category,
        category_name=category_name,
        not_category_name=not_category_name,
        not_categories=not_categories,
        transform=transform,
        scores=scores,
        max_docs_per_category=max_docs_per_category,
        metadata=metadata,
        alternative_text_field=alternative_text_field,
        neutral_category_name=neutral_category_name)
    assembly = HTMLVisualizationAssembly(
        VizDataAdapter(scatter_chart_data),
        width_in_pixels=width_in_pixels,
        height_in_pixels=height_in_pixels,
        max_snippets=max_snippets,
        color=color,
        grey_zero_scores=gray_zero_scores,
        sort_by_dist=sort_by_dist,
        reverse_sort_scores_for_not_category=reverse_sort_scores_for_not_category,
        use_full_doc=use_full_doc,
        asian_mode=asian_mode,
        use_non_text_features=use_non_text_features,
        show_characteristic=show_characteristic,
        show_top_terms=show_top_terms,
        word_vec_use_p_vals=word_vec_use_p_vals,
        max_p_val=max_p_val,
        save_svg_button=save_svg_button,
        p_value_colors=p_value_colors,
        x_label=x_label,
        y_label=y_label,
        show_neutral=show_neutral,
        get_tooltip_content=get_tooltip_content,
        x_axis_values=x_axis_values,
        y_axis_values=y_axis_values,
        color_func=color_func,
        show_axes=show_axes)
    return assembly.to_html(protocol=protocol,
                            d3_url=d3_url,
                            d3_scale_chromatic_url=d3_scale_chromatic_url,
                            html_base=html_base)
コード例 #10
0
def produce_pairplot(
        corpus,
        asian_mode=False,
        category_width_in_pixels=500,
        category_height_in_pixels=700,
        term_width_in_pixels=500,
        term_height_in_pixels=700,
        terms_to_show=3000,
        scaler=scale_neg_1_to_1_with_zero_mean,
        term_ranker=AbsoluteFrequencyRanker,
        use_metadata=False,
        category_projector=CategoryProjector(),
        category_projection=None,
        topic_model_term_lists=None,
        topic_model_preview_size=10,
        metadata_descriptions=None,
        initial_category=None,
        x_dim=0,
        y_dim=1,
        show_halo=True,
        num_terms_in_halo=5,
        category_color_func='(function(x) {return "#5555FF"})',
        protocol='https',
        d3_url_struct=D3URLs(),
        category_focused=False,
        verbose=False,
        use_full_doc=True,
        default_to_term_comparison=True,
        category_x_label='',
        category_y_label='',
        category_show_axes_and_cross_hairs=False,
        highlight_selected_category=True,
        term_x_label=None,  # used if default_to_term_comparison
        term_y_label=None,  # used if default_to_term_comparison
        wordfish_style=False,
        **kwargs):
    """Return the HTML of a linked category plot and term plot.

    The category plot is built from a projection of the categories. The term
    plot either compares ``initial_category`` against all other categories
    (``default_to_term_comparison=True``) or places terms at their projected
    coordinates (``default_to_term_comparison=False``).

    Parameters
    ----------
    corpus
        Corpus to visualize.
    asian_mode : bool
        Forwarded to both ``ScatterplotStructure`` instances.
    category_width_in_pixels, category_height_in_pixels : int
        Size of the category plot.
    term_width_in_pixels, term_height_in_pixels : int
        Size of the term plot.
    terms_to_show : int
        Passed to ``AssociationCompactor``; terms of ``corpus`` that the
        compacted corpus does not retain are hidden in the term plot.
    scaler : callable
        Applied to projected x and y values to obtain plot coordinates.
    term_ranker
        Forwarded to the term ``ScatterChartExplorer`` and to
        ``_get_category_scatter_chart_explorer``.
    use_metadata : bool
        If True, categories are projected with ``project_with_metadata`` and
        metadata frequencies are scored instead of term frequencies.
    category_projector
        Used to compute ``category_projection`` when none is supplied.
    category_projection
        Precomputed projection; computed from ``corpus`` when None.
    topic_model_term_lists
        If not None, injected via ``inject_metadata_term_lists``.
    topic_model_preview_size : int
        Forwarded to the term ``ScatterplotStructure``.
    metadata_descriptions
        If not None, injected via ``inject_metadata_descriptions``.
    initial_category : str
        Category selected initially; defaults to the first entry of
        ``corpus.get_categories()``.
    x_dim, y_dim : int
        Projection dimensions used for the two axes.
    show_halo : bool
        Forwarded to ``PairPlotFromScatterplotStructure``.
    num_terms_in_halo : int
        Forwarded as ``num_terms`` to ``PairPlotFromScatterplotStructure``.
    category_color_func : str
        Javascript function source used to color category points.
    protocol : str
        Forwarded to ``PairPlotFromScatterplotStructure``.
    d3_url_struct
        Forwarded to ``PairPlotFromScatterplotStructure``.
    category_focused, wordfish_style : bool
        Passed to ``_get_term_plot_change_js_func`` to select the Javascript
        callback attached to the category plot.
    verbose : bool
        If True, print how many terms are hidden and shown; also forwarded
        to the explorers.
    use_full_doc : bool
        The term plot uses full documents if this or ``use_metadata`` is set.
    default_to_term_comparison : bool
        Selects the term-plot mode described above.
    category_x_label, category_y_label : str
        Axis labels of the category plot.
    category_show_axes_and_cross_hairs : bool
        Forwarded as ``show_axes_and_cross_hairs`` to the category plot.
    highlight_selected_category : bool
        Forwarded to both ``ScatterplotStructure`` instances.
    term_x_label, term_y_label : str or None
        Axis labels of the term plot. As implemented they are read only when
        ``default_to_term_comparison`` is False; None becomes ''.
    **kwargs
        Forwarded to the term explorer's ``to_dict`` call, only when
        ``default_to_term_comparison`` is True.

    Returns
    -------
    Whatever ``PairPlotFromScatterplotStructure(...).to_html()`` returns
    (the HTML of the visualization).
    """
    if category_projection is None:
        if use_metadata:
            category_projection = category_projector.project_with_metadata(
                corpus, x_dim=x_dim, y_dim=y_dim)
        else:
            category_projection = category_projector.project(corpus,
                                                             x_dim=x_dim,
                                                             y_dim=y_dim)

    if initial_category is None:
        initial_category = corpus.get_categories()[0]

    # --- Category plot ---
    category_scatter_chart_explorer = _get_category_scatter_chart_explorer(
        category_projection, scaler, term_ranker, verbose)
    category_scatter_chart_data = category_scatter_chart_explorer.to_dict(
        category=initial_category,
        max_docs_per_category=0,
    )

    category_tooltip_func = '(function(d) {return d.term})'

    initial_category_idx = corpus.get_categories().index(initial_category)
    term_plot_change_func = _get_term_plot_change_js_func(
        wordfish_style, category_focused, initial_category_idx)

    category_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(category_scatter_chart_data),
        width_in_pixels=category_width_in_pixels,
        height_in_pixels=category_height_in_pixels,
        asian_mode=asian_mode,
        use_non_text_features=True,
        show_characteristic=False,
        x_label=category_x_label,
        y_label=category_y_label,
        show_axes_and_cross_hairs=category_show_axes_and_cross_hairs,
        full_data='getCategoryDataAndInfo()',
        show_top_terms=False,
        get_tooltip_content=category_tooltip_func,
        color_func=category_color_func,
        show_axes=False,
        horizontal_line_y_position=0,
        vertical_line_x_position=0,
        unified_context=True,
        show_category_headings=False,
        show_cross_axes=True,
        div_name='cat-plot',
        alternative_term_func=term_plot_change_func,
        highlight_selected_category=highlight_selected_category)

    # --- Term plot: hide every term the compactor did not retain. ---
    compacted_corpus = AssociationCompactor(
        terms_to_show, use_non_text_features=use_metadata).compact(corpus)
    terms_to_hide = set(corpus.get_terms()) - set(compacted_corpus.get_terms())
    if verbose:
        print('num terms to hide', len(terms_to_hide))
        print('num terms to show', compacted_corpus.get_num_terms())

    term_scatter_chart_explorer = ScatterChartExplorer(
        category_projection.get_corpus(),
        minimum_term_frequency=0,
        minimum_not_category_term_frequency=0,
        pmi_threshold_coefficient=0,
        term_ranker=term_ranker,
        use_non_text_features=use_metadata,
        score_transform=stretch_0_to_1,
        verbose=verbose).hide_terms(terms_to_hide)

    if default_to_term_comparison:
        if topic_model_term_lists is not None:
            term_scatter_chart_explorer.inject_metadata_term_lists(
                topic_model_term_lists)
        if metadata_descriptions is not None:
            term_scatter_chart_explorer.inject_metadata_descriptions(
                metadata_descriptions)

        if use_metadata:
            tdf = corpus.get_metadata_freq_df('')
        else:
            tdf = corpus.get_term_freq_df('')

        # Score the initial category against the summed counts of all others.
        other_categories = [
            c for c in corpus.get_categories() if c != initial_category
        ]
        scores = RankDifference().get_scores(
            tdf[initial_category],
            tdf[other_categories].sum(axis=1))

        term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
            category=initial_category,
            scores=scores,
            include_term_category_counts=True,
            transform=dense_rank,
            **kwargs)
        # Plain strings: a stray trailing comma here previously turned both
        # labels into 1-tuples.
        y_label = initial_category
        x_label = 'Not ' + initial_category
        color_func = None
        show_top_terms = True
        show_axes = False
    else:
        term_projection = category_projection.get_term_projection()
        original_x = term_projection['x']
        original_y = term_projection['y']
        x_coords = scaler(term_projection['x'])
        y_coords = scaler(term_projection['y'])
        x_label = term_x_label if term_x_label is not None else ''
        y_label = term_y_label if term_y_label is not None else ''
        show_axes = True
        term_scatter_chart_explorer.inject_coordinates(x_coords,
                                                       y_coords,
                                                       original_x=original_x,
                                                       original_y=original_y)

        if topic_model_term_lists is not None:
            term_scatter_chart_explorer.inject_metadata_term_lists(
                topic_model_term_lists)
        if metadata_descriptions is not None:
            term_scatter_chart_explorer.inject_metadata_descriptions(
                metadata_descriptions)
        term_scatter_chart_data = term_scatter_chart_explorer.to_dict(
            category=initial_category,
            category_name=initial_category,
            include_term_category_counts=True,
        )
        color_func = '(function(x) {return "#5555FF"})'
        show_top_terms = False

    term_scatterplot_structure = ScatterplotStructure(
        VizDataAdapter(term_scatter_chart_data),
        width_in_pixels=term_width_in_pixels,
        height_in_pixels=term_height_in_pixels,
        use_full_doc=use_metadata or use_full_doc,
        asian_mode=asian_mode,
        use_non_text_features=use_metadata,
        show_characteristic=False,
        x_label=x_label,
        y_label=y_label,
        full_data='getTermDataAndInfo()',
        show_top_terms=show_top_terms,
        get_tooltip_content=None,
        color_func=color_func,
        show_axes=show_axes,
        topic_model_preview_size=topic_model_preview_size,
        show_category_headings=False,
        div_name='d3-div-1',
        unified_context=True,
        highlight_selected_category=highlight_selected_category)
    return PairPlotFromScatterplotStructure(category_scatterplot_structure,
                                            term_scatterplot_structure,
                                            category_projection,
                                            category_width_in_pixels,
                                            category_height_in_pixels,
                                            num_terms=num_terms_in_halo,
                                            show_halo=show_halo,
                                            d3_url_struct=d3_url_struct,
                                            x_dim=x_dim,
                                            y_dim=y_dim,
                                            protocol=protocol).to_html()
コード例 #11
0
def produce_scattertext_explorer(corpus,
                                 category,
                                 category_name,
                                 not_category_name,
                                 protocol='https',
                                 pmi_filter_thresold=2,
                                 minimum_term_frequency=3,
                                 minimum_not_category_term_frequency=0,
                                 max_terms=None,
                                 filter_unigrams=False,
                                 height_in_pixels=None,
                                 width_in_pixels=None,
                                 max_snippets=None,
                                 max_docs_per_category=None,
                                 metadata=None,
                                 scores=None,
                                 singleScoreMode=False,
                                 sort_by_dist=True,
                                 reverse_sort_scores_for_not_category=True,
                                 use_full_doc=False,
                                 transform=percentile_alphabetical,
                                 jitter=0,
                                 grey_zero_scores=False,
                                 term_ranker=None,
                                 chinese_mode=False,
                                 use_non_text_features=False,
                                 show_characteristic=True,
                                 word_vec_use_p_vals=False,
                                 max_p_val=0.05,
                                 p_value_colors=False,
                                 term_significance=None,
                                 save_svg_button=False):
    """Return the HTML code of the scattertext explorer visualization.

    The corpus is summarized by a ScatterChartExplorer, the resulting data
    is wrapped in a VizDataAdapter, and an HTMLVisualizationAssembly renders
    it to HTML.

    Parameters
    ----------
    corpus : Corpus
        Corpus to use.
    category : str
        Name of category column as it appears in original data frame.
    category_name : str
        Name of category to use.  E.g., "5-star reviews."
    not_category_name : str
        Name of everything that isn't in category.
        E.g., "Below 5-star reviews".
    protocol : str, optional
        Protocol to use.  Either http or https.  Default is https.
    pmi_filter_thresold : int, optional
        Forwarded to ScatterChartExplorer as ``pmi_threshold_coefficient``.
        Default is 2.
    minimum_term_frequency : int, optional
        Minimum number of times word needs to appear to make it into
        visualization.
    minimum_not_category_term_frequency : int, optional
        If an n-gram does not occur in the category, minimum times it
        must been seen to be included.  Default is 0.
    max_terms : int, optional
        Maximum number of terms to include in visualization.
    filter_unigrams : bool, optional
        Default False, do we filter out unigrams that only occur in one
        bigram.
    height_in_pixels : int, optional
        Height of viz in pixels, if None, default to JS's choice.
    width_in_pixels : int, optional
        Width of viz in pixels, if None, default to JS's choice.
    max_snippets : int, optional
        Maximum number of snippets to show when term is clicked.  If None,
        all are shown.
    max_docs_per_category : int, optional
        Maximum number of documents to store per category.  If None, by
        default, all are stored.
    metadata : list, optional
        List of meta data strings that will be included for each document.
    scores : np.array, optional
        Array of term scores or None.
    singleScoreMode : bool, optional
        Label terms based on score vs distance from corner.  Good for topic
        scores.  Show only one color.
    sort_by_dist : bool, optional
        Label terms based distance from corner.  True by default.  Negated
        by singleScoreMode.
    reverse_sort_scores_for_not_category : bool, optional
        If using a custom score, score the not-category class by
        lowest-score-as-most-predictive.  Turn this off for word vector
        or topic similarity.  Default True.
    use_full_doc : bool, optional
        Use the full document in snippets.  False by default.
    transform : function, optional
        Not recommended for editing.  Changes the way terms are ranked.
        Default is percentile_alphabetical.
    jitter : float, optional
        Percentage of axis to jitter each point.  Default is 0.
    grey_zero_scores : bool, optional
        If True, color points with zero-scores a light shade of grey.
        False by default.
    term_ranker : TermRanker, optional
        TermRanker class for determining term frequency ranks.  When None,
        termranking.AbsoluteFrequencyRanker is used.
    chinese_mode : bool, optional
        Use a special Javascript regular expression that's specific to
        chinese.
    use_non_text_features : bool, optional
        Show non-bag-of-words features (e.g., Empath) instead of text.
        False by default.
    show_characteristic : bool, default True
        Show characteristic terms on the far left-hand side of the
        visualization.
    word_vec_use_p_vals : bool, default False
        Sort by harmonic mean of score and distance.
    max_p_val : float, default 0.05
        If word_vec_use_p_vals, the minimum p val to use.
    p_value_colors : bool, default False
        Color points differently if p val is above 1-max_p_val, below
        max_p_val, or in between.
    term_significance : TermSignificance instance or None
        Way of getting significance scores.  If None, p values will not be
        added.
    save_svg_button : bool, default False
        Add a save as SVG button to the page.

    Returns
    -------
    str
        HTML of visualization.
    """
    # One interpolated color scale is requested whenever a single score
    # drives the coloring; otherwise no color is specified.
    color = ('d3.interpolatePurples'
             if singleScoreMode or word_vec_use_p_vals else None)
    # singleScoreMode switches off distance-based labelling; the result is
    # always a plain bool.
    sort_by_dist = not (singleScoreMode or not sort_by_dist)
    if term_ranker is None:
        term_ranker = termranking.AbsoluteFrequencyRanker

    explorer_options = dict(
        minimum_term_frequency=minimum_term_frequency,
        minimum_not_category_term_frequency=minimum_not_category_term_frequency,
        pmi_threshold_coefficient=pmi_filter_thresold,
        filter_unigrams=filter_unigrams,
        jitter=jitter,
        max_terms=max_terms,
        term_ranker=term_ranker,
        use_non_text_features=use_non_text_features,
        term_significance=term_significance,
    )
    explorer = ScatterChartExplorer(corpus, **explorer_options)

    chart_data = explorer.to_dict(
        category=category,
        category_name=category_name,
        not_category_name=not_category_name,
        transform=transform,
        scores=scores,
        max_docs_per_category=max_docs_per_category,
        metadata=metadata,
    )

    assembly = HTMLVisualizationAssembly(
        VizDataAdapter(chart_data),
        width_in_pixels=width_in_pixels,
        height_in_pixels=height_in_pixels,
        max_snippets=max_snippets,
        color=color,
        grey_zero_scores=grey_zero_scores,
        sort_by_dist=sort_by_dist,
        reverse_sort_scores_for_not_category=reverse_sort_scores_for_not_category,
        use_full_doc=use_full_doc,
        chinese_mode=chinese_mode,
        use_non_text_features=use_non_text_features,
        show_characteristic=show_characteristic,
        word_vec_use_p_vals=word_vec_use_p_vals,
        max_p_val=max_p_val,
        save_svg_button=save_svg_button,
        p_value_colors=p_value_colors,
    )
    return assembly.to_html(protocol=protocol)
コード例 #12
0
ファイル: __init__.py プロジェクト: rhetorchkim/scattertext
def produce_scattertext_explorer(corpus,
                                 category,
                                 category_name,
                                 not_category_name,
                                 protocol='https',
                                 pmi_threshold_coefficient=6,
                                 minimum_term_frequency=3,
                                 minimum_not_category_term_frequency=0,
                                 max_terms=None,
                                 filter_unigrams=False,
                                 height_in_pixels=None,
                                 width_in_pixels=None,
                                 max_snippets=None,
                                 max_docs_per_category=None,
                                 metadata=None,
                                 scores=None,
                                 x_coords=None,
                                 y_coords=None,
                                 singleScoreMode=False,
                                 sort_by_dist=True,
                                 reverse_sort_scores_for_not_category=True,
                                 use_full_doc=False,
                                 transform=percentile_alphabetical,
                                 jitter=0,
                                 grey_zero_scores=False,
                                 term_ranker=None,
                                 asian_mode=False,
                                 use_non_text_features=False,
                                 show_characteristic=True,
                                 word_vec_use_p_vals=False,
                                 max_p_val=0.1,
                                 p_value_colors=False,
                                 term_significance=None,
                                 save_svg_button=False,
                                 x_label=None,
                                 y_label=None,
                                 d3_url=None,
                                 d3_scale_chromatic_url=None,
                                 pmi_filter_thresold=None,
                                 alternative_text_field=None):
    """Return the HTML code of the scattertext explorer visualization.

    The corpus is summarized by a ScatterChartExplorer (optionally with
    caller-supplied term coordinates), the resulting data is wrapped in a
    VizDataAdapter, and an HTMLVisualizationAssembly renders it to HTML.

    Parameters
    ----------
    corpus : Corpus
        Corpus to use.
    category : str
        Name of category column as it appears in original data frame.
    category_name : str
        Name of category to use.  E.g., "5-star reviews."
    not_category_name : str
        Name of everything that isn't in category.
        E.g., "Below 5-star reviews".
    protocol : str, optional
        Protocol to use.  Either http or https.  Default is https.
    pmi_threshold_coefficient : int, optional
        Filter out bigrams with a PMI of < 2 * pmi_threshold_coefficient.
        Default is 6.
    minimum_term_frequency : int, optional
        Minimum number of times word needs to appear to make it into
        visualization.
    minimum_not_category_term_frequency : int, optional
        If an n-gram does not occur in the category, minimum times it
        must been seen to be included.  Default is 0.
    max_terms : int, optional
        Maximum number of terms to include in visualization.
    filter_unigrams : bool, optional
        Default False, do we filter out unigrams that only occur in one
        bigram.
    height_in_pixels : int, optional
        Height of viz in pixels, if None, default to JS's choice.
    width_in_pixels : int, optional
        Width of viz in pixels, if None, default to JS's choice.
    max_snippets : int, optional
        Maximum number of snippets to show when term is clicked.  If None,
        all are shown.
    max_docs_per_category : int, optional
        Maximum number of documents to store per category.  If None, by
        default, all are stored.
    metadata : list, optional
        List of meta data strings that will be included for each document.
    scores : np.array, optional
        Array of term scores or None.
    x_coords : np.array, optional
        Array of term x-axis positions or None.  Must be in [0,1].
        If present, y_coords must also be present.
    y_coords : np.array, optional
        Array of term y-axis positions or None.  Must be in [0,1].
        If present, x_coords must also be present.
    singleScoreMode : bool, optional
        Label terms based on score vs distance from corner.  Good for topic
        scores.  Show only one color.
    sort_by_dist : bool, optional
        Label terms based distance from corner.  True by default.  Negated
        by singleScoreMode.
    reverse_sort_scores_for_not_category : bool, optional
        If using a custom score, score the not-category class by
        lowest-score-as-most-predictive.  Turn this off for word vector
        or topic similarity.  Default True.
    use_full_doc : bool, optional
        Use the full document in snippets.  False by default.
    transform : function, optional
        Not recommended for editing.  Changes the way terms are ranked.
        Default is percentile_alphabetical.
    jitter : float, optional
        Percentage of axis to jitter each point.  Default is 0.
    grey_zero_scores : bool, optional
        If True, color points with zero-scores a light shade of grey.
        False by default.
    term_ranker : TermRanker, optional
        TermRanker class for determining term frequency ranks.  When None,
        termranking.AbsoluteFrequencyRanker is used.
    asian_mode : bool, optional
        Use a special Javascript regular expression that's specific to
        chinese or japanese.
    use_non_text_features : bool, optional
        Show non-bag-of-words features (e.g., Empath) instead of text.
        False by default.
    show_characteristic : bool, default True
        Show characteristic terms on the far left-hand side of the
        visualization.
    word_vec_use_p_vals : bool, default False
        Sort by harmonic mean of score and distance.
    max_p_val : float, default 0.1
        If word_vec_use_p_vals, the minimum p val to use.
    p_value_colors : bool, default False
        Color points differently if p val is above 1-max_p_val, below
        max_p_val, or in between.
    term_significance : TermSignificance instance or None
        Way of getting significance scores.  If None, p values will not be
        added.
    save_svg_button : bool, default False
        Add a save as SVG button to the page.
    x_label : str, default None
        Custom x-axis label.
    y_label : str, default None
        Custom y-axis label.
    d3_url : str, default None
        URL (or path) of d3, to be inserted into <script src="..."/>.
        Overrides `protocol`.  By default, this is `DEFAULT_D3_URL`
        declared in `HTMLVisualizationAssembly`.
    d3_scale_chromatic_url : str, default None
        URL of d3 scale chromatic, to be inserted into <script src="..."/>.
        Overrides `protocol`.  By default, this is
        `DEFAULT_D3_SCALE_CHROMATIC` declared in
        `HTMLVisualizationAssembly`.
    pmi_filter_thresold : int, default None
        DEPRECATED.  Use pmi_threshold_coefficient instead.  When given, it
        replaces pmi_threshold_coefficient.
    alternative_text_field : str or None, optional
        Field in from dataframe used to make corpus to display in place of
        parsed text.  Only can be used if corpus is a ParsedCorpus instance.

    Returns
    -------
    str
        HTML of visualization.

    Raises
    ------
    ValueError
        If exactly one of x_coords and y_coords is supplied.

    Warns
    -----
    DeprecationWarning
        If pmi_filter_thresold is supplied.
    """
    # One interpolated color scale is requested whenever a single score
    # drives the coloring; otherwise no color is specified.
    color = ('d3.interpolatePurples'
             if singleScoreMode or word_vec_use_p_vals else None)
    # singleScoreMode switches off distance-based labelling; the result is
    # always a plain bool.
    sort_by_dist = bool(sort_by_dist) and not singleScoreMode
    if term_ranker is None:
        term_ranker = termranking.AbsoluteFrequencyRanker

    if pmi_filter_thresold is not None:
        # The misspelled legacy keyword still wins, so old callers keep
        # their behaviour.
        pmi_threshold_coefficient = pmi_filter_thresold
        # stacklevel=2 attributes the warning to the caller's line rather
        # than to this function, which is what the default warning filters
        # need in order to surface it to the code using the old keyword.
        warnings.warn(
            "The argument name 'pmi_filter_thresold' has been deprecated. Use 'pmi_threshold_coefficient' in its place",
            DeprecationWarning,
            stacklevel=2)

    # Validate before building the explorer so bad arguments fail fast.
    # ValueError subclasses Exception, so callers catching the previously
    # raised bare Exception are unaffected.
    if (x_coords is None) != (y_coords is None):
        raise ValueError(
            "Both x_coords and y_coords need to be passed or both left blank")

    scatter_chart_explorer = ScatterChartExplorer(
        corpus,
        minimum_term_frequency=minimum_term_frequency,
        minimum_not_category_term_frequency=minimum_not_category_term_frequency,
        pmi_threshold_coefficient=pmi_threshold_coefficient,
        filter_unigrams=filter_unigrams,
        jitter=jitter,
        max_terms=max_terms,
        term_ranker=term_ranker,
        use_non_text_features=use_non_text_features,
        term_significance=term_significance)
    if x_coords is not None:
        scatter_chart_explorer.inject_coordinates(x_coords, y_coords)

    scatter_chart_data = scatter_chart_explorer.to_dict(
        category=category,
        category_name=category_name,
        not_category_name=not_category_name,
        transform=transform,
        scores=scores,
        max_docs_per_category=max_docs_per_category,
        metadata=metadata,
        alternative_text_field=alternative_text_field)

    assembly = HTMLVisualizationAssembly(
        VizDataAdapter(scatter_chart_data),
        width_in_pixels=width_in_pixels,
        height_in_pixels=height_in_pixels,
        max_snippets=max_snippets,
        color=color,
        grey_zero_scores=grey_zero_scores,
        sort_by_dist=sort_by_dist,
        reverse_sort_scores_for_not_category=reverse_sort_scores_for_not_category,
        use_full_doc=use_full_doc,
        asian_mode=asian_mode,
        use_non_text_features=use_non_text_features,
        show_characteristic=show_characteristic,
        word_vec_use_p_vals=word_vec_use_p_vals,
        max_p_val=max_p_val,
        save_svg_button=save_svg_button,
        p_value_colors=p_value_colors,
        x_label=x_label,
        y_label=y_label)
    return assembly.to_html(protocol=protocol,
                            d3_url=d3_url,
                            d3_scale_chromatic_url=d3_scale_chromatic_url)