Example #1
0
    def test_max_terms(self):
        '''Check how many term records to_dict emits for several max_terms settings.

        Small caps (2, 10) must be hit exactly.  With pmi_threshold_coefficient=0,
        a cap of 10000 or None must yield as many records as get_term_freq_df has rows.
        '''
        tdm = build_hamlet_jz_term_doc_mat()

        def emitted_term_count(**chart_options):
            # Fresh chart per call, always scored against the 'hamlet' category.
            chart = ScatterChart(term_doc_matrix=tdm,
                                 minimum_term_frequency=0,
                                 **chart_options)
            return len(chart.to_dict('hamlet')['data'])

        for cap in (2, 10):
            self.assertEqual(cap, emitted_term_count(max_terms=cap))

        for cap in (10000, None):
            observed = emitted_term_count(pmi_threshold_coefficient=0,
                                          max_terms=cap)
            self.assertEqual(len(tdm.get_term_freq_df()), observed)
 def test_score_transform(self):
     '''An identity score_transform must change the summed 's' scores.

     Two charts are built over the same corpus, one with the default
     score_transform and one with ``lambda x: x``; the totals of the 's'
     field across all emitted terms are required to differ.
     '''
     corpus = get_test_corpus()
     sc = ScatterChart(term_doc_matrix=corpus, minimum_term_frequency=0)
     default_data = sc.to_dict('hamlet')['data']
     sc = ScatterChart(term_doc_matrix=corpus,
                       minimum_term_frequency=0,
                       score_transform=lambda x: x)
     identity_data = sc.to_dict('hamlet')['data']
     # assertNotEqual instead of a bare assert: it still runs under ``python -O``
     # and reports both sums when the check fails.
     self.assertNotEqual(sum(datum['s'] for datum in default_data),
                         sum(datum['s'] for datum in identity_data))
Example #3
0
 def __init__(self, corpus, **kwargs):
     '''See ScatterChart.  This lets you click on terms to see what contexts they tend to appear in.

     Parameters
     ----------
     corpus : Corpus or TermCategoryFrequencies
         Any other type fails the assertion below.
     **kwargs
         Forwarded unchanged to ScatterChart.__init__.
     '''
     accepted_types = (Corpus, TermCategoryFrequencies)
     assert isinstance(corpus, accepted_types)
     ScatterChart.__init__(self, corpus, **kwargs)
Example #4
0
 def test_inject_term_colors(self):
     '''Injected term colors must surface under info['term_colors'] in to_dict output.'''
     tdm = build_hamlet_jz_corpus_with_meta()
     # The unused ``freq_df = tdm.get_term_freq_df()`` local was dropped; nothing read it.
     scatter_chart = ScatterChart(term_doc_matrix=tdm,
                                  minimum_term_frequency=0)
     scatter_chart.inject_term_colors({'t1': '00ffee'})
     j = scatter_chart.to_dict('hamlet')
     self.assertIn('term_colors', j['info'])
Example #5
0
	def __init__(self, corpus, **kwargs):
		'''See ScatterChart.  This lets you click on terms to see what contexts they tend to appear in.

		Parameters
		----------
		corpus : Corpus
			Any other type fails the assertion below.
		**kwargs
			Forwarded unchanged to ScatterChart.__init__.
		'''
		assert isinstance(corpus, Corpus)
		ScatterChart.__init__(
			self,
			corpus,
			**kwargs
		)
 def test_inject_term_colors(self):
     '''Injected term colors must surface under info['term_colors'] in to_dict output.'''
     tdm = build_hamlet_jz_corpus_with_meta()
     # The unused ``freq_df = tdm.get_term_freq_df()`` local was dropped; nothing read it.
     scatter_chart = ScatterChart(term_doc_matrix=tdm,
                                  minimum_term_frequency=0)
     scatter_chart.inject_term_colors({'t1': '00ffee'})
     j = scatter_chart.to_dict('hamlet')
     self.assertIn('term_colors', j['info'])
Example #7
0
 def _add_term_freq_to_json_df(self, json_df, term_freq_df, category):
     '''Extend json_df with integer counts and, when available, term metadata.

     Runs ScatterChart._add_term_freq_to_json_df first, then adds:

     * 'cat'  -- term_freq_df[category + ' freq'] cast to int
     * 'ncat' -- term_freq_df['not cat freq'] cast to int
     * 'etc'  -- per-term lookup in self._term_metadata (``{}`` when the term
       is absent); only added when self._term_metadata is not None

     json_df is modified in place; nothing is returned.
     '''
     ScatterChart._add_term_freq_to_json_df(self, json_df, term_freq_df,
                                            category)
     # Builtin int: the np.int alias was deprecated in NumPy 1.20 and removed in 1.24.
     json_df['cat'] = term_freq_df[category + ' freq'].astype(int)
     json_df['ncat'] = term_freq_df['not cat freq'].astype(int)
     if self._term_metadata is not None:
         json_df['etc'] = term_freq_df['term'].apply(
             lambda term: self._term_metadata.get(term, {}))
Example #8
0
 def test_multi_categories(self):
     '''not_categories must change the reported not-category names but not the category list.'''
     corpus = get_test_corpus()
     chart_options = dict(term_doc_matrix=corpus, minimum_term_frequency=0)
     j_vs_all = ScatterChart(**chart_options).to_dict('hamlet')
     j_vs_swift = ScatterChart(**chart_options).to_dict(
         'hamlet', not_categories=['swift'])

     # Restricting the comparison set to 'swift' must be visible in the info block.
     names_vs_all = set(j_vs_all['info']['not_category_internal_names'])
     names_vs_swift = set(j_vs_swift['info']['not_category_internal_names'])
     self.assertNotEqual(names_vs_all, names_vs_swift)

     # Both outputs still list every category of the corpus.
     for output in (j_vs_all, j_vs_swift):
         self.assertEqual(output['info']['categories'], corpus.get_categories())
Example #9
0
 def __init__(self, corpus, verbose=False, **kwargs):
     '''See ScatterChart.  This lets you click on terms to see what contexts they tend to appear in.

     Parameters
     ----------
     corpus : object
         Handed to ScatterChart.__init__ as its first argument; no type check is done here.
     verbose : bool, default False
         Passed positionally to ScatterChart.__init__, right after corpus.
     **kwargs
         Forwarded unchanged to ScatterChart.__init__.
     '''
     # NOTE: a type guard (Corpus, ParsedCorpus, CorpusDF, TermCategoryFrequencies)
     # that raised AssertionError used to sit here; it is currently disabled.
     ScatterChart.__init__(
         self,
         corpus,
         verbose,
         **kwargs
     )
     # Starts out without term metadata.
     self._term_metadata = None
 def __init__(self, corpus, **kwargs):
     '''See ScatterChart.  This lets you click on terms to see what contexts they tend to appear in.

     Parameters
     ----------
     corpus : object
         Handed to ScatterChart.__init__ as its first argument; no type check is done here.
     **kwargs
         Forwarded unchanged to ScatterChart.__init__.
     '''
     # NOTE: a type guard (Corpus, ParsedCorpus, CorpusDF, TermCategoryFrequencies)
     # that raised AssertionError used to sit here; it is currently disabled.
     ScatterChart.__init__(
         self,
         corpus,
         **kwargs
     )
     # Starts out without term metadata.
     self._term_metadata = None
Example #11
0
 def test_title_case_names(self):
     '''Display names pass through verbatim unless title_case_names=True is given.'''

     def info_block(**to_dict_options):
         # Fresh matrix and chart on every call.
         tdm = build_hamlet_jz_term_doc_mat()
         chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
         output = chart.to_dict('hamlet', 'HAMLET', 'NOT HAMLET',
                                **to_dict_options)
         return output['info']

     info = info_block()
     self.assertEqual(info['category_name'], 'HAMLET')
     self.assertEqual(info['not_category_name'], 'NOT HAMLET')

     info = info_block(title_case_names=True)
     self.assertEqual(info['category_name'], 'Hamlet')
     self.assertEqual(info['not_category_name'], 'Not Hamlet')
 def test_inject_metadata_descriptions(self):
     '''inject_metadata_descriptions rejects non-dict input, is fluent, and adds 'metadescriptions'.'''
     tdm = build_hamlet_jz_corpus_with_meta()
     scatter_chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
     with self.assertRaises(AssertionError):
         scatter_chart.inject_metadata_descriptions(3323)
     # NOTE(review): two assertRaisesRegex checks ("The following meta data terms are
     # not present: blah" / "... cat2") were parked inside a string literal under an
     # ``if sys.version_info > (3, 0)`` guard, so they never ran.  The no-op branch
     # is removed; confirm whether those checks should be reinstated.
     assert scatter_chart == scatter_chart.inject_metadata_descriptions(
         {'cat1': 'asjdkflasdjklfsadjk jsdkafsd'})
     j = scatter_chart.to_dict('hamlet')
     self.assertEqual(set(j.keys()), {'info', 'data', 'metadescriptions'})
 def test_inject_coordinates_original(self):
     '''Injected original_x / original_y must come back as 'ox' / 'oy' in each term record.'''
     tdm = build_hamlet_jz_term_doc_mat()
     freq_df = tdm.get_term_freq_df()
     scatter_chart = ScatterChart(term_doc_matrix=tdm,
                                  minimum_term_frequency=0)
     # Builtin float: the np.float alias was deprecated in NumPy 1.20 and removed in 1.24.
     x = freq_df[freq_df.columns[1]].astype(float)
     y = freq_df[freq_df.columns[0]].astype(float)
     # Plot coordinates are divided by the column maximum; the raw values ride
     # along as the "original" coordinates.
     scatter_chart.inject_coordinates(x / x.max(), y / y.max(), original_x=x, original_y=y)
     j = scatter_chart.to_dict('hamlet')
     self.assertEqual(j['data'][0].keys(),
                      {'x', 'os', 'y', 'ncat25k', 'neut', 'cat25k', 'ox', 'neut25k', 'extra25k', 'extra', 'oy',
                       'term',
                       's', 'bg'})
     and_term = [t for t in j['data'] if t['term'] == 'and'][0]
     self.assertEqual(and_term['ox'], 0)
     self.assertEqual(and_term['oy'], 1)
Example #14
0
	def to_dict(
			self,
			category,
			category_name=None,
			not_category_name=None,
			scores=None,
			metadata=None,
			max_docs_per_category=None,
			transform=percentile_alphabetical,
			alternative_text_field=None,
			title_case_names=False):
		'''Build the ScatterChart dictionary and attach a 'docs' section to it.

		Parameters
		----------
		category : str
			Category to annotate.  Exact value of category.
		category_name : str, optional
			Name of category which will appear on web site.  Forwarded to ScatterChart.to_dict.
		not_category_name : str, optional
			Name of ~category which will appear on web site.  Forwarded to ScatterChart.to_dict.
		scores : np.array, optional
			Scores to use for coloring.  Forwarded to ScatterChart.to_dict.
		metadata : None or array-like
			List of metadata for each document; passed to self._get_docs_structure.
		max_docs_per_category : None or int, optional
			Maximum number of documents to store per category; passed to
			self._make_docs_getter.
		transform : function, optional
			Function for ranking terms.  Defaults to percentile_alphabetical.
		alternative_text_field : str or None, optional
			Field to display in place of parsed text; passed to self._make_docs_getter.
		title_case_names : bool, default False
			Should the program title-case the category and not-category names?

		Returns
		-------
		dict
			The dictionary returned by ScatterChart.to_dict plus a 'docs' key.
			Layout as documented by the original author:
			{info: {category_name: ..., not_category_name},
			 docs: {'texts': [doc1text, ...],
			        'labels': [1, 0, ...],
			        'meta': ['<b>blah</b>', '<b>blah</b>']}
			 data: {term:, x:frequency [0-1], y:frequency [0-1],
			        s: score,
			        bg: background score,
			        as: association score,
			        cat25k: freq per 25k in category,
			        cat: count in category,
			        ncat: count in non-category,
			        catdocs: [docnum, ...],
			        ncatdocs: [docnum, ...]
			        ncat25k: freq per 25k in non-category}}
		'''
		chart_json = ScatterChart.to_dict(
			self,
			category,
			category_name=category_name,
			not_category_name=not_category_name,
			scores=scores,
			transform=transform,
			title_case_names=title_case_names)
		fetch_docs = self._make_docs_getter(max_docs_per_category, alternative_text_field)
		chart_json['docs'] = self._get_docs_structure(fetch_docs, metadata)
		return chart_json
Example #15
0
 def test_to_json(self):
     '''Check the top-level keys, the info keys and one sample term record of to_dict output.'''
     tdm = build_hamlet_jz_term_doc_mat()
     # NOTE: a check that the default minimum_term_frequency raises
     # NoWordMeetsTermFrequencyRequirementsError is disabled here.
     chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
     j = chart.to_dict('hamlet')

     self.assertEqual(set(j.keys()), {'info', 'data'})
     self.assertEqual(
         set(j['info'].keys()),
         {
             'not_category_name', 'category_name', 'category_terms',
             'not_category_terms', 'category_internal_name',
             'not_category_internal_names', 'categories',
         })

     # Only the key set, 'term', 'cat25k' and 'ncat25k' are compared below.
     expected = {
         'term': 'art',
         'x': 0.0, 'y': 0.42,
         'ox': 0, 'oy': 0.42,
         'cat25k': 758, 'ncat25k': 0,
         'neut25k': 0, 'neut': 0,
         's': 0.5, 'os': 3, 'bg': 3,
     }
     datum = self._get_data_example(j)
     for field in ('cat25k', 'ncat25k'):
         np.testing.assert_almost_equal(expected[field], datum[field], decimal=1)
     self.assertEqual(set(expected), set(datum.keys()))
     self.assertEqual(expected['term'], datum['term'])
Example #16
0
def produce_scattertext_html(
        term_doc_matrix,
        category,
        category_name,
        not_category_name,
        protocol='https',
        minimum_term_frequency=DEFAULT_MINIMUM_TERM_FREQUENCY,
        pmi_threshold_coefficient=DEFAULT_PMI_THRESHOLD_COEFFICIENT,
        max_terms=None,
        filter_unigrams=False,
        height_in_pixels=None,
        width_in_pixels=None,
        term_ranker=termranking.AbsoluteFrequencyRanker):
    '''Returns html code of visualization.

    Parameters
    ----------
    term_doc_matrix : TermDocMatrix
        Corpus to use
    category : str
        name of category column
    category_name: str
        name of category to mine for
    not_category_name: str
        name of everything that isn't in category
    protocol : str
        optional, protocol to use, http or https
    minimum_term_frequency : int, optional
        Minimum number of times word needs to appear to make it into visualization.
        Defaults to DEFAULT_MINIMUM_TERM_FREQUENCY.
    pmi_threshold_coefficient : int, optional
        PMI filter setting forwarded to ScatterChart.
        Defaults to DEFAULT_PMI_THRESHOLD_COEFFICIENT.
    max_terms : int, optional
        Maximum number of terms to include in visualization.
    filter_unigrams : bool
        default False, do we filter unigrams that only occur in one bigram
    width_in_pixels: int
        width of viz in pixels, if None, default to JS's choice
    height_in_pixels: int
        height of viz in pixels, if None, default to JS's choice
    term_ranker : TermRanker
        TermRanker class for determining term frequency ranks.

    Returns
    -------
        str, html of visualization
    '''
    chart = ScatterChart(
        term_doc_matrix=term_doc_matrix,
        minimum_term_frequency=minimum_term_frequency,
        pmi_threshold_coefficient=pmi_threshold_coefficient,
        filter_unigrams=filter_unigrams,
        max_terms=max_terms,
        term_ranker=term_ranker)
    scatter_chart_data = chart.to_dict(
        category=category,
        category_name=category_name,
        not_category_name=not_category_name,
        transform=percentile_alphabetical)
    # Wrap the chart data for the HTML assembler, then render.
    assembly = HTMLVisualizationAssembly(
        VizDataAdapter(scatter_chart_data),
        width_in_pixels,
        height_in_pixels)
    return assembly.to_html(protocol=protocol)
Example #17
0
 def test_inject_coordinates_original(self):
     '''Injected original_x / original_y must come back as 'ox' / 'oy' in each term record.'''
     tdm = build_hamlet_jz_term_doc_mat()
     freq_df = tdm.get_term_freq_df()
     scatter_chart = ScatterChart(term_doc_matrix=tdm,
                                  minimum_term_frequency=0)
     # Builtin float: the np.float alias was deprecated in NumPy 1.20 and removed in 1.24.
     x = freq_df[freq_df.columns[1]].astype(float)
     y = freq_df[freq_df.columns[0]].astype(float)
     # Plot coordinates are divided by the column maximum; the raw values ride
     # along as the "original" coordinates.
     scatter_chart.inject_coordinates(x / x.max(), y / y.max(), original_x=x, original_y=y)
     j = scatter_chart.to_dict('hamlet')
     self.assertEqual(j['data'][0].keys(),
                      {'x', 'os', 'y', 'ncat25k', 'neut', 'cat25k', 'ox', 'neut25k', 'extra25k', 'extra', 'oy',
                       'term',
                       's', 'bg'})
     and_term = [t for t in j['data'] if t['term'] == 'and'][0]
     self.assertEqual(and_term['ox'], 0)
     self.assertEqual(and_term['oy'], 1)
Example #18
0
 def test_p_vals(self):
     '''Supplying a term_significance scorer must add a 'p' field to term records.'''
     tdm = build_hamlet_jz_term_doc_mat()
     significance = LogOddsRatioUninformativeDirichletPrior()
     chart = ScatterChart(term_doc_matrix=tdm,
                          minimum_term_frequency=0,
                          term_significance=significance)
     j = chart.to_dict('hamlet')
     datum = self._get_data_example(j)
     self.assertIn('p', datum.keys())
Example #19
0
 def test_terms_to_include(self):
     '''With terms_to_include set, the emitted terms must be exactly that list.'''
     tdm = build_hamlet_jz_term_doc_mat()
     terms_to_include = sorted(
         ['both worlds', 'thou', 'the', 'of', 'st', 'returned', 'best'])
     chart = ScatterChart(term_doc_matrix=tdm,
                          minimum_term_frequency=0,
                          terms_to_include=terms_to_include)
     j = chart.to_dict('hamlet', 'HAMLET', 'NOT HAMLET')
     # Compare in sorted order so the chart's own ordering does not matter.
     emitted_terms = sorted(t['term'] for t in j['data'])
     self.assertEqual(emitted_terms, terms_to_include)
Example #20
0
    def test_to_dict_without_categories(self):
        '''to_dict_without_categories needs injected coordinates, then emits only 'data'.'''
        tdm = get_term_doc_matrix_without_categories()
        chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)

        # No coordinates injected yet, so the call must fail.
        with self.assertRaises(NeedToInjectCoordinatesException):
            chart.to_dict_without_categories()

        # x: column sums of the term-document matrix;
        # y: column sums after reducing every entry to 0/1.
        column_totals = tdm.get_term_doc_mat().sum(axis=0).A1
        column_nonzero_counts = (tdm.get_term_doc_mat()
                                 .astype(bool)
                                 .astype(int)
                                 .sum(axis=0)
                                 .A1)
        chart.inject_coordinates(original_x=column_totals,
                                 original_y=column_nonzero_counts,
                                 x_coords=scale(column_totals),
                                 y_coords=scale(column_nonzero_counts))

        j = chart.to_dict_without_categories()
        self.assertIsInstance(j, dict)
        self.assertEqual(set(j.keys()), {'data'})
        self.assertEqual(len(j['data']), tdm.get_num_terms())
        expected_last = {'term': 'speak',
                         'cat': 4, 'cat25k': 735,
                         'ox': 4, 'oy': 3,
                         'x': 1.0, 'y': 1.0}
        self.assertEqual(j['data'][-1], expected_last)
Example #21
0
def produce_scattertext_html(term_doc_matrix,
                             category,
                             category_name,
                             not_category_name,
                             protocol='https',
                             pmi_filter_thresold=2,
                             minimum_term_frequency=3,
                             max_terms=None,
                             filter_unigrams=False,
                             height_in_pixels=None,
                             width_in_pixels=None,
                             term_ranker=termranking.AbsoluteFrequencyRanker):
    '''Returns html code of visualization.

    Parameters
    ----------
    term_doc_matrix : TermDocMatrix
        Corpus to use
    category : str
        name of category column
    category_name: str
        name of category to mine for
    not_category_name: str
        name of everything that isn't in category
    protocol : str
        optional, protocol to use, http or https
    pmi_filter_thresold : int, optional
        Forwarded to ScatterChart as pmi_threshold_coefficient.  Default 2.
    minimum_term_frequency : int, optional
        Forwarded to ScatterChart as minimum_term_frequency.  Default 3.
    max_terms : int, optional
        Forwarded to ScatterChart as max_terms.  Default None.
    filter_unigrams : bool
        default False, do we filter unigrams that only occur in one bigram
    width_in_pixels: int
        width of viz in pixels, if None, default to JS's choice
    height_in_pixels: int
        height of viz in pixels, if None, default to JS's choice
    term_ranker : TermRanker
        TermRanker class for determining term frequency ranks.

    Returns
    -------
        str, html of visualization
    '''
    chart = ScatterChart(
        term_doc_matrix=term_doc_matrix,
        minimum_term_frequency=minimum_term_frequency,
        pmi_threshold_coefficient=pmi_filter_thresold,
        filter_unigrams=filter_unigrams,
        max_terms=max_terms,
        term_ranker=term_ranker)
    scatter_chart_data = chart.to_dict(
        category=category,
        category_name=category_name,
        not_category_name=not_category_name,
        transform=percentile_alphabetical)
    # Wrap the chart data for the HTML assembler, then render.
    assembly = HTMLVisualizationAssembly(
        VizDataAdapter(scatter_chart_data),
        width_in_pixels,
        height_in_pixels)
    return assembly.to_html(protocol=protocol)
Example #22
0
    def test_max_terms(self):
        '''Check how many term records to_dict emits for several max_terms settings.

        Caps of 2 and 10 must be hit exactly; a cap of 10000 or None must yield
        51 records for this fixture.
        '''
        tdm = build_hamlet_jz_term_doc_mat()
        # NOTE: a check that the default minimum_term_frequency raises
        # NoWordMeetsTermFrequencyRequirementsError is disabled here.

        expected_count_by_cap = [(2, 2), (10, 10), (10000, 51), (None, 51)]
        for cap, expected_count in expected_count_by_cap:
            chart = ScatterChart(term_doc_matrix=tdm,
                                 minimum_term_frequency=0,
                                 max_terms=cap)
            j = chart.to_dict('hamlet')
            self.assertEqual(expected_count, len(j['data']))
 def test_inject_metadata_descriptions(self):
     '''inject_metadata_descriptions validates input, is fluent, and adds 'metadescriptions'.'''
     tdm = build_hamlet_jz_corpus_with_meta()
     chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
     description = 'asjdkflasdjklfsadjk jsdkafsd'

     # A non-dict argument trips an assertion.
     with self.assertRaises(AssertionError):
         chart.inject_metadata_descriptions(3323)

     # Descriptions for metadata terms missing from the corpus are rejected by name.
     if (sys.version_info > (3, 0)):
         missing_blah = 'The following meta data terms are not present: blah'
         with self.assertRaisesRegex(Exception, missing_blah):
             chart.inject_metadata_descriptions({'blah': description})
         missing_cat2 = 'The following meta data terms are not present: cat2'
         with self.assertRaisesRegex(Exception, missing_cat2):
             chart.inject_metadata_descriptions({'cat1': description,
                                                 'cat2': 'asdf'})

     # A valid call returns the chart itself.
     assert chart == chart.inject_metadata_descriptions({'cat1': description})
     j = chart.to_dict('hamlet')
     self.assertEqual(set(j.keys()), {'info', 'data', 'metadescriptions'})
Example #24
0
 def test_score_transform(self):
     '''An identity score_transform must change the summed 's' scores.

     Two charts are built over the same corpus, one with the default
     score_transform and one with ``lambda x: x``; the totals of the 's'
     field across all emitted terms are required to differ.
     '''
     corpus = get_test_corpus()
     sc = ScatterChart(term_doc_matrix=corpus, minimum_term_frequency=0)
     default_data = sc.to_dict('hamlet')['data']
     sc = ScatterChart(term_doc_matrix=corpus,
                       minimum_term_frequency=0,
                       score_transform=lambda x: x)
     identity_data = sc.to_dict('hamlet')['data']
     # assertNotEqual instead of a bare assert: it still runs under ``python -O``
     # and reports both sums when the check fails.
     self.assertNotEqual(sum(datum['s'] for datum in default_data),
                         sum(datum['s'] for datum in identity_data))
Example #25
0
 def test_to_json_use_non_text_features(self):
     '''With use_non_text_features=True the emitted terms are the metadata features.'''
     tdm = build_hamlet_jz_corpus_with_meta()
     # NOTE: a check that the default minimum_term_frequency raises
     # NoWordMeetsTermFrequencyRequirementsError is disabled here.
     chart = ScatterChart(term_doc_matrix=tdm,
                          minimum_term_frequency=0,
                          use_non_text_features=True)
     j = chart.to_dict('hamlet')

     self.assertEqual(set(j.keys()), {'info', 'data'})
     self.assertEqual(
         set(j['info'].keys()),
         {
             'not_category_name', 'category_name', 'category_terms',
             'not_category_terms', 'category_internal_name',
         })
     emitted_terms = {t['term'] for t in j['data']}
     self.assertEqual(
         emitted_terms,
         {'cat6', 'cat4', 'cat9', 'cat5', 'cat1', 'cat3', 'cat2'})
     # Must not raise: the whole structure has to be JSON-serializable.
     json.dumps(j)
Example #26
0
    def to_dict(self,
                category,
                category_name=None,
                not_category_name=None,
                scores=None,
                metadata=None,
                max_docs_per_category=None,
                transform=percentile_alphabetical):
        '''Build the ScatterChart dictionary and attach a 'docs' section to it.

        :param category: Category to annotate
        :param category_name: Name of category which will appear on web site.
        :param not_category_name: Name of non-category axis which will appear on web site.
        :param scores: Scores to use.  Forwarded to ScatterChart.to_dict.
        :param metadata: None or array-like.  List of metadata for each document.
        :param max_docs_per_category: None or int.  Maximum number of documents to store per category.
        :param transform: Defaults to percentile_alphabetical
        :return: the dictionary from ScatterChart.to_dict plus a 'docs' key; layout
                 as documented by the original author:
                 {info: {category_name: ..., not_category_name},
                  docs: {'texts': [doc1text, ...],
                         'labels': [1, 0, ...],
                         'meta': ['<b>blah</b>', '<b>blah</b>']}
                  data: {term:, x:frequency [0-1], y:frequency [0-1],
                         s: score,
                         bg: background score,
                         as: association score,
                         cat25k: freq per 25k in category,
                         cat: count in category,
                         ncat: count in non-category,
                         catdocs: [docnum, ...],
                         ncatdocs: [docnum, ...]
                         ncat25k: freq per 25k in non-category}}
        '''
        chart_json = ScatterChart.to_dict(
            self,
            category,
            category_name=category_name,
            not_category_name=not_category_name,
            scores=scores,
            transform=transform)
        fetch_docs = self._make_docs_getter(max_docs_per_category)
        chart_json['docs'] = self._get_docs_structure(fetch_docs, metadata)
        return chart_json
    def test_to_dict_without_categories(self):
        '''to_dict_without_categories needs injected coordinates, then emits only 'data'.'''
        tdm = get_term_doc_matrix_without_categories()
        chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)

        # No coordinates injected yet, so the call must fail.
        with self.assertRaises(NeedToInjectCoordinatesException):
            chart.to_dict_without_categories()

        # x: column sums of the term-document matrix;
        # y: column sums after reducing every entry to 0/1.
        column_totals = tdm.get_term_doc_mat().sum(axis=0).A1
        column_nonzero_counts = (tdm.get_term_doc_mat()
                                 .astype(bool)
                                 .astype(int)
                                 .sum(axis=0)
                                 .A1)
        chart.inject_coordinates(original_x=column_totals,
                                 original_y=column_nonzero_counts,
                                 x_coords=scale(column_totals),
                                 y_coords=scale(column_nonzero_counts))

        j = chart.to_dict_without_categories()
        self.assertIsInstance(j, dict)
        self.assertEqual(set(j.keys()), {'data'})
        self.assertEqual(len(j['data']), tdm.get_num_terms())
        expected_last = {'term': 'speak',
                         'cat': 4, 'cat25k': 735,
                         'ox': 4, 'oy': 3,
                         'x': 1.0, 'y': 1.0}
        self.assertEqual(j['data'][-1], expected_last)
 def test_inject_coordinates(self):
     '''inject_coordinates must raise CoordinatesNotRightException for bad input.

     Rejected inputs: empty or length-mismatched sequences, raw (unscaled)
     frequencies on either axis, and negated coordinates on either axis.
     Frequencies divided by their column maximum are accepted.
     '''
     tdm = build_hamlet_jz_term_doc_mat()
     freq_df = tdm.get_term_freq_df()
     scatter_chart = ScatterChart(term_doc_matrix=tdm,
                                  minimum_term_frequency=0)
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates([], [])
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(freq_df[freq_df.columns[0]], [])
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates([], freq_df[freq_df.columns[0]])
     # Builtin float: the np.float alias was deprecated in NumPy 1.20 and removed in 1.24.
     x = freq_df[freq_df.columns[1]].astype(float)
     y = freq_df[freq_df.columns[0]].astype(float)
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(x, y)
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(x, y / y.max())
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(x / x.max(), y)
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(-x / x.max(), -y / y.max())
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(-x / x.max(), y / y.max())
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(x / x.max(), -y / y.max())
     # Happy path: both axes divided by their maximum must not raise.
     scatter_chart.inject_coordinates(x / x.max(), y / y.max())
Example #29
0
 def _add_term_freq_to_json_df(self, json_df, term_freq_df, category):
     '''Extend json_df with integer category / not-category counts.

     Runs ScatterChart._add_term_freq_to_json_df first, then adds:

     * 'cat'  -- term_freq_df[category + ' freq'] cast to int
     * 'ncat' -- term_freq_df['not cat freq'] cast to int

     json_df is modified in place; nothing is returned.
     '''
     ScatterChart._add_term_freq_to_json_df(self, json_df, term_freq_df,
                                            category)
     # Builtin int: the np.int alias was deprecated in NumPy 1.20 and removed in 1.24.
     json_df['cat'] = term_freq_df[category + ' freq'].astype(int)
     json_df['ncat'] = term_freq_df['not cat freq'].astype(int)
Example #30
0
    def to_dict(self,
                category,
                category_name=None,
                not_category_name=None,
                scores=None,
                metadata=None,
                max_docs_per_category=None,
                transform=percentile_alphabetical,
                alternative_text_field=None,
                title_case_names=False,
                not_categories=None,
                neutral_categories=None,
                extra_categories=None,
                neutral_category_name=None,
                extra_category_name=None,
                background_scorer=None,
                include_term_category_counts=False):
        '''Build the ScatterChart dictionary and add documents, display names and optional counts.

        Parameters
        ----------
        category : str
            Category to annotate.  Exact value of category.
        category_name : str, optional
            Name of category which will appear on web site.  Forwarded to ScatterChart.to_dict.
        not_category_name : str, optional
            Name of ~category which will appear on web site.  Forwarded to ScatterChart.to_dict.
        scores : np.array, optional
            Scores to use for coloring.  Forwarded to ScatterChart.to_dict.
        metadata : None or array-like
            List of metadata for each document; passed to self._get_docs_structure.
        max_docs_per_category : None or int, optional
            Maximum number of documents to store per category; passed to
            self._make_docs_getter.
        transform : function, optional
            Function for ranking terms.  Defaults to percentile_alphabetical.
        alternative_text_field : str or None, optional
            Field to display in place of parsed text; passed to self._make_docs_getter.
        title_case_names : bool, default False
            Should the program title-case the category and not-category names?
        not_categories : list, optional
            List of categories to use as "not category".  Forwarded to ScatterChart.to_dict.
        neutral_categories : list, optional
            List of categories to use as neutral.  Forwarded to ScatterChart.to_dict.
        extra_categories : list, optional
            List of categories to use as extra.  Forwarded to ScatterChart.to_dict.
        neutral_category_name : str, optional
            Stored under info['neutral_category_name']; None becomes 'Neutral'.
        extra_category_name : str, optional
            Stored under info['extra_category_name']; None becomes 'Extra'.
        background_scorer : CharacteristicScorer, optional
            Used for bg scores.  Forwarded to ScatterChart.to_dict.
        include_term_category_counts : bool, default False
            When true, adds a 'termCounts' entry computed by self._get_term_doc_counts
            over the terms present in 'data'.

        Returns
        -------
        dict
            The dictionary from ScatterChart.to_dict, extended in place.  Layout as
            documented by the original author:
            {info: {'category_name': full category name, ...},
             docs: {'texts': [doc1text, ...],
                    'labels': [1, 0, ...],
                    'meta': ['<b>blah</b>', '<b>blah</b>']},

             // if include_term_category_counts
             termCounts: [term num -> [total occurrences, total documents, variance], ... for the number of categories]

             data: {term:term,
                    x:frequency [0-1],
                    y:frequency [0-1],
                    s: score,
                    bg: background score,
                    as: association score,
                    cat25k: freq per 25k in category,
                    cat: count in category,
                    ncat: count in non-category,
                    catdocs: [docnum, ...],
                    ncatdocs: [docnum, ...]
                    ncat25k: freq per 25k in non-category}}
        '''
        chart_json = ScatterChart.to_dict(
            self,
            category,
            category_name=category_name,
            not_category_name=not_category_name,
            scores=scores,
            transform=transform,
            title_case_names=title_case_names,
            not_categories=not_categories,
            neutral_categories=neutral_categories,
            extra_categories=extra_categories,
            background_scorer=background_scorer)
        fetch_docs = self._make_docs_getter(max_docs_per_category,
                                            alternative_text_field)
        chart_json['docs'] = self._get_docs_structure(fetch_docs, metadata)

        # Fall back to the stock display names when the caller gave none.
        info = chart_json['info']
        info['neutral_category_name'] = (
            'Neutral' if neutral_category_name is None else neutral_category_name)
        info['extra_category_name'] = (
            'Extra' if extra_category_name is None else extra_category_name)

        if not include_term_category_counts:
            return chart_json

        # Counts are keyed by the terms that made it into the chart, in 'data' order.
        terms = np.array([record['term'] for record in chart_json['data']])
        chart_json['termCounts'] = self._get_term_doc_counts(terms)
        return chart_json
Example #31
0
    def test_inject_metadata_term_lists(self):
        '''Exercise `inject_metadata_term_lists`: rejected inputs, return type and `to_dict` keys.'''
        # A chart built from the plain term-doc matrix is expected to refuse the injection.
        chart_without_meta = ScatterChart(term_doc_matrix=build_hamlet_jz_term_doc_mat(),
                                          minimum_term_frequency=0)
        with self.assertRaises(TermDocMatrixHasNoMetadataException):
            chart_without_meta.inject_metadata_term_lists({'blah': ['a', 'adsf', 'asfd']})

        chart = ScatterChart(term_doc_matrix=build_hamlet_jz_corpus_with_meta(),
                             minimum_term_frequency=0,
                             use_non_text_features=True)

        # Each malformed argument below is expected to raise TypeError, checked in order.
        malformed_arguments = (
            {'blash': [3, 1]},
            {3: ['a', 'b']},
            {'a': {'a', 'b'}},
            3,
        )
        for malformed in malformed_arguments:
            with self.assertRaises(TypeError):
                chart.inject_metadata_term_lists(malformed)

        # A well-formed call returns an object whose type is exactly ScatterChart.
        returned = chart.inject_metadata_term_lists({'a': ['a', 'b']})
        self.assertEqual(type(returned), ScatterChart)

        output = chart.to_dict('hamlet')
        self.assertEqual(set(output.keys()), {'info', 'data', 'metalists'})
        expected_info_keys = {
            'not_category_name',
            'category_name',
            'category_terms',
            'not_category_terms',
            'category_internal_name',
            'not_category_internal_names',
            'extra_category_internal_names',
            'neutral_category_internal_names',
            'categories',
        }
        self.assertEqual(set(output['info'].keys()), expected_info_keys)
 def test_resuse_is_disabled(self):
     '''Verify that calling `to_dict` a second time on the same chart raises.'''
     chart = ScatterChart(term_doc_matrix=get_test_corpus(), minimum_term_frequency=0)
     chart.to_dict('hamlet')
     # Callable form of assertRaises: invokes chart.to_dict('hamlet') and expects an Exception.
     self.assertRaises(Exception, chart.to_dict, 'hamlet')
Example #33
0
 def test_resuse_is_disabled(self):
     '''Verify that calling `to_dict` a second time on the same chart raises.'''
     chart = ScatterChart(term_doc_matrix=get_test_corpus(), minimum_term_frequency=0)
     chart.to_dict('hamlet')
     # Callable form of assertRaises: invokes chart.to_dict('hamlet') and expects an Exception.
     self.assertRaises(Exception, chart.to_dict, 'hamlet')
    def test_inject_metadata_term_lists(self):
        '''Exercise `inject_metadata_term_lists`: rejected inputs, return type and `to_dict` keys.'''
        # A chart built from the plain term-doc matrix is expected to refuse the injection.
        chart_without_meta = ScatterChart(term_doc_matrix=build_hamlet_jz_term_doc_mat(),
                                          minimum_term_frequency=0)
        with self.assertRaises(TermDocMatrixHasNoMetadataException):
            chart_without_meta.inject_metadata_term_lists({'blah': ['a', 'adsf', 'asfd']})

        chart = ScatterChart(term_doc_matrix=build_hamlet_jz_corpus_with_meta(),
                             minimum_term_frequency=0,
                             use_non_text_features=True)

        # Each malformed argument below is expected to raise TypeError, checked in order.
        malformed_arguments = (
            {'blash': [3, 1]},
            {3: ['a', 'b']},
            {'a': {'a', 'b'}},
            3,
        )
        for malformed in malformed_arguments:
            with self.assertRaises(TypeError):
                chart.inject_metadata_term_lists(malformed)

        # A well-formed call returns an object whose type is exactly ScatterChart.
        returned = chart.inject_metadata_term_lists({'a': ['a', 'b']})
        self.assertEqual(type(returned), ScatterChart)

        output = chart.to_dict('hamlet')
        self.assertEqual(set(output.keys()), {'info', 'data', 'metalists'})
        expected_info_keys = {
            'not_category_name',
            'category_name',
            'category_terms',
            'not_category_terms',
            'category_internal_name',
            'not_category_internal_names',
            'extra_category_internal_names',
            'neutral_category_internal_names',
            'categories',
        }
        self.assertEqual(set(output['info'].keys()), expected_info_keys)
Example #35
0
 def test_inject_coordinates(self):
     '''Check that `inject_coordinates` raises for the bad inputs below and accepts max-scaled ones.'''
     tdm = build_hamlet_jz_term_doc_mat()
     freq_df = tdm.get_term_freq_df()
     scatter_chart = ScatterChart(term_doc_matrix=tdm,
                                  minimum_term_frequency=0)

     # An empty list on either axis (or both) must be rejected.
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates([], [])
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(freq_df[freq_df.columns[0]], [])
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates([], freq_df[freq_df.columns[0]])

     # `np.float` was removed in NumPy 1.24; it was only an alias for the builtin
     # `float`, so casting with `float` yields the same dtype on every NumPy version.
     x = freq_df[freq_df.columns[1]].astype(float)
     y = freq_df[freq_df.columns[0]].astype(float)

     # Unscaled values on either axis (or both) must be rejected.
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(x, y)
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(x, y / y.max())
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(x / x.max(), y)

     # Negated (max-scaled) values on either axis (or both) must be rejected.
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(-x / x.max(), -y / y.max())
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(-x / x.max(), y / y.max())
     with self.assertRaises(CoordinatesNotRightException):
         scatter_chart.inject_coordinates(x / x.max(), -y / y.max())

     # Both axes divided by their own maximum: the call must complete without raising.
     scatter_chart.inject_coordinates(x / x.max(), y / y.max())
 def _add_term_freq_to_json_df(self, json_df, term_freq_df, category):
     '''Add raw count columns (and optional per-term metadata) to `json_df` in place.

     Runs `ScatterChart._add_term_freq_to_json_df` first, then writes:

     * ``json_df['cat']``  -- ``term_freq_df[category + ' freq']`` cast to int
     * ``json_df['ncat']`` -- ``term_freq_df['not cat freq']`` cast to int
     * ``json_df['etc']``  -- only when ``self._term_metadata`` is not None: the
       entry of ``self._term_metadata`` for each value of ``term_freq_df['term']``,
       or an empty dict when the term has no entry.

     Parameters
     ----------
     json_df : data frame receiving the new columns (mutated in place)
     term_freq_df : data frame providing the frequency and term columns read above
     category : str
         Category name; used as the prefix of the ``' freq'`` column.
     '''
     ScatterChart._add_term_freq_to_json_df(self, json_df, term_freq_df, category)
     # `np.int` was removed in NumPy 1.24; it was only an alias for the builtin
     # `int`, so casting with `int` is equivalent and works on every NumPy version.
     json_df['cat'] = term_freq_df[category + ' freq'].astype(int)
     json_df['ncat'] = term_freq_df['not cat freq'].astype(int)
     if self._term_metadata is not None:
         json_df['etc'] = term_freq_df['term'].apply(lambda term: self._term_metadata.get(term, {}))
    def to_dict(self,
                category,
                category_name=None,
                not_category_name=None,
                scores=None,
                metadata=None,
                max_docs_per_category=None,
                transform=percentile_alphabetical,
                alternative_text_field=None,
                title_case_names=False,
                not_categories=None,
                neutral_categories=None,
                extra_categories=None,
                neutral_category_name=None,
                extra_category_name=None,
                background_scorer=None,
                include_term_category_counts=False):
        '''
        Build the `ScatterChart.to_dict` structure and attach document data to it.

        Parameters
        ----------
        category : str
            Category to annotate.  Exact value of category.
        category_name : str, optional
            Name of category which will appear on web site. Default None is same as category.
        not_category_name : str, optional
            Name of ~category which will appear on web site. Default None is same as "not " + category.
        scores : np.array, optional
            Scores to use for coloring.  Forwarded to ScatterChart.to_dict.
        metadata : None or array-like
            List of metadata for each document.  Passed to `_get_docs_structure`.
        max_docs_per_category : None or int, optional
            Maximum number of documents to store per category.  Passed to `_make_docs_getter`.
        transform : function, optional
            Function for ranking terms.  Defaults to `percentile_alphabetical`.
        alternative_text_field : str or None, optional
            Field in the dataframe used to make the corpus, displayed in place of the parsed text.
            Can only be used if the corpus is a ParsedCorpus instance.
        title_case_names : bool, default False
            Should the program title-case the category and not-category names?
        not_categories : list, optional
            List of categories to use as "not category".  Defaults to all others.
        neutral_categories : list, optional
            List of categories to use as neutral.  Defaults [].
        extra_categories : list, optional
            List of categories to use as extra.  Defaults [].
        neutral_category_name : str, optional
            Name of the neutral column.  "Neutral" when None.
        extra_category_name : str, optional
            Name of the extra column.  "Extra" when None.
        background_scorer : CharacteristicScorer, optional
            Used for bg scores
        include_term_category_counts : bool, default False
            If True, the result of `_get_term_doc_counts` for the charted terms is stored
            under the 'termCounts' key.

        Returns
        -------
        dictionary {info: {'category_name': full category name, ...,
                           'neutral_category_name': neutral column name,
                           'extra_category_name': extra column name},
                    docs: {'texts': [doc1text, ...],
                            'labels': [1, 0, ...],
                            'meta': ['<b>blah</b>', '<b>blah</b>']},

                    // if include_term_category_counts
                    termCounts: [term num -> [total occurrences, total documents, variance], ... for the number of categories]

                    data: {term:term,
                           x:frequency [0-1],
                           y:frequency [0-1],
                           s: score,
                           bg: background score,
                           as: association score,
                           cat25k: freq per 25k in category,
                           cat: count in category,
                           ncat: count in non-category,
                           catdocs: [docnum, ...],
                           ncatdocs: [docnum, ...]
                           ncat25k: freq per 25k in non-category,
                           etc: term specific dictionary (if inject_term_metadata is called and contains terms)}}
        '''
        # The parent class produces the base structure ('info', 'data', ...).
        chart_dict = ScatterChart.to_dict(
            self,
            category,
            category_name=category_name,
            not_category_name=not_category_name,
            scores=scores,
            transform=transform,
            title_case_names=title_case_names,
            not_categories=not_categories,
            neutral_categories=neutral_categories,
            extra_categories=extra_categories,
            background_scorer=background_scorer,
        )

        getter = self._make_docs_getter(max_docs_per_category,
                                        alternative_text_field)
        chart_dict['docs'] = self._get_docs_structure(getter, metadata)

        # Record the column labels, falling back to the stock names when none are given.
        info = chart_dict['info']
        info['neutral_category_name'] = (
            'Neutral' if neutral_category_name is None else neutral_category_name
        )
        info['extra_category_name'] = (
            'Extra' if extra_category_name is None else extra_category_name
        )

        if include_term_category_counts:
            charted_terms = np.array([entry['term'] for entry in chart_dict['data']])
            chart_dict['termCounts'] = self._get_term_doc_counts(charted_terms)
        return chart_dict