コード例 #1
0
 def test_score_transform(self):
     # Build one chart with the default score transform and one with an
     # identity transform; the totals of the 's' values must differ.
     corpus = get_test_corpus()
     default_chart = ScatterChart(term_doc_matrix=corpus, minimum_term_frequency=0)
     default_dict = default_chart.to_dict('hamlet')
     identity_chart = ScatterChart(term_doc_matrix=corpus,
                                   minimum_term_frequency=0,
                                   score_transform=lambda x: x)
     identity_dict = identity_chart.to_dict('hamlet')
     default_total = sum(datum['s'] for datum in default_dict['data'])
     identity_total = sum(datum['s'] for datum in identity_dict['data'])
     assert default_total != identity_total
コード例 #2
0
 def test_score_transform(self):
     # Compare the summed 's' values of a default chart against a chart that
     # was given an identity score_transform; they are expected to differ.
     corpus = get_test_corpus()

     plain_chart = ScatterChart(term_doc_matrix=corpus, minimum_term_frequency=0)
     plain_output = plain_chart.to_dict('hamlet')

     identity_chart = ScatterChart(term_doc_matrix=corpus, minimum_term_frequency=0, score_transform=lambda x: x)
     identity_output = identity_chart.to_dict('hamlet')

     plain_sum = sum(entry['s'] for entry in plain_output['data'])
     identity_sum = sum(entry['s'] for entry in identity_output['data'])
     assert plain_sum != identity_sum
コード例 #3
0
	def to_dict(self,
	            category,
	            category_name=None,
	            not_category_name=None,
	            scores=None,
	            metadata=None,
	            max_docs_per_category=None,
	            transform=percentile_alphabetical,
	            alternative_text_field=None,
	            title_case_names=False):
		'''
		Build the chart dictionary via ScatterChart.to_dict and attach a 'docs' entry.

		Parameters
		----------
		category : str
			Category to annotate.  Exact value of category.
		category_name : str, optional
			Name of category which will appear on web site.  Forwarded to ScatterChart.to_dict.
		not_category_name : str, optional
			Name of ~category which will appear on web site.  Forwarded to ScatterChart.to_dict.
		scores : np.array, optional
			Scores to use for coloring.  Forwarded unchanged to ScatterChart.to_dict.
		metadata : None or array-like, optional
			List of metadata for each document.  Handed to self._get_docs_structure.
		max_docs_per_category : None or int, optional
			Maximum number of documents to store per category.  Handed to self._make_docs_getter.
		transform : function, optional
			Function for ranking terms.  Defaults to percentile_alphabetical.
		alternative_text_field : str or None, optional
			Field to display in place of parsed text.  Handed to self._make_docs_getter.
		title_case_names : bool, default False
			Should the program title-case the category and not-category names?

		Returns
		-------
		dict
			The dictionary returned by ScatterChart.to_dict with one added key, 'docs',
			holding the result of self._get_docs_structure.  The earlier documentation
			of this method describes the layout as
			{info: {...}, docs: {'texts': [...], 'labels': [...], 'meta': [...]}, data: [...]};
			that layout is produced by helpers outside this method.
		'''
		chart_dict = ScatterChart.to_dict(self,
		                                  category,
		                                  category_name=category_name,
		                                  not_category_name=not_category_name,
		                                  scores=scores,
		                                  transform=transform,
		                                  title_case_names=title_case_names)
		# The getter is created after the base dictionary, then used to fill 'docs'.
		getter = self._make_docs_getter(max_docs_per_category, alternative_text_field)
		docs_structure = self._get_docs_structure(getter, metadata)
		chart_dict['docs'] = docs_structure
		return chart_dict
コード例 #4
0
    def test_inject_metadata_term_lists(self):
        # A chart built on a term-doc matrix without metadata must reject the call.
        chart_without_meta = ScatterChart(term_doc_matrix=build_hamlet_jz_term_doc_mat(),
                                          minimum_term_frequency=0)
        with self.assertRaises(TermDocMatrixHasNoMetadataException):
            chart_without_meta.inject_metadata_term_lists({'blah': ['a', 'adsf', 'asfd']})

        chart = ScatterChart(term_doc_matrix=build_hamlet_jz_corpus_with_meta(),
                             minimum_term_frequency=0,
                             use_non_text_features=True)

        # Each malformed argument below is expected to raise TypeError.
        malformed_arguments = [
            {'blash': [3, 1]},
            {3: ['a', 'b']},
            {'a': {'a', 'b'}},
            3,
        ]
        for bad_argument in malformed_arguments:
            with self.assertRaises(TypeError):
                chart.inject_metadata_term_lists(bad_argument)

        # A well-formed call returns a ScatterChart.
        returned = chart.inject_metadata_term_lists({'a': ['a', 'b']})
        self.assertEqual(type(returned), ScatterChart)

        chart_dict = chart.to_dict('hamlet')
        self.assertEqual(set(chart_dict.keys()), {'info', 'data', 'metalists'})
        expected_info_keys = {
            'not_category_name',
            'category_name',
            'category_terms',
            'not_category_terms',
            'category_internal_name',
            'not_category_internal_names',
            'extra_category_internal_names',
            'neutral_category_internal_names',
            'categories',
        }
        self.assertEqual(set(chart_dict['info'].keys()), expected_info_keys)
コード例 #5
0
    def test_inject_metadata_term_lists(self):
        # Without metadata in the term-doc matrix, injecting term lists must fail.
        no_meta_chart = ScatterChart(term_doc_matrix=build_hamlet_jz_term_doc_mat(),
                                     minimum_term_frequency=0)
        with self.assertRaises(TermDocMatrixHasNoMetadataException):
            no_meta_chart.inject_metadata_term_lists({'blah': ['a', 'adsf', 'asfd']})

        meta_chart = ScatterChart(term_doc_matrix=build_hamlet_jz_corpus_with_meta(),
                                  minimum_term_frequency=0,
                                  use_non_text_features=True)

        # Wrongly typed arguments: every one of them should raise TypeError.
        for rejected in ({'blash': [3, 1]}, {3: ['a', 'b']}, {'a': {'a', 'b'}}, 3):
            with self.assertRaises(TypeError):
                meta_chart.inject_metadata_term_lists(rejected)

        # The valid call hands back a ScatterChart instance.
        result = meta_chart.inject_metadata_term_lists({'a': ['a', 'b']})
        self.assertEqual(type(result), ScatterChart)

        output = meta_chart.to_dict('hamlet')
        self.assertEqual(set(output.keys()), {'info', 'data', 'metalists'})
        self.assertEqual(set(output['info'].keys()),
                         {'not_category_name',
                          'category_name',
                          'category_terms',
                          'not_category_terms',
                          'category_internal_name',
                          'not_category_internal_names',
                          'extra_category_internal_names',
                          'neutral_category_internal_names',
                          'categories'})
コード例 #6
0
 def test_inject_term_colors(self):
     # After inject_term_colors, to_dict output must carry a 'term_colors'
     # entry inside 'info'.  (The previously computed term-frequency frame
     # was never used by this test and has been dropped.)
     tdm = build_hamlet_jz_corpus_with_meta()
     scatter_chart = ScatterChart(term_doc_matrix=tdm,
                                  minimum_term_frequency=0)
     scatter_chart.inject_term_colors({'t1': '00ffee'})
     j = scatter_chart.to_dict('hamlet')
     self.assertIn('term_colors', j['info'])
コード例 #7
0
 def test_inject_term_colors(self):
     # Injecting a term -> color mapping should make 'term_colors' appear in
     # the 'info' section of the to_dict output.  The unused term-frequency
     # frame that used to be computed here has been removed.
     tdm = build_hamlet_jz_corpus_with_meta()
     scatter_chart = ScatterChart(term_doc_matrix=tdm,
                                  minimum_term_frequency=0)
     scatter_chart.inject_term_colors({'t1': '00ffee'})
     j = scatter_chart.to_dict('hamlet')
     self.assertIn('term_colors', j['info'])
コード例 #8
0
 def test_inject_metadata_descriptions(self):
     # A non-dict argument must trip an assertion inside inject_metadata_descriptions.
     chart = ScatterChart(term_doc_matrix=build_hamlet_jz_corpus_with_meta(), minimum_term_frequency=0)
     with self.assertRaises(AssertionError):
         chart.inject_metadata_descriptions(3323)
     # The regex checks below are disabled: they sit inside a bare string
     # literal, so this branch executes nothing.
     if sys.version_info > (3, 0):
         '''
         with self.assertRaisesRegex(Exception, 'The following meta data terms are not present: blah'):
             scatter_chart.inject_metadata_descriptions({'blah': 'asjdkflasdjklfsadjk jsdkafsd'})
         with self.assertRaisesRegex(Exception, 'The following meta data terms are not present: cat2'):
             scatter_chart.inject_metadata_descriptions({'cat1': 'asjdkflasdjklfsadjk jsdkafsd', 'cat2': 'asdf'})
         '''
     # A valid call returns something equal to the chart itself.
     returned = chart.inject_metadata_descriptions({'cat1': 'asjdkflasdjklfsadjk jsdkafsd'})
     assert chart == returned
     chart_dict = chart.to_dict('hamlet')
     self.assertEqual(set(chart_dict.keys()), {'info', 'data', 'metadescriptions'})
コード例 #9
0
 def test_inject_metadata_descriptions(self):
     # Passing an integer instead of a mapping is expected to raise AssertionError.
     corpus_with_meta = build_hamlet_jz_corpus_with_meta()
     chart = ScatterChart(term_doc_matrix=corpus_with_meta, minimum_term_frequency=0)
     self.assertRaises(AssertionError, chart.inject_metadata_descriptions, 3323)
     # Disabled checks: the body of this branch is only a string literal and
     # therefore has no effect at runtime.
     if sys.version_info > (3, 0):
         '''
         with self.assertRaisesRegex(Exception, 'The following meta data terms are not present: blah'):
             scatter_chart.inject_metadata_descriptions({'blah': 'asjdkflasdjklfsadjk jsdkafsd'})
         with self.assertRaisesRegex(Exception, 'The following meta data terms are not present: cat2'):
             scatter_chart.inject_metadata_descriptions({'cat1': 'asjdkflasdjklfsadjk jsdkafsd', 'cat2': 'asdf'})
         '''
     # The valid injection must compare equal to the chart it was called on.
     injected = chart.inject_metadata_descriptions({'cat1': 'asjdkflasdjklfsadjk jsdkafsd'})
     assert chart == injected
     output = chart.to_dict('hamlet')
     self.assertEqual(set(output.keys()), {'info', 'data', 'metadescriptions'})
コード例 #10
0
 def test_inject_coordinates_original(self):
     # Inject max-normalised coordinates together with the original values and
     # check that the originals are exposed as 'ox' / 'oy' in the output.
     tdm = build_hamlet_jz_term_doc_mat()
     freq_df = tdm.get_term_freq_df()
     scatter_chart = ScatterChart(term_doc_matrix=tdm,
                                  minimum_term_frequency=0)
     # The builtin float replaces the np.float alias, which was deprecated in
     # NumPy 1.20 and removed in 1.24; both denote the same dtype.
     # x comes from the second frequency column, y from the first.
     x = freq_df[freq_df.columns[1]].astype(float)
     y = freq_df[freq_df.columns[0]].astype(float)
     scatter_chart.inject_coordinates(x / x.max(), y / y.max(), original_x=x, original_y=y)
     j = scatter_chart.to_dict('hamlet')
     self.assertEqual(j['data'][0].keys(),
                      {'x', 'os', 'y', 'ncat25k', 'neut', 'cat25k', 'ox', 'neut25k', 'extra25k', 'extra', 'oy',
                       'term',
                       's', 'bg'})
     and_term = [t for t in j['data'] if t['term'] == 'and'][0]
     self.assertEqual(and_term['ox'], 0)
     self.assertEqual(and_term['oy'], 1)
コード例 #11
0
 def test_inject_coordinates_original(self):
     # Coordinates are injected as values scaled by their maximum, with the
     # unscaled series passed as original_x / original_y; the test then checks
     # the per-term keys and the 'ox' / 'oy' values for the term 'and'.
     tdm = build_hamlet_jz_term_doc_mat()
     freq_df = tdm.get_term_freq_df()
     scatter_chart = ScatterChart(term_doc_matrix=tdm,
                                  minimum_term_frequency=0)
     # np.float no longer exists in current NumPy (deprecated in 1.20, removed
     # in 1.24); the builtin float is the equivalent dtype argument.
     x = freq_df[freq_df.columns[1]].astype(float)
     y = freq_df[freq_df.columns[0]].astype(float)
     scatter_chart.inject_coordinates(x / x.max(), y / y.max(), original_x=x, original_y=y)
     j = scatter_chart.to_dict('hamlet')
     self.assertEqual(j['data'][0].keys(),
                      {'x', 'os', 'y', 'ncat25k', 'neut', 'cat25k', 'ox', 'neut25k', 'extra25k', 'extra', 'oy',
                       'term',
                       's', 'bg'})
     and_term = [t for t in j['data'] if t['term'] == 'and'][0]
     self.assertEqual(and_term['ox'], 0)
     self.assertEqual(and_term['oy'], 1)
コード例 #12
0
    def to_dict(self,
                category,
                category_name=None,
                not_category_name=None,
                scores=None,
                metadata=None,
                max_docs_per_category=None,
                transform=percentile_alphabetical):
        '''
        Build the chart dictionary via ScatterChart.to_dict and attach a 'docs' entry.

        :param category: Category to annotate.
        :param category_name: Name of category which will appear on web site.
        :param not_category_name: Name of non-category axis which will appear on web site.
        :param scores: Scores to use.  Forwarded unchanged to ScatterChart.to_dict.
        :param metadata: None or array-like.  List of metadata for each document;
            handed to self._get_docs_structure.
        :param max_docs_per_category: None or int.  Maximum number of documents to store
            per category; handed to self._make_docs_getter.
        :param transform: Function forwarded to ScatterChart.to_dict.  Defaults to
            percentile_alphabetical.
        :return: the dictionary returned by ScatterChart.to_dict with one added key,
            'docs', holding the result of self._get_docs_structure.  The earlier
            documentation of this method describes the layout as
            {info: {...}, docs: {'texts': [...], 'labels': [...], 'meta': [...]}, data: [...]};
            that layout is produced by helpers outside this method.
        '''
        chart_dict = ScatterChart.to_dict(self,
                                          category,
                                          category_name=category_name,
                                          not_category_name=not_category_name,
                                          scores=scores,
                                          transform=transform)
        # Create the getter only after the base dictionary exists, as before.
        getter = self._make_docs_getter(max_docs_per_category)
        docs_structure = self._get_docs_structure(getter, metadata)
        chart_dict['docs'] = docs_structure
        return chart_dict
コード例 #13
0
    def to_dict(self,
                category,
                category_name=None,
                not_category_name=None,
                scores=None,
                metadata=None,
                max_docs_per_category=None,
                transform=percentile_alphabetical,
                alternative_text_field=None,
                title_case_names=False,
                not_categories=None,
                neutral_categories=None,
                extra_categories=None,
                neutral_category_name=None,
                extra_category_name=None,
                background_scorer=None,
                include_term_category_counts=False):
        '''
        Build the chart dictionary via ScatterChart.to_dict and extend it.

        The additions made here are the 'docs' entry, the neutral and extra
        display names inside 'info', and, on request, the 'termCounts' entry.

        Parameters
        ----------
        category : str
            Category to annotate.  Exact value of category.
        category_name : str, optional
            Name of category which will appear on web site.  Forwarded to ScatterChart.to_dict.
        not_category_name : str, optional
            Name of ~category which will appear on web site.  Forwarded to ScatterChart.to_dict.
        scores : np.array, optional
            Scores to use for coloring.  Forwarded unchanged to ScatterChart.to_dict.
        metadata : None or array-like, optional
            List of metadata for each document.  Handed to self._get_docs_structure.
        max_docs_per_category : None or int, optional
            Maximum number of documents to store per category.  Handed to
            self._make_docs_getter.
        transform : function, optional
            Function for ranking terms.  Defaults to percentile_alphabetical.
        alternative_text_field : str or None, optional
            Field to display in place of parsed text.  Handed to self._make_docs_getter.
        title_case_names : bool, default False
            Should the program title-case the category and not-category names?
        not_categories : list, optional
            List of categories to use as "not category".  Forwarded to ScatterChart.to_dict.
        neutral_categories : list, optional
            List of categories to use as neutral.  Forwarded to ScatterChart.to_dict.
        extra_categories : list, optional
            List of categories to use as extra.  Forwarded to ScatterChart.to_dict.
        neutral_category_name : str, optional
            Name of the neutral column, stored in info['neutral_category_name'].
            None is replaced by 'Neutral'.
        extra_category_name : str, optional
            Name of the extra column, stored in info['extra_category_name'].
            None is replaced by 'Extra'.
        background_scorer : optional
            Scorer forwarded to ScatterChart.to_dict.
        include_term_category_counts : bool, default False
            When true, self._get_term_doc_counts is called with the terms found in
            'data' and its result is stored under 'termCounts'.

        Returns
        -------
        dict
            The dictionary returned by ScatterChart.to_dict, extended with
            'docs', info['neutral_category_name'], info['extra_category_name']
            and, if requested, 'termCounts'.  The earlier documentation of this
            method describes 'docs' as {'texts': [...], 'labels': [...], 'meta': [...]}
            and each 'data' entry as holding fields such as term, x, y, s, bg,
            cat25k and ncat25k; those layouts are produced by helpers outside
            this method.
        '''
        payload = ScatterChart.to_dict(self,
                                       category,
                                       category_name=category_name,
                                       not_category_name=not_category_name,
                                       scores=scores,
                                       transform=transform,
                                       title_case_names=title_case_names,
                                       not_categories=not_categories,
                                       neutral_categories=neutral_categories,
                                       extra_categories=extra_categories,
                                       background_scorer=background_scorer)
        getter = self._make_docs_getter(max_docs_per_category, alternative_text_field)
        payload['docs'] = self._get_docs_structure(getter, metadata)

        # Fall back to the fixed display names when the caller gave none.
        info = payload['info']
        info['neutral_category_name'] = 'Neutral' if neutral_category_name is None else neutral_category_name
        info['extra_category_name'] = 'Extra' if extra_category_name is None else extra_category_name

        if include_term_category_counts:
            term_list = [entry['term'] for entry in payload['data']]
            payload['termCounts'] = self._get_term_doc_counts(np.array(term_list))
        return payload
コード例 #14
0
 def test_resuse_is_disabled(self):
     # Calling to_dict a second time on the same chart instance must raise.
     chart = ScatterChart(term_doc_matrix=get_test_corpus(), minimum_term_frequency=0)
     chart.to_dict('hamlet')
     self.assertRaises(Exception, chart.to_dict, 'hamlet')
コード例 #15
0
    def to_dict(self,
                category,
                category_name=None,
                not_category_name=None,
                scores=None,
                metadata=None,
                max_docs_per_category=None,
                transform=percentile_alphabetical,
                alternative_text_field=None,
                title_case_names=False,
                not_categories=None,
                neutral_categories=None,
                extra_categories=None,
                neutral_category_name=None,
                extra_category_name=None,
                background_scorer=None,
                include_term_category_counts=False):
        '''
        Build the chart dictionary via ScatterChart.to_dict and extend it.

        This method adds the 'docs' entry, writes the neutral and extra display
        names into 'info', and optionally adds the 'termCounts' entry.

        Parameters
        ----------
        category : str
            Category to annotate.  Exact value of category.
        category_name : str, optional
            Name of category which will appear on web site.  Forwarded to ScatterChart.to_dict.
        not_category_name : str, optional
            Name of ~category which will appear on web site.  Forwarded to ScatterChart.to_dict.
        scores : np.array, optional
            Scores to use for coloring.  Forwarded unchanged to ScatterChart.to_dict.
        metadata : None or array-like, optional
            List of metadata for each document.  Handed to self._get_docs_structure.
        max_docs_per_category : None or int, optional
            Maximum number of documents to store per category.  Handed to
            self._make_docs_getter.
        transform : function, optional
            Function for ranking terms.  Defaults to percentile_alphabetical.
        alternative_text_field : str or None, optional
            Field to display in place of parsed text.  Handed to self._make_docs_getter.
        title_case_names : bool, default False
            Should the program title-case the category and not-category names?
        not_categories : list, optional
            List of categories to use as "not category".  Forwarded to ScatterChart.to_dict.
        neutral_categories : list, optional
            List of categories to use as neutral.  Forwarded to ScatterChart.to_dict.
        extra_categories : list, optional
            List of categories to use as extra.  Forwarded to ScatterChart.to_dict.
        neutral_category_name : str, optional
            Name of the neutral column, stored in info['neutral_category_name'].
            None is replaced by 'Neutral'.
        extra_category_name : str, optional
            Name of the extra column, stored in info['extra_category_name'].
            None is replaced by 'Extra'.
        background_scorer : optional
            Scorer forwarded to ScatterChart.to_dict.
        include_term_category_counts : bool, default False
            When true, self._get_term_doc_counts is called with the terms found in
            'data' and its result is stored under 'termCounts'.

        Returns
        -------
        dict
            The dictionary returned by ScatterChart.to_dict, extended with
            'docs', info['neutral_category_name'], info['extra_category_name']
            and, if requested, 'termCounts'.  The earlier documentation of this
            method describes 'docs' as {'texts': [...], 'labels': [...], 'meta': [...]}
            and each 'data' entry as holding fields such as term, x, y, s, bg,
            cat25k, ncat25k and an optional term-specific 'etc' dictionary; those
            layouts are produced by helpers outside this method.
        '''
        base_kwargs = dict(category_name=category_name,
                           not_category_name=not_category_name,
                           scores=scores,
                           transform=transform,
                           title_case_names=title_case_names,
                           not_categories=not_categories,
                           neutral_categories=neutral_categories,
                           extra_categories=extra_categories,
                           background_scorer=background_scorer)
        result = ScatterChart.to_dict(self, category, **base_kwargs)

        getter = self._make_docs_getter(max_docs_per_category, alternative_text_field)
        result['docs'] = self._get_docs_structure(getter, metadata)

        # Callers that pass no display name get the fixed fallbacks.
        shown_neutral = neutral_category_name
        if shown_neutral is None:
            shown_neutral = 'Neutral'
        shown_extra = extra_category_name
        if shown_extra is None:
            shown_extra = 'Extra'
        result['info']['neutral_category_name'] = shown_neutral
        result['info']['extra_category_name'] = shown_extra

        if not include_term_category_counts:
            return result
        term_names = np.array([struct['term'] for struct in result['data']])
        result['termCounts'] = self._get_term_doc_counts(term_names)
        return result
コード例 #16
0
 def test_resuse_is_disabled(self):
     # The first to_dict call succeeds; repeating it on the same chart is
     # expected to raise.
     test_corpus = get_test_corpus()
     single_use_chart = ScatterChart(term_doc_matrix=test_corpus, minimum_term_frequency=0)
     single_use_chart.to_dict('hamlet')
     self.assertRaises(Exception, single_use_chart.to_dict, 'hamlet')