def test_score_transform(self):
    """Check that passing an identity ``score_transform`` alters the scores.

    Two charts are built over the same test corpus: one with the default
    settings and one with ``score_transform=lambda x: x``.  The totals of
    the per-term ``'s'`` values in their ``to_dict('hamlet')`` output are
    expected to differ.
    """
    corpus = get_test_corpus()

    def total_score(**extra_chart_kwargs):
        # A fresh chart is built for every call, mirroring the two separate
        # ScatterChart instances the comparison needs.
        chart = ScatterChart(term_doc_matrix=corpus,
                             minimum_term_frequency=0,
                             **extra_chart_kwargs)
        return sum(datum['s'] for datum in chart.to_dict('hamlet')['data'])

    default_total = total_score()
    identity_total = total_score(score_transform=lambda x: x)
    assert default_total != identity_total
def test_score_transform(self):
    """Compare summed term scores with and without an identity transform.

    The chart built with ``score_transform=lambda x: x`` must yield a
    different total of ``'s'`` values than the chart built with defaults.
    """
    corpus = get_test_corpus()
    shared_kwargs = dict(term_doc_matrix=corpus, minimum_term_frequency=0)

    default_dict = ScatterChart(**shared_kwargs).to_dict('hamlet')
    identity_dict = ScatterChart(score_transform=lambda x: x,
                                 **shared_kwargs).to_dict('hamlet')

    default_total = sum(datum['s'] for datum in default_dict['data'])
    identity_total = sum(datum['s'] for datum in identity_dict['data'])
    assert default_total != identity_total
def to_dict(self,
            category,
            category_name=None,
            not_category_name=None,
            scores=None,
            metadata=None,
            max_docs_per_category=None,
            transform=percentile_alphabetical,
            alternative_text_field=None,
            title_case_names=False):
    '''
    Build the chart dictionary via ``ScatterChart.to_dict`` and attach a
    ``'docs'`` section describing the documents.

    Parameters
    ----------
    category : str
        Category to annotate.  Exact value of category.
    category_name : str, optional
        Name of category which will appear on web site.
        Default None is same as category.
    not_category_name : str, optional
        Name of ~category which will appear on web site.
        Default None is same as "not " + category.
    scores : np.array, optional
        Scores to use for coloring.  Forwarded unchanged to
        ``ScatterChart.to_dict``; earlier documentation states that None
        falls back to the scaled F-scores of ``category``.
    metadata : None or array-like
        List of metadata for each document.  Earlier documentation states
        that it defaults to a list of blank strings.
    max_docs_per_category : None or int, optional
        Maximum number of documents to store per category.  Handed to
        ``self._make_docs_getter`` (earlier documentation cites a default
        of 4 -- TODO confirm against that helper).
    transform : function, optional
        Function for ranking terms.  The signature default is
        ``percentile_alphabetical``.
    alternative_text_field : str or None, optional
        Field of the dataframe used to make the corpus, displayed in place
        of the parsed text.  Earlier documentation states it can only be
        used if the corpus is a ParsedCorpus instance.
    title_case_names : bool, default False
        Should the program title-case the category and not-category names?

    Returns
    -------
    dictionary
        {info: {category_name: ..., not_category_name},
         docs: {'texts': [doc1text, ...],
                'labels': [1, 0, ...],
                'meta': ['<b>blah</b>', '<b>blah</b>']}
         data: {term:, x:frequency [0-1], y:frequency [0-1],
                s: score,
                bg: background score,
                as: association score,
                cat25k: freq per 25k in category,
                cat: count in category,
                ncat: count in non-category,
                catdocs: [docnum, ...],
                ncatdocs: [docnum, ...]
                ncat25k: freq per 25k in non-category}}
    '''
    chart_kwargs = dict(category_name=category_name,
                        not_category_name=not_category_name,
                        scores=scores,
                        transform=transform,
                        title_case_names=title_case_names)
    payload = ScatterChart.to_dict(self, category, **chart_kwargs)
    # The docs getter is created only after the base dictionary exists,
    # keeping the same call order as before.
    payload['docs'] = self._get_docs_structure(
        self._make_docs_getter(max_docs_per_category, alternative_text_field),
        metadata)
    return payload
def test_inject_metadata_term_lists(self):
    """Exercise validation and output of ``inject_metadata_term_lists``.

    * A chart over a term-doc matrix built without metadata must raise
      ``TermDocMatrixHasNoMetadataException``.
    * Malformed arguments must raise ``TypeError``.
    * A well-formed mapping returns a ``ScatterChart`` and adds a
      ``'metalists'`` key to the ``to_dict`` output.
    """
    chart_without_meta = ScatterChart(term_doc_matrix=build_hamlet_jz_term_doc_mat(),
                                      minimum_term_frequency=0)
    with self.assertRaises(TermDocMatrixHasNoMetadataException):
        chart_without_meta.inject_metadata_term_lists({'blah': ['a', 'adsf', 'asfd']})

    chart = ScatterChart(term_doc_matrix=build_hamlet_jz_corpus_with_meta(),
                         minimum_term_frequency=0,
                         use_non_text_features=True)
    malformed_inputs = (
        {'blash': [3, 1]},
        {3: ['a', 'b']},
        {'a': {'a', 'b'}},
        3,
    )
    for malformed in malformed_inputs:
        with self.assertRaises(TypeError):
            chart.inject_metadata_term_lists(malformed)

    returned = chart.inject_metadata_term_lists({'a': ['a', 'b']})
    self.assertEqual(type(returned), ScatterChart)

    chart_dict = chart.to_dict('hamlet')
    self.assertEqual(set(chart_dict.keys()), {'info', 'data', 'metalists'})
    self.assertEqual(set(chart_dict['info'].keys()),
                     {'not_category_name',
                      'category_name',
                      'category_terms',
                      'not_category_terms',
                      'category_internal_name',
                      'not_category_internal_names',
                      'extra_category_internal_names',
                      'neutral_category_internal_names',
                      'categories'})
def test_inject_metadata_term_lists(self):
    """Verify ``inject_metadata_term_lists`` rejects bad input and records lists.

    The call must fail with ``TermDocMatrixHasNoMetadataException`` when the
    matrix was built without metadata, fail with ``TypeError`` for each
    malformed argument, and otherwise return a ``ScatterChart`` whose
    ``to_dict`` output carries a ``'metalists'`` entry.
    """
    expected_top_level_keys = {'info', 'data', 'metalists'}
    expected_info_keys = {
        'not_category_name', 'category_name',
        'category_terms', 'not_category_terms',
        'category_internal_name', 'not_category_internal_names',
        'extra_category_internal_names', 'neutral_category_internal_names',
        'categories',
    }

    tdm = build_hamlet_jz_term_doc_mat()
    no_meta_chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
    with self.assertRaises(TermDocMatrixHasNoMetadataException):
        no_meta_chart.inject_metadata_term_lists({'blah': ['a', 'adsf', 'asfd']})

    meta_chart = ScatterChart(term_doc_matrix=build_hamlet_jz_corpus_with_meta(),
                              minimum_term_frequency=0,
                              use_non_text_features=True)
    for bad_argument in [{'blash': [3, 1]}, {3: ['a', 'b']}, {'a': {'a', 'b'}}, 3]:
        with self.assertRaises(TypeError):
            meta_chart.inject_metadata_term_lists(bad_argument)

    injected = meta_chart.inject_metadata_term_lists({'a': ['a', 'b']})
    self.assertEqual(type(injected), ScatterChart)

    result = meta_chart.to_dict('hamlet')
    self.assertEqual(set(result.keys()), expected_top_level_keys)
    self.assertEqual(set(result['info'].keys()), expected_info_keys)
def test_inject_term_colors(self):
    """Check that injected term colors show up under ``info`` in ``to_dict``.

    A single color mapping is injected, after which the dictionary produced
    for the ``'hamlet'`` category must contain ``'term_colors'`` in its
    ``'info'`` section.
    """
    tdm = build_hamlet_jz_corpus_with_meta()
    # The term-frequency dataframe used to be computed here but was never
    # read, so the unused local has been dropped.
    scatter_chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
    scatter_chart.inject_term_colors({'t1': '00ffee'})
    j = scatter_chart.to_dict('hamlet')
    self.assertIn('term_colors', j['info'])
def test_inject_term_colors(self):
    """Verify ``inject_term_colors`` adds ``'term_colors'`` to the chart info.

    After injecting one term-to-color mapping, ``to_dict('hamlet')`` must
    expose a ``'term_colors'`` key inside its ``'info'`` dictionary.
    """
    tdm = build_hamlet_jz_corpus_with_meta()
    # No term-frequency dataframe is needed for this check; the previously
    # unused ``freq_df`` local was removed.
    scatter_chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
    scatter_chart.inject_term_colors({'t1': '00ffee'})
    chart_dict = scatter_chart.to_dict('hamlet')
    self.assertIn('term_colors', chart_dict['info'])
def test_inject_metadata_descriptions(self):
    """Exercise ``inject_metadata_descriptions`` validation and output.

    * A non-mapping argument (``3323``) must raise ``AssertionError``.
    * A valid mapping returns the chart itself (compared with ``==``).
    * ``to_dict('hamlet')`` then exposes a ``'metadescriptions'`` key.
    """
    tdm = build_hamlet_jz_corpus_with_meta()
    scatter_chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
    with self.assertRaises(AssertionError):
        scatter_chart.inject_metadata_descriptions(3323)

    # NOTE: two ``assertRaisesRegex`` checks used to live here, disabled by
    # wrapping them in a string literal under a Python-3 version guard.  They
    # expected 'The following meta data terms are not present: ...' for the
    # unknown terms 'blah' and 'cat2'.  The guard executed nothing, so it has
    # been replaced by this comment; re-enable the checks if that validation
    # is meant to be enforced.

    # assertEqual (rather than a bare assert) is not stripped under -O and
    # reports both operands on failure.
    self.assertEqual(
        scatter_chart,
        scatter_chart.inject_metadata_descriptions({'cat1': 'asjdkflasdjklfsadjk jsdkafsd'}))
    j = scatter_chart.to_dict('hamlet')
    self.assertEqual(set(j.keys()), {'info', 'data', 'metadescriptions'})
def test_inject_metadata_descriptions(self):
    """Verify ``inject_metadata_descriptions`` input checks and its effect.

    Passing a non-mapping (``3323``) must raise ``AssertionError``; passing a
    valid mapping must return an object equal to the chart, and the chart's
    ``to_dict('hamlet')`` output must then contain ``'metadescriptions'``.
    """
    tdm = build_hamlet_jz_corpus_with_meta()
    scatter_chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
    with self.assertRaises(AssertionError):
        scatter_chart.inject_metadata_descriptions(3323)

    # NOTE: a version-guarded string literal previously sat here holding two
    # disabled ``assertRaisesRegex`` checks (expected message: 'The following
    # meta data terms are not present: ...' for 'blah' and for 'cat2').  As a
    # bare string it never ran, so the no-op guard was removed.

    returned_chart = scatter_chart.inject_metadata_descriptions(
        {'cat1': 'asjdkflasdjklfsadjk jsdkafsd'})
    # unittest assertion instead of a bare assert: survives -O and gives a
    # useful failure message.
    self.assertEqual(scatter_chart, returned_chart)

    chart_dict = scatter_chart.to_dict('hamlet')
    self.assertEqual(set(chart_dict.keys()), {'info', 'data', 'metadescriptions'})
def test_inject_coordinates_original(self):
    """Check that original coordinates are emitted as ``ox`` / ``oy``.

    Normalised term frequencies are injected as the plotted coordinates and
    the raw frequencies as ``original_x`` / ``original_y``.  The resulting
    term records must carry the expected key set, and the record for the
    term ``'and'`` must report ``ox == 0`` and ``oy == 1``.
    """
    tdm = build_hamlet_jz_term_doc_mat()
    freq_df = tdm.get_term_freq_df()
    scatter_chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
    # ``np.float`` was only an alias of the builtin ``float``; it was
    # deprecated in NumPy 1.20 and removed in 1.24, so use the builtin.
    x = freq_df[freq_df.columns[1]].astype(float)
    y = freq_df[freq_df.columns[0]].astype(float)
    scatter_chart.inject_coordinates(x / x.max(), y / y.max(), original_x=x, original_y=y)
    j = scatter_chart.to_dict('hamlet')
    # set(...) keeps the comparison valid where keys() returns a list.
    self.assertEqual(set(j['data'][0].keys()),
                     {'x', 'os', 'y', 'ncat25k', 'neut', 'cat25k', 'ox', 'neut25k',
                      'extra25k', 'extra', 'oy', 'term', 's', 'bg'})
    and_term = [t for t in j['data'] if t['term'] == 'and'][0]
    self.assertEqual(and_term['ox'], 0)
    self.assertEqual(and_term['oy'], 1)
def test_inject_coordinates_original(self):
    """Verify ``inject_coordinates`` keeps the original x/y values.

    The second and first columns of the term-frequency dataframe are used
    as x and y.  Their max-normalised versions are injected as coordinates,
    with the raw values passed as ``original_x`` / ``original_y``.  Each
    term record must expose the expected keys, and the ``'and'`` record must
    have ``ox == 0`` and ``oy == 1``.
    """
    tdm = build_hamlet_jz_term_doc_mat()
    freq_df = tdm.get_term_freq_df()
    scatter_chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)

    # Builtin ``float`` replaces the ``np.float`` alias, which NumPy
    # deprecated in 1.20 and removed in 1.24 (AttributeError there).
    original_x = freq_df[freq_df.columns[1]].astype(float)
    original_y = freq_df[freq_df.columns[0]].astype(float)
    scatter_chart.inject_coordinates(original_x / original_x.max(),
                                     original_y / original_y.max(),
                                     original_x=original_x,
                                     original_y=original_y)

    chart_dict = scatter_chart.to_dict('hamlet')
    expected_keys = {'x', 'os', 'y', 'ncat25k', 'neut', 'cat25k', 'ox', 'neut25k',
                     'extra25k', 'extra', 'oy', 'term', 's', 'bg'}
    # Wrapping in set(...) makes the check independent of the keys() type.
    self.assertEqual(set(chart_dict['data'][0].keys()), expected_keys)

    and_term = [t for t in chart_dict['data'] if t['term'] == 'and'][0]
    self.assertEqual(and_term['ox'], 0)
    self.assertEqual(and_term['oy'], 1)
def to_dict(self,
            category,
            category_name=None,
            not_category_name=None,
            scores=None,
            metadata=None,
            max_docs_per_category=None,
            transform=percentile_alphabetical):
    '''
    Build the chart dictionary via ``ScatterChart.to_dict`` and add a
    ``'docs'`` entry produced by ``self._get_docs_structure``.

    :param category: Category to annotate
    :param category_name: Name of category which will appear on web site.
    :param not_category_name: Name of non-category axis which will appear on web site.
    :param scores: Scores to use.  Forwarded to ``ScatterChart.to_dict``;
        earlier documentation states the default is Scaled F-Score.
    :param metadata: None or array-like.  List of metadata for each document.
    :param max_docs_per_category: None or int.  Maximum number of documents
        to store per category; passed to ``self._make_docs_getter``.
    :param transform: Function for ranking terms.  The signature default is
        ``percentile_alphabetical``.
    :return: dictionary
        {info: {category_name: ..., not_category_name},
         docs: {'texts': [doc1text, ...],
                'labels': [1, 0, ...],
                'meta': ['<b>blah</b>', '<b>blah</b>']}
         data: {term:, x:frequency [0-1], y:frequency [0-1],
                s: score,
                bg: background score,
                as: association score,
                cat25k: freq per 25k in category,
                cat: count in category,
                ncat: count in non-category,
                catdocs: [docnum, ...],
                ncatdocs: [docnum, ...]
                ncat25k: freq per 25k in non-category}}
    '''
    chart_dict = ScatterChart.to_dict(self,
                                      category,
                                      category_name=category_name,
                                      not_category_name=not_category_name,
                                      scores=scores,
                                      transform=transform)
    # Getter creation stays after the base call, then feeds the docs builder.
    chart_dict['docs'] = self._get_docs_structure(
        self._make_docs_getter(max_docs_per_category), metadata)
    return chart_dict
def to_dict(self,
            category,
            category_name=None,
            not_category_name=None,
            scores=None,
            metadata=None,
            max_docs_per_category=None,
            transform=percentile_alphabetical,
            alternative_text_field=None,
            title_case_names=False,
            not_categories=None,
            neutral_categories=None,
            extra_categories=None,
            neutral_category_name=None,
            extra_category_name=None,
            background_scorer=None,
            include_term_category_counts=False):
    '''
    Build the chart dictionary via ``ScatterChart.to_dict`` and extend it
    with document data, neutral/extra column names and, optionally,
    per-term counts.

    Parameters
    ----------
    category : str
        Category to annotate.  Exact value of category.
    category_name : str, optional
        Name of category which will appear on web site.
        Default None is same as category.
    not_category_name : str, optional
        Name of ~category which will appear on web site.
        Default None is same as "not " + category.
    scores : np.array, optional
        Scores to use for coloring.  Forwarded to ``ScatterChart.to_dict``;
        earlier documentation states that None falls back to
        RankDifference scores.
    metadata : None or array-like
        List of metadata for each document.  Earlier documentation states
        that it defaults to a list of blank strings.
    max_docs_per_category : None or int, optional
        Maximum number of documents to store per category.  Passed to
        ``self._make_docs_getter`` (earlier documentation cites a default
        of 4 -- TODO confirm against that helper).
    transform : function, optional
        Function for ranking terms.  The signature default is
        ``percentile_alphabetical``.
    alternative_text_field : str or None, optional
        Field of the dataframe used to make the corpus, displayed in place
        of the parsed text.  Earlier documentation states it can only be
        used if the corpus is a ParsedCorpus instance.
    title_case_names : bool, default False
        Should the program title-case the category and not-category names?
    not_categories : list, optional
        List of categories to use as "not category".  Earlier documentation
        states it defaults to all others.
    neutral_categories : list, optional
        List of categories to use as neutral.
    extra_categories : list, optional
        List of categories to use as extra.
    neutral_category_name : str, optional
        Name of the neutral column; None is replaced by "Neutral".
        Earlier documentation notes it is only active if show_neutral is True.
    extra_category_name : str, optional
        Name of the extra column; None is replaced by "Extra".
        Earlier documentation notes it is only active if show_neutral and
        show_extra are true.
    background_scorer : CharacteristicScorer, optional
        Used for bg scores.
    include_term_category_counts : bool, default False
        When true, the result of ``self._get_term_doc_counts`` for the
        emitted terms is stored under ``'termCounts'``.  Earlier
        documentation adds that metadata counts are used instead when
        use_non_text_features is set.

    Returns
    -------
    dictionary
        {info: {'category_name': full category name, ...},
         docs: {'texts': [doc1text, ...],
                'labels': [1, 0, ...],
                'meta': ['<b>blah</b>', '<b>blah</b>']},
         // if include_term_category_counts
         termCounts: [term num -> [total occurrences, total documents, variance],
                      ... for the number of categories]
         data: {term:term,
                x:frequency [0-1],
                y:frequency [0-1],
                s: score,
                bg: background score,
                as: association score,
                cat25k: freq per 25k in category,
                cat: count in category,
                ncat: count in non-category,
                catdocs: [docnum, ...],
                ncatdocs: [docnum, ...]
                ncat25k: freq per 25k in non-category}}
    '''
    base_kwargs = dict(category_name=category_name,
                       not_category_name=not_category_name,
                       scores=scores,
                       transform=transform,
                       title_case_names=title_case_names,
                       not_categories=not_categories,
                       neutral_categories=neutral_categories,
                       extra_categories=extra_categories,
                       background_scorer=background_scorer)
    payload = ScatterChart.to_dict(self, category, **base_kwargs)

    # The docs getter is created after the base dictionary, as before.
    payload['docs'] = self._get_docs_structure(
        self._make_docs_getter(max_docs_per_category, alternative_text_field),
        metadata)

    info = payload['info']
    info['neutral_category_name'] = (
        'Neutral' if neutral_category_name is None else neutral_category_name)
    info['extra_category_name'] = (
        'Extra' if extra_category_name is None else extra_category_name)

    if not include_term_category_counts:
        return payload

    emitted_terms = np.array([term_struct['term'] for term_struct in payload['data']])
    payload['termCounts'] = self._get_term_doc_counts(emitted_terms)
    return payload
def test_resuse_is_disabled(self):
    """A second ``to_dict`` call on the same chart is expected to raise."""
    chart = ScatterChart(term_doc_matrix=get_test_corpus(),
                         minimum_term_frequency=0)
    chart.to_dict('hamlet')
    # Callable form of assertRaises: invokes chart.to_dict('hamlet') again.
    self.assertRaises(Exception, chart.to_dict, 'hamlet')
def to_dict(self,
            category,
            category_name=None,
            not_category_name=None,
            scores=None,
            metadata=None,
            max_docs_per_category=None,
            transform=percentile_alphabetical,
            alternative_text_field=None,
            title_case_names=False,
            not_categories=None,
            neutral_categories=None,
            extra_categories=None,
            neutral_category_name=None,
            extra_category_name=None,
            background_scorer=None,
            include_term_category_counts=False):
    '''
    Produce the chart dictionary from ``ScatterChart.to_dict`` and add the
    document structure, the neutral/extra column names and, on request,
    per-term counts.

    Parameters
    ----------
    category : str
        Category to annotate.  Exact value of category.
    category_name : str, optional
        Name of category which will appear on web site.
        Default None is same as category.
    not_category_name : str, optional
        Name of ~category which will appear on web site.
        Default None is same as "not " + category.
    scores : np.array, optional
        Scores to use for coloring.  Forwarded to ``ScatterChart.to_dict``;
        earlier documentation states that None falls back to
        RankDifference scores.
    metadata : None or array-like
        List of metadata for each document.  Earlier documentation states
        that it defaults to a list of blank strings.
    max_docs_per_category : None or int, optional
        Maximum number of documents to store per category.  Passed to
        ``self._make_docs_getter`` (earlier documentation cites a default
        of 4 -- TODO confirm against that helper).
    transform : function, optional
        Function for ranking terms.  The signature default is
        ``percentile_alphabetical``.
    alternative_text_field : str or None, optional
        Field of the dataframe used to make the corpus, displayed in place
        of the parsed text.  Earlier documentation states it can only be
        used if the corpus is a ParsedCorpus instance.
    title_case_names : bool, default False
        Should the program title-case the category and not-category names?
    not_categories : list, optional
        List of categories to use as "not category".  Earlier documentation
        states it defaults to all others.
    neutral_categories : list, optional
        List of categories to use as neutral.
    extra_categories : list, optional
        List of categories to use as extra.
    neutral_category_name : str, optional
        Name of the neutral column; None is replaced by "Neutral".
        Earlier documentation notes it is only active if show_neutral is True.
    extra_category_name : str, optional
        Name of the extra column; None is replaced by "Extra".
        Earlier documentation notes it is only active if show_neutral and
        show_extra are true.
    background_scorer : CharacteristicScorer, optional
        Used for bg scores.
    include_term_category_counts : bool, default False
        When true, the result of ``self._get_term_doc_counts`` for the
        emitted terms is stored under ``'termCounts'``.  Earlier
        documentation adds that metadata counts are used instead when
        use_non_text_features is set.

    Returns
    -------
    dictionary
        {info: {'category_name': full category name, ...},
         docs: {'texts': [doc1text, ...],
                'labels': [1, 0, ...],
                'meta': ['<b>blah</b>', '<b>blah</b>']},
         // if include_term_category_counts
         termCounts: [term num -> [total occurrences, total documents, variance],
                      ... for the number of categories]
         data: {term:term,
                x:frequency [0-1],
                y:frequency [0-1],
                s: score,
                bg: background score,
                as: association score,
                cat25k: freq per 25k in category,
                cat: count in category,
                ncat: count in non-category,
                catdocs: [docnum, ...],
                ncatdocs: [docnum, ...]
                ncat25k: freq per 25k in non-category}
         etc: term specific dictionary (if inject_term_metadata is called
              and contains terms)}
    '''
    result = ScatterChart.to_dict(
        self,
        category,
        category_name=category_name,
        not_category_name=not_category_name,
        scores=scores,
        transform=transform,
        title_case_names=title_case_names,
        not_categories=not_categories,
        neutral_categories=neutral_categories,
        extra_categories=extra_categories,
        background_scorer=background_scorer,
    )
    getter = self._make_docs_getter(max_docs_per_category, alternative_text_field)

    # Fall back to the stock column labels when the caller gave none.
    neutral_label = neutral_category_name
    if neutral_label is None:
        neutral_label = 'Neutral'
    extra_label = extra_category_name
    if extra_label is None:
        extra_label = 'Extra'

    result['docs'] = self._get_docs_structure(getter, metadata)
    result['info'].update
    result['info']['neutral_category_name'] = neutral_label
    result['info']['extra_category_name'] = extra_label

    if include_term_category_counts:
        term_list = [entry['term'] for entry in result['data']]
        result['termCounts'] = self._get_term_doc_counts(np.array(term_list))
    return result
def test_resuse_is_disabled(self):
    """Calling ``to_dict`` twice on one ``ScatterChart`` must raise.

    The first call is expected to succeed; the repeated call on the same
    instance is the one that has to fail.
    """
    test_corpus = get_test_corpus()
    chart = ScatterChart(term_doc_matrix=test_corpus, minimum_term_frequency=0)
    first_result = chart.to_dict('hamlet')
    del first_result  # only the side effect of the first call matters here
    with self.assertRaises(Exception):
        chart.to_dict('hamlet')