def test_max_terms(self):
    """max_terms should cap the number of plotted terms; None means no cap."""
    tdm = build_hamlet_jz_term_doc_mat()
    for limit, expected_count in [(2, 2), (10, 10)]:
        payload = ScatterChart(term_doc_matrix=tdm,
                               minimum_term_frequency=0,
                               max_terms=limit).to_dict('hamlet')
        self.assertEqual(expected_count, len(payload['data']))
    # With the PMI filter disabled, a huge (or absent) cap keeps every term.
    for limit in (10000, None):
        payload = ScatterChart(term_doc_matrix=tdm,
                               minimum_term_frequency=0,
                               pmi_threshold_coefficient=0,
                               max_terms=limit).to_dict('hamlet')
        self.assertEqual(len(tdm.get_term_freq_df()), len(payload['data']))
def test_score_transform(self):
    """Passing an identity score_transform should change the emitted scores."""
    corpus = get_test_corpus()
    default_payload = ScatterChart(term_doc_matrix=corpus,
                                   minimum_term_frequency=0).to_dict('hamlet')
    identity_payload = ScatterChart(term_doc_matrix=corpus,
                                    minimum_term_frequency=0,
                                    score_transform=lambda x: x).to_dict('hamlet')
    default_total = sum(datum['s'] for datum in default_payload['data'])
    identity_total = sum(datum['s'] for datum in identity_payload['data'])
    assert default_total != identity_total
def __init__(self, corpus, **kwargs):
    '''See ScatterChart.

    This variant lets you click on terms to see what contexts they tend
    to appear in.

    Parameters
    ----------
    corpus : Corpus or TermCategoryFrequencies
        The corpus to visualize; must be one of these two types.
    kwargs : dict
        Remaining arguments, forwarded to ScatterChart.__init__.
    '''
    assert (isinstance(corpus, Corpus)) or (isinstance(corpus, TermCategoryFrequencies))
    ScatterChart.__init__(self, corpus, **kwargs)
def test_inject_term_colors(self):
    """inject_term_colors should surface a term_colors entry in the info block."""
    corpus = build_hamlet_jz_corpus_with_meta()
    freq_df = corpus.get_term_freq_df()  # exercised for parity with sibling tests
    chart = ScatterChart(term_doc_matrix=corpus, minimum_term_frequency=0)
    chart.inject_term_colors({'t1': '00ffee'})
    payload = chart.to_dict('hamlet')
    self.assertIn('term_colors', payload['info'])
def __init__(self, corpus, **kwargs):
    '''See ScatterChart.

    This variant lets you click on terms to see what contexts they tend
    to appear in.

    Parameters
    ----------
    corpus : Corpus
        The corpus to visualize; must be a Corpus instance.
    kwargs : dict
        Remaining arguments, forwarded to ScatterChart.__init__.
    '''
    assert isinstance(corpus, Corpus)
    ScatterChart.__init__(self, corpus, **kwargs)
def test_inject_term_colors(self):
    """A color mapping injected before rendering must appear under info['term_colors']."""
    corpus_with_meta = build_hamlet_jz_corpus_with_meta()
    term_frequencies = corpus_with_meta.get_term_freq_df()  # kept: parity with original
    chart = ScatterChart(term_doc_matrix=corpus_with_meta,
                         minimum_term_frequency=0)
    chart.inject_term_colors({'t1': '00ffee'})
    rendered = chart.to_dict('hamlet')
    self.assertIn('term_colors', rendered['info'])
def _add_term_freq_to_json_df(self, json_df, term_freq_df, category):
    '''Augment json_df with per-category counts and optional term metadata.

    Parameters
    ----------
    json_df : pd.DataFrame
        Frame being assembled for JSON output; mutated in place.
    term_freq_df : pd.DataFrame
        Term-frequency frame with '<category> freq', 'not cat freq',
        and 'term' columns.
    category : str
        Name of the focal category.
    '''
    ScatterChart._add_term_freq_to_json_df(self, json_df, term_freq_df, category)
    # np.int was a deprecated alias for the builtin int and was removed in
    # NumPy 1.24; use the builtin directly.
    json_df['cat'] = term_freq_df[category + ' freq'].astype(int)
    json_df['ncat'] = term_freq_df['not cat freq'].astype(int)
    if self._term_metadata is not None:
        # Attach per-term metadata dicts; terms without metadata get {}.
        json_df['etc'] = term_freq_df['term'].apply(
            lambda term: self._term_metadata.get(term, {}))
def test_multi_categories(self):
    """Restricting not_categories should change the internal not-category names."""
    corpus = get_test_corpus()
    vs_all = ScatterChart(term_doc_matrix=corpus,
                          minimum_term_frequency=0).to_dict('hamlet')
    vs_swift = ScatterChart(term_doc_matrix=corpus,
                            minimum_term_frequency=0).to_dict(
        'hamlet', not_categories=['swift'])
    self.assertNotEqual(set(vs_all['info']['not_category_internal_names']),
                        set(vs_swift['info']['not_category_internal_names']))
    # Category lists, however, always cover the full corpus.
    self.assertEqual(vs_all['info']['categories'], corpus.get_categories())
    self.assertEqual(vs_swift['info']['categories'], corpus.get_categories())
def __init__(self, corpus, verbose=False, **kwargs):
    '''See ScatterChart.

    This variant lets you click on terms to see what contexts they tend
    to appear in; `to_dict` additionally emits a document structure.

    Parameters
    ----------
    corpus : Corpus-like
        Corpus to visualize. (Type checking was intentionally relaxed;
        subclasses of Corpus or TermCategoryFrequencies are expected.)
    verbose : bool, default False
        Forwarded to ScatterChart.
    kwargs : dict
        Remaining arguments, forwarded to ScatterChart.__init__.
    '''
    # Removed a long commented-out isinstance guard: dead code, and the
    # relaxed typing it replaced is the intended behavior.
    ScatterChart.__init__(self, corpus, verbose, **kwargs)
    self._term_metadata = None
def __init__(self, corpus, **kwargs):
    '''See ScatterChart.

    This variant lets you click on terms to see what contexts they tend
    to appear in; `to_dict` additionally emits a document structure.

    Parameters
    ----------
    corpus : Corpus-like
        Corpus to visualize. (Type checking was intentionally relaxed;
        subclasses of Corpus or TermCategoryFrequencies are expected.)
    kwargs : dict
        Remaining arguments, forwarded to ScatterChart.__init__.
    '''
    # Removed a long commented-out isinstance guard: dead code, and the
    # relaxed typing it replaced is the intended behavior.
    ScatterChart.__init__(self, corpus, **kwargs)
    self._term_metadata = None
def test_title_case_names(self):
    """title_case_names=True should title-case both axis labels."""
    tdm = build_hamlet_jz_term_doc_mat()
    payload = ScatterChart(term_doc_matrix=tdm,
                           minimum_term_frequency=0).to_dict(
        'hamlet', 'HAMLET', 'NOT HAMLET')
    self.assertEqual(payload['info']['category_name'], 'HAMLET')
    self.assertEqual(payload['info']['not_category_name'], 'NOT HAMLET')
    tdm = build_hamlet_jz_term_doc_mat()
    payload = ScatterChart(term_doc_matrix=tdm,
                           minimum_term_frequency=0).to_dict(
        'hamlet', 'HAMLET', 'NOT HAMLET', title_case_names=True)
    self.assertEqual(payload['info']['category_name'], 'Hamlet')
    self.assertEqual(payload['info']['not_category_name'], 'Not Hamlet')
def test_inject_metadata_descriptions(self):
    """inject_metadata_descriptions validates input and adds 'metadescriptions'."""
    corpus = build_hamlet_jz_corpus_with_meta()
    chart = ScatterChart(term_doc_matrix=corpus, minimum_term_frequency=0)
    with self.assertRaises(AssertionError):
        chart.inject_metadata_descriptions(3323)
    # Removed dead code: a bare triple-quoted string of "commented-out"
    # assertRaisesRegex checks sitting alone inside an
    # `if sys.version_info > (3, 0):` block — a no-op expression statement.
    assert chart == chart.inject_metadata_descriptions(
        {'cat1': 'asjdkflasdjklfsadjk jsdkafsd'})
    payload = chart.to_dict('hamlet')
    self.assertEqual(set(payload.keys()),
                     set(['info', 'data', 'metadescriptions']))
def test_inject_coordinates_original(self):
    """Injected original coordinates should be echoed back as ox/oy."""
    tdm = build_hamlet_jz_term_doc_mat()
    freq_df = tdm.get_term_freq_df()
    chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
    # np.float was a deprecated alias for the builtin float and was removed
    # in NumPy 1.24; use the builtin directly.
    x = freq_df[freq_df.columns[1]].astype(float)
    y = freq_df[freq_df.columns[0]].astype(float)
    chart.inject_coordinates(x / x.max(), y / y.max(), original_x=x, original_y=y)
    j = chart.to_dict('hamlet')
    self.assertEqual(j['data'][0].keys(),
                     {'x', 'os', 'y', 'ncat25k', 'neut', 'cat25k', 'ox',
                      'neut25k', 'extra25k', 'extra', 'oy', 'term', 's', 'bg'})
    and_term = [t for t in j['data'] if t['term'] == 'and'][0]
    self.assertEqual(and_term['ox'], 0)
    self.assertEqual(and_term['oy'], 1)
def to_dict(self,
            category,
            category_name=None,
            not_category_name=None,
            scores=None,
            metadata=None,
            max_docs_per_category=None,
            transform=percentile_alphabetical,
            alternative_text_field=None,
            title_case_names=False):
    '''Build the chart JSON structure, including a docs section.

    Parameters
    ----------
    category : str
        Category to annotate.  Exact value of category.
    category_name : str, optional
        Name of category which will appear on web site. Default None is same as category.
    not_category_name : str, optional
        Name of ~category which will appear on web site. Default None is same as "not " + category.
    scores : np.array, optional
        Scores to use for coloring.  Defaults to None, or
        np.array(self.term_doc_matrix.get_scaled_f_scores(category))
    metadata : None or array-like, optional
        List of metadata for each document.  Defaults to a list of blank strings.
    max_docs_per_category : None or int, optional
        Maximum number of documents to store per category.  Defaults to 4.
    transform : function, optional
        Function for ranking terms.  Defaults to scattertext.Scalers.percentile_lexicographic.
    alternative_text_field : str or None, optional
        Field in dataframe used to make corpus to display in place of
        parsed text.  Only can be used if corpus is a ParsedCorpus instance.
    title_case_names : bool, default False
        Should the program title-case the category and not-category names?

    Returns
    -------
    dictionary
        {info: {category_name: ..., not_category_name},
         docs: {'texts': [doc1text, ...],
                'labels': [1, 0, ...],
                'meta': ['<b>blah</b>', '<b>blah</b>']},
         data: {term:, x:frequency [0-1], y:frequency [0-1],
                s: score, bg: background score, as: association score,
                cat25k: freq per 25k in category, cat: count in category,
                ncat: count in non-category,
                catdocs: [docnum, ...], ncatdocs: [docnum, ...],
                ncat25k: freq per 25k in non-category}}
    '''
    # Delegate core chart construction to the base class.
    j = ScatterChart.to_dict(self,
                             category,
                             category_name=category_name,
                             not_category_name=not_category_name,
                             scores=scores,
                             transform=transform,
                             title_case_names=title_case_names)
    # Then attach the per-document structure this subclass adds.
    docs_getter = self._make_docs_getter(max_docs_per_category, alternative_text_field)
    j['docs'] = self._get_docs_structure(docs_getter, metadata)
    return j
def test_to_json(self):
    """to_dict should emit info/data keys with the documented structure."""
    tdm = build_hamlet_jz_term_doc_mat()
    payload = ScatterChart(term_doc_matrix=tdm,
                           minimum_term_frequency=0).to_dict('hamlet')
    self.assertEqual(set(payload.keys()), set(['info', 'data']))
    self.assertEqual(set(payload['info'].keys()),
                     set(['not_category_name', 'category_name',
                          'category_terms', 'not_category_terms',
                          'category_internal_name',
                          'not_category_internal_names', 'categories']))
    expected = {"x": 0.0, "y": 0.42, 'ox': 0, 'oy': 0.42, "term": "art",
                "cat25k": 758, "ncat25k": 0, "neut25k": 0, 'neut': 0,
                's': 0.5, 'os': 3, 'bg': 3}
    datum = self._get_data_example(payload)
    for key in ('cat25k', 'ncat25k'):
        np.testing.assert_almost_equal(expected[key], datum[key], decimal=1)
    self.assertEqual(set(expected.keys()), set(datum.keys()))
    self.assertEqual(expected['term'], datum['term'])
def produce_scattertext_html(term_doc_matrix,
                             category,
                             category_name,
                             not_category_name,
                             protocol='https',
                             minimum_term_frequency=DEFAULT_MINIMUM_TERM_FREQUENCY,
                             pmi_threshold_coefficient=DEFAULT_PMI_THRESHOLD_COEFFICIENT,
                             max_terms=None,
                             filter_unigrams=False,
                             height_in_pixels=None,
                             width_in_pixels=None,
                             term_ranker=termranking.AbsoluteFrequencyRanker):
    '''Returns html code of visualization.

    Parameters
    ----------
    term_doc_matrix : TermDocMatrix
        Corpus to use
    category : str
        name of category column
    category_name : str
        name of category to mine for
    not_category_name : str
        name of everything that isn't in category
    protocol : str, optional
        protocol used to load assets, 'http' or 'https'
    minimum_term_frequency : int, optional
        Minimum number of times word needs to appear to make it into
        visualization.
    pmi_threshold_coefficient : int, optional
        Filter out bigrams with a PMI of < 2 * pmi_threshold_coefficient.
        Default is 6.
    max_terms : int, optional
        Maximum number of terms to include in visualization.
    filter_unigrams : bool, default False
        Do we filter unigrams that only occur in one bigram?
    width_in_pixels : int
        width of viz in pixels, if None, default to JS's choice
    height_in_pixels : int
        height of viz in pixels, if None, default to JS's choice
    term_ranker : TermRanker
        TermRanker class for determining term frequency ranks.

    Returns
    -------
    str, html of visualization
    '''
    # Build the chart JSON, ranking terms alphabetically within percentiles.
    scatter_chart_data = ScatterChart(term_doc_matrix=term_doc_matrix,
                                      minimum_term_frequency=minimum_term_frequency,
                                      pmi_threshold_coefficient=pmi_threshold_coefficient,
                                      filter_unigrams=filter_unigrams,
                                      max_terms=max_terms,
                                      term_ranker=term_ranker) \
        .to_dict(category=category,
                 category_name=category_name,
                 not_category_name=not_category_name,
                 transform=percentile_alphabetical)
    # Wrap the data in the HTML/JS scaffolding.
    html = HTMLVisualizationAssembly(VizDataAdapter(scatter_chart_data),
                                     width_in_pixels,
                                     height_in_pixels).to_html(protocol=protocol)
    return html
def test_inject_coordinates_original(self):
    """Original coordinates passed to inject_coordinates surface as ox/oy."""
    tdm = build_hamlet_jz_term_doc_mat()
    freq_df = tdm.get_term_freq_df()
    chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
    # np.float was removed in NumPy 1.24 (deprecated alias of builtin float).
    raw_x = freq_df[freq_df.columns[1]].astype(float)
    raw_y = freq_df[freq_df.columns[0]].astype(float)
    chart.inject_coordinates(raw_x / raw_x.max(), raw_y / raw_y.max(),
                             original_x=raw_x, original_y=raw_y)
    payload = chart.to_dict('hamlet')
    self.assertEqual(payload['data'][0].keys(),
                     {'x', 'os', 'y', 'ncat25k', 'neut', 'cat25k', 'ox',
                      'neut25k', 'extra25k', 'extra', 'oy', 'term', 's', 'bg'})
    and_term = [t for t in payload['data'] if t['term'] == 'and'][0]
    self.assertEqual(and_term['ox'], 0)
    self.assertEqual(and_term['oy'], 1)
def test_p_vals(self):
    """Providing a term_significance scorer should add a 'p' field per datum."""
    tdm = build_hamlet_jz_term_doc_mat()
    payload = ScatterChart(
        term_doc_matrix=tdm,
        minimum_term_frequency=0,
        term_significance=LogOddsRatioUninformativeDirichletPrior()
    ).to_dict('hamlet')
    sample = self._get_data_example(payload)
    self.assertIn('p', sample.keys())
def test_terms_to_include(self):
    """terms_to_include restricts the chart to exactly the listed terms."""
    tdm = build_hamlet_jz_term_doc_mat()
    wanted = sorted(['both worlds', 'thou', 'the', 'of', 'st',
                     'returned', 'best'])
    payload = ScatterChart(term_doc_matrix=tdm,
                           minimum_term_frequency=0,
                           terms_to_include=wanted).to_dict(
        'hamlet', 'HAMLET', 'NOT HAMLET')
    self.assertEqual(sorted(t['term'] for t in payload['data']), wanted)
def test_to_dict_without_categories(self):
    """to_dict_without_categories needs injected coordinates, then emits data only."""
    tdm = get_term_doc_matrix_without_categories()
    chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
    with self.assertRaises(NeedToInjectCoordinatesException):
        chart.to_dict_without_categories()
    term_doc_mat = tdm.get_term_doc_mat()
    x_coords = term_doc_mat.sum(axis=0).A1
    y_coords = term_doc_mat.astype(bool).astype(int).sum(axis=0).A1
    chart.inject_coordinates(original_x=x_coords,
                             original_y=y_coords,
                             x_coords=scale(x_coords),
                             y_coords=scale(y_coords))
    payload = chart.to_dict_without_categories()
    self.assertIsInstance(payload, dict)
    self.assertEqual(set(payload.keys()), set(['data']))
    self.assertEqual(len(payload['data']), tdm.get_num_terms())
    self.assertEqual(payload['data'][-1],
                     {'cat': 4, 'cat25k': 735, 'ox': 4, 'oy': 3,
                      'term': 'speak', 'x': 1.0, 'y': 1.0})
def produce_scattertext_html(term_doc_matrix,
                             category,
                             category_name,
                             not_category_name,
                             protocol='https',
                             pmi_filter_thresold=2,
                             minimum_term_frequency=3,
                             max_terms=None,
                             filter_unigrams=False,
                             height_in_pixels=None,
                             width_in_pixels=None,
                             term_ranker=termranking.AbsoluteFrequencyRanker):
    '''Returns html code of visualization.

    Parameters
    ----------
    term_doc_matrix : TermDocMatrix
        Corpus to use
    category : str
        name of category column
    category_name : str
        name of category to mine for
    not_category_name : str
        name of everything that isn't in category
    protocol : str, optional
        protocol used to load assets, 'http' or 'https'
    pmi_filter_thresold : int, optional
        Filter out bigrams with a PMI below 2 * this coefficient.
        NOTE(review): parameter name contains a typo ("thresold") but is
        part of the public keyword interface and cannot be renamed safely.
    minimum_term_frequency : int, optional
        Minimum number of times a word needs to appear to make it into
        the visualization.  Default 3.
    max_terms : int, optional
        Maximum number of terms to include in the visualization.
    filter_unigrams : bool, default False
        Do we filter unigrams that only occur in one bigram?
    width_in_pixels : int
        width of viz in pixels, if None, default to JS's choice
    height_in_pixels : int
        height of viz in pixels, if None, default to JS's choice
    term_ranker : TermRanker
        TermRanker class for determining term frequency ranks.

    Returns
    -------
    str, html of visualization
    '''
    # Build the chart JSON, ranking terms alphabetically within percentiles.
    scatter_chart_data = ScatterChart(term_doc_matrix=term_doc_matrix,
                                      minimum_term_frequency=minimum_term_frequency,
                                      pmi_threshold_coefficient=pmi_filter_thresold,
                                      filter_unigrams=filter_unigrams,
                                      max_terms=max_terms,
                                      term_ranker=term_ranker) \
        .to_dict(category=category,
                 category_name=category_name,
                 not_category_name=not_category_name,
                 transform=percentile_alphabetical)
    # Wrap the data in the HTML/JS scaffolding.
    html = HTMLVisualizationAssembly(VizDataAdapter(scatter_chart_data),
                                     width_in_pixels,
                                     height_in_pixels).to_html(protocol=protocol)
    return html
def test_max_terms(self):
    """max_terms caps the term count; a huge or absent cap keeps all 51 terms."""
    tdm = build_hamlet_jz_term_doc_mat()
    for limit, expected_count in [(2, 2), (10, 10), (10000, 51), (None, 51)]:
        payload = ScatterChart(term_doc_matrix=tdm,
                               minimum_term_frequency=0,
                               max_terms=limit).to_dict('hamlet')
        self.assertEqual(expected_count, len(payload['data']))
def test_inject_metadata_descriptions(self):
    """inject_metadata_descriptions validates its input and adds 'metadescriptions'."""
    corpus = build_hamlet_jz_corpus_with_meta()
    chart = ScatterChart(term_doc_matrix=corpus, minimum_term_frequency=0)
    with self.assertRaises(AssertionError):
        chart.inject_metadata_descriptions(3323)
    if sys.version_info > (3, 0):
        # Unknown metadata terms must be reported by name.
        with self.assertRaisesRegex(
                Exception,
                'The following meta data terms are not present: blah'):
            chart.inject_metadata_descriptions(
                {'blah': 'asjdkflasdjklfsadjk jsdkafsd'})
        with self.assertRaisesRegex(
                Exception,
                'The following meta data terms are not present: cat2'):
            chart.inject_metadata_descriptions(
                {'cat1': 'asjdkflasdjklfsadjk jsdkafsd', 'cat2': 'asdf'})
    # The method is fluent: it returns the chart itself.
    assert chart == chart.inject_metadata_descriptions(
        {'cat1': 'asjdkflasdjklfsadjk jsdkafsd'})
    payload = chart.to_dict('hamlet')
    self.assertEqual(set(payload.keys()),
                     set(['info', 'data', 'metadescriptions']))
def test_score_transform(self):
    """The identity score_transform must yield different scores than the default."""
    corpus = get_test_corpus()
    baseline = ScatterChart(term_doc_matrix=corpus,
                            minimum_term_frequency=0).to_dict('hamlet')
    transformed = ScatterChart(term_doc_matrix=corpus,
                               minimum_term_frequency=0,
                               score_transform=lambda value: value).to_dict('hamlet')
    assert (sum(d['s'] for d in baseline['data'])
            != sum(d['s'] for d in transformed['data']))
def test_to_json_use_non_text_features(self):
    """With use_non_text_features, the chart plots metadata terms and stays JSON-safe."""
    corpus = build_hamlet_jz_corpus_with_meta()
    payload = ScatterChart(term_doc_matrix=corpus,
                           minimum_term_frequency=0,
                           use_non_text_features=True).to_dict('hamlet')
    self.assertEqual(set(payload.keys()), set(['info', 'data']))
    self.assertEqual(set(payload['info'].keys()),
                     set(['not_category_name', 'category_name',
                          'category_terms', 'not_category_terms',
                          'category_internal_name']))
    self.assertEqual({t['term'] for t in payload['data']},
                     {'cat6', 'cat4', 'cat9', 'cat5', 'cat1', 'cat3', 'cat2'})
    # Must serialize without error.
    json.dumps(payload)
def to_dict(self,
            category,
            category_name=None,
            not_category_name=None,
            scores=None,
            metadata=None,
            max_docs_per_category=None,
            transform=percentile_alphabetical):
    '''Build the chart JSON structure, including a docs section.

    :param category: Category to annotate
    :param category_name: Name of category which will appear on web site.
    :param not_category_name: Name of non-category axis which will appear on web site.
    :param scores: Scores to use.  Default to Scaled F-Score.
    :param metadata: None or array-like.  List of metadata for each document.
    :param max_docs_per_category: None or int.  Maximum number of documents to store per category.
    :param transform: Defaults to percentile_lexicographic
    :return: dictionary
        {info: {category_name: ..., not_category_name},
         docs: {'texts': [doc1text, ...],
                'labels': [1, 0, ...],
                'meta': ['<b>blah</b>', '<b>blah</b>']},
         data: {term:, x:frequency [0-1], y:frequency [0-1],
                s: score, bg: background score, as: association score,
                cat25k: freq per 25k in category, cat: count in category,
                ncat: count in non-category,
                catdocs: [docnum, ...], ncatdocs: [docnum, ...],
                ncat25k: freq per 25k in non-category}}
    '''
    # Base class builds the term chart; this subclass attaches documents.
    j = ScatterChart.to_dict(self,
                             category,
                             category_name=category_name,
                             not_category_name=not_category_name,
                             scores=scores,
                             transform=transform)
    docs_getter = self._make_docs_getter(max_docs_per_category)
    j['docs'] = self._get_docs_structure(docs_getter, metadata)
    return j
def test_to_dict_without_categories(self):
    """Without injected coordinates the category-free path raises; with them it emits data."""
    matrix = get_term_doc_matrix_without_categories()
    chart = ScatterChart(term_doc_matrix=matrix, minimum_term_frequency=0)
    with self.assertRaises(NeedToInjectCoordinatesException):
        chart.to_dict_without_categories()
    doc_mat = matrix.get_term_doc_mat()
    totals = doc_mat.sum(axis=0).A1
    doc_counts = doc_mat.astype(bool).astype(int).sum(axis=0).A1
    chart.inject_coordinates(original_x=totals,
                             original_y=doc_counts,
                             x_coords=scale(totals),
                             y_coords=scale(doc_counts))
    result = chart.to_dict_without_categories()
    self.assertIsInstance(result, dict)
    self.assertEqual(set(result.keys()), set(['data']))
    self.assertEqual(len(result['data']), matrix.get_num_terms())
    self.assertEqual(result['data'][-1],
                     {'cat': 4, 'cat25k': 735, 'ox': 4, 'oy': 3,
                      'term': 'speak', 'x': 1.0, 'y': 1.0})
def test_inject_coordinates(self):
    """inject_coordinates rejects wrong-length or out-of-[0,1] coordinates."""
    tdm = build_hamlet_jz_term_doc_mat()
    freq_df = tdm.get_term_freq_df()
    chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
    with self.assertRaises(CoordinatesNotRightException):
        chart.inject_coordinates([], [])
    with self.assertRaises(CoordinatesNotRightException):
        chart.inject_coordinates(freq_df[freq_df.columns[0]], [])
    with self.assertRaises(CoordinatesNotRightException):
        chart.inject_coordinates([], freq_df[freq_df.columns[0]])
    # np.float was a deprecated alias for the builtin float and was removed
    # in NumPy 1.24; use the builtin directly.
    x = freq_df[freq_df.columns[1]].astype(float)
    y = freq_df[freq_df.columns[0]].astype(float)
    # Unscaled or negative coordinates must all be rejected.
    for bad_x, bad_y in [(x, y),
                         (x, y / y.max()),
                         (x / x.max(), y),
                         (-x / x.max(), -y / y.max()),
                         (-x / x.max(), y / y.max()),
                         (x / x.max(), -y / y.max())]:
        with self.assertRaises(CoordinatesNotRightException):
            chart.inject_coordinates(bad_x, bad_y)
    # Properly scaled coordinates are accepted.
    chart.inject_coordinates(x / x.max(), y / y.max())
def _add_term_freq_to_json_df(self, json_df, term_freq_df, category):
    '''Add per-category ('cat') and non-category ('ncat') counts to json_df.

    Parameters
    ----------
    json_df : pd.DataFrame
        Frame being assembled for JSON output; mutated in place.
    term_freq_df : pd.DataFrame
        Term-frequency frame with '<category> freq' and 'not cat freq' columns.
    category : str
        Name of the focal category.
    '''
    ScatterChart._add_term_freq_to_json_df(self, json_df, term_freq_df, category)
    # np.int was a deprecated alias for the builtin int and was removed in
    # NumPy 1.24; use the builtin directly.
    json_df['cat'] = term_freq_df[category + ' freq'].astype(int)
    json_df['ncat'] = term_freq_df['not cat freq'].astype(int)
def to_dict(self,
            category,
            category_name=None,
            not_category_name=None,
            scores=None,
            metadata=None,
            max_docs_per_category=None,
            transform=percentile_alphabetical,
            alternative_text_field=None,
            title_case_names=False,
            not_categories=None,
            neutral_categories=None,
            extra_categories=None,
            neutral_category_name=None,
            extra_category_name=None,
            background_scorer=None,
            include_term_category_counts=False):
    '''Build the chart JSON structure, including docs and optional term counts.

    Parameters
    ----------
    category : str
        Category to annotate.  Exact value of category.
    category_name : str, optional
        Name of category which will appear on web site. Default None is same as category.
    not_category_name : str, optional
        Name of ~category which will appear on web site. Default None is same as "not " + category.
    scores : np.array, optional
        Scores to use for coloring.  Defaults to None, or RankDifference scores.
    metadata : None or array-like, optional
        List of metadata for each document.  Defaults to a list of blank strings.
    max_docs_per_category : None or int, optional
        Maximum number of documents to store per category.  Defaults to 4.
    transform : function, optional
        Function for ranking terms.  Defaults to scattertext.Scalers.percentile_lexicographic.
    alternative_text_field : str or None, optional
        Field in dataframe used to make corpus to display in place of
        parsed text.  Only can be used if corpus is a ParsedCorpus instance.
    title_case_names : bool, default False
        Should the program title-case the category and not-category names?
    not_categories : list, optional
        List of categories to use as "not category".  Defaults to all others.
    neutral_categories : list, optional
        List of categories to use as neutral.  Defaults to [].
    extra_categories : list, optional
        List of categories to use as extra.  Defaults to [].
    neutral_category_name : str
        "Neutral" by default. Only active if show_neutral is True.
        Name of the neutral column.
    extra_category_name : str
        "Extra" by default. Only active if show_neutral and show_extra are
        true.  Name of the extra column.
    background_scorer : CharacteristicScorer, optional
        Used for bg scores.
    include_term_category_counts : bool, default False
        Includes term-category counts keyed off 'termCounts'.
        If use_non_text_features, use metadata counts instead.

    Returns
    -------
    dictionary
        {info: {'category_name': full category name, ...},
         docs: {'texts': [doc1text, ...],
                'labels': [1, 0, ...],
                'meta': ['<b>blah</b>', '<b>blah</b>']},
         // if include_term_category_counts
         termCounts: [term num -> [total occurrences, total documents, variance],
                      ... for the number of categories],
         data: {term:term, x:frequency [0-1], y:frequency [0-1],
                s: score, bg: background score, as: association score,
                cat25k: freq per 25k in category, cat: count in category,
                ncat: count in non-category,
                catdocs: [docnum, ...], ncatdocs: [docnum, ...],
                ncat25k: freq per 25k in non-category}}
    '''
    # Base class builds the term chart.
    json_data = ScatterChart.to_dict(self,
                                     category,
                                     category_name=category_name,
                                     not_category_name=not_category_name,
                                     scores=scores,
                                     transform=transform,
                                     title_case_names=title_case_names,
                                     not_categories=not_categories,
                                     neutral_categories=neutral_categories,
                                     extra_categories=extra_categories,
                                     background_scorer=background_scorer)
    docs_getter = self._make_docs_getter(max_docs_per_category, alternative_text_field)
    # Default display names for the optional neutral/extra columns.
    if neutral_category_name is None:
        neutral_category_name = 'Neutral'
    if extra_category_name is None:
        extra_category_name = 'Extra'
    json_data['docs'] = self._get_docs_structure(docs_getter, metadata)
    json_data['info']['neutral_category_name'] = neutral_category_name
    json_data['info']['extra_category_name'] = extra_category_name
    if include_term_category_counts:
        terms = np.array([term_struct['term'] for term_struct in json_data['data']])
        json_data['termCounts'] = self._get_term_doc_counts(terms)
    return json_data
def test_inject_metadata_term_lists(self):
    """inject_metadata_term_lists validates input and adds 'metalists' to output."""
    chart = ScatterChart(term_doc_matrix=build_hamlet_jz_term_doc_mat(),
                         minimum_term_frequency=0)
    with self.assertRaises(TermDocMatrixHasNoMetadataException):
        chart.inject_metadata_term_lists({'blah': ['a', 'adsf', 'asfd']})
    chart = ScatterChart(term_doc_matrix=build_hamlet_jz_corpus_with_meta(),
                         minimum_term_frequency=0,
                         use_non_text_features=True)
    # Keys must be strings and values must be lists of strings.
    for bad_input in ({'blash': [3, 1]},
                      {3: ['a', 'b']},
                      {'a': {'a', 'b'}},
                      3):
        with self.assertRaises(TypeError):
            chart.inject_metadata_term_lists(bad_input)
    self.assertEqual(type(chart.inject_metadata_term_lists({'a': ['a', 'b']})),
                     ScatterChart)
    payload = chart.to_dict('hamlet')
    self.assertEqual(set(payload.keys()), set(['info', 'data', 'metalists']))
    self.assertEqual(set(payload['info'].keys()),
                     set(['not_category_name', 'category_name',
                          'category_terms', 'not_category_terms',
                          'category_internal_name',
                          'not_category_internal_names',
                          'extra_category_internal_names',
                          'neutral_category_internal_names', 'categories']))
def test_resuse_is_disabled(self):
    """Calling to_dict a second time on the same chart must raise."""
    chart = ScatterChart(term_doc_matrix=get_test_corpus(),
                         minimum_term_frequency=0)
    chart.to_dict('hamlet')
    with self.assertRaises(Exception):
        chart.to_dict('hamlet')
def test_resuse_is_disabled(self):
    """A ScatterChart is single-use: the second to_dict call raises."""
    test_corpus = get_test_corpus()
    single_use_chart = ScatterChart(term_doc_matrix=test_corpus,
                                    minimum_term_frequency=0)
    single_use_chart.to_dict('hamlet')
    with self.assertRaises(Exception):
        single_use_chart.to_dict('hamlet')
def test_inject_metadata_term_lists(self):
    """Term-list injection type-checks its argument and surfaces 'metalists'."""
    no_meta_chart = ScatterChart(term_doc_matrix=build_hamlet_jz_term_doc_mat(),
                                 minimum_term_frequency=0)
    with self.assertRaises(TermDocMatrixHasNoMetadataException):
        no_meta_chart.inject_metadata_term_lists({'blah': ['a', 'adsf', 'asfd']})
    meta_chart = ScatterChart(term_doc_matrix=build_hamlet_jz_corpus_with_meta(),
                              minimum_term_frequency=0,
                              use_non_text_features=True)
    bad_inputs = [{'blash': [3, 1]}, {3: ['a', 'b']}, {'a': {'a', 'b'}}, 3]
    for bad in bad_inputs:
        with self.assertRaises(TypeError):
            meta_chart.inject_metadata_term_lists(bad)
    # Valid input returns the chart itself (fluent API).
    self.assertEqual(
        type(meta_chart.inject_metadata_term_lists({'a': ['a', 'b']})),
        ScatterChart)
    rendered = meta_chart.to_dict('hamlet')
    self.assertEqual(set(rendered.keys()), set(['info', 'data', 'metalists']))
    self.assertEqual(set(rendered['info'].keys()),
                     set(['not_category_name', 'category_name',
                          'category_terms', 'not_category_terms',
                          'category_internal_name',
                          'not_category_internal_names',
                          'extra_category_internal_names',
                          'neutral_category_internal_names', 'categories']))
def test_inject_coordinates(self):
    """Coordinate injection rejects wrong lengths and values outside [0, 1]."""
    tdm = build_hamlet_jz_term_doc_mat()
    freq_df = tdm.get_term_freq_df()
    chart = ScatterChart(term_doc_matrix=tdm, minimum_term_frequency=0)
    # Length mismatches with the term list are rejected.
    with self.assertRaises(CoordinatesNotRightException):
        chart.inject_coordinates([], [])
    with self.assertRaises(CoordinatesNotRightException):
        chart.inject_coordinates(freq_df[freq_df.columns[0]], [])
    with self.assertRaises(CoordinatesNotRightException):
        chart.inject_coordinates([], freq_df[freq_df.columns[0]])
    # np.float was removed in NumPy 1.24 (deprecated alias of builtin float).
    x = freq_df[freq_df.columns[1]].astype(float)
    y = freq_df[freq_df.columns[0]].astype(float)
    bad_pairs = [(x, y),
                 (x, y / y.max()),
                 (x / x.max(), y),
                 (-x / x.max(), -y / y.max()),
                 (-x / x.max(), y / y.max()),
                 (x / x.max(), -y / y.max())]
    for bad_x, bad_y in bad_pairs:
        with self.assertRaises(CoordinatesNotRightException):
            chart.inject_coordinates(bad_x, bad_y)
    # Both axes scaled into [0, 1]: accepted.
    chart.inject_coordinates(x / x.max(), y / y.max())
def _add_term_freq_to_json_df(self, json_df, term_freq_df, category):
    '''Augment json_df with category counts and, if set, per-term metadata.

    Parameters
    ----------
    json_df : pd.DataFrame
        Frame being assembled for JSON output; mutated in place.
    term_freq_df : pd.DataFrame
        Term-frequency frame with '<category> freq', 'not cat freq',
        and 'term' columns.
    category : str
        Name of the focal category.
    '''
    ScatterChart._add_term_freq_to_json_df(self, json_df, term_freq_df, category)
    # np.int was a deprecated alias for the builtin int and was removed in
    # NumPy 1.24; use the builtin directly.
    json_df['cat'] = term_freq_df[category + ' freq'].astype(int)
    json_df['ncat'] = term_freq_df['not cat freq'].astype(int)
    if self._term_metadata is not None:
        # Terms without injected metadata map to an empty dict.
        json_df['etc'] = term_freq_df['term'].apply(
            lambda term: self._term_metadata.get(term, {}))
def to_dict(self,
            category,
            category_name=None,
            not_category_name=None,
            scores=None,
            metadata=None,
            max_docs_per_category=None,
            transform=percentile_alphabetical,
            alternative_text_field=None,
            title_case_names=False,
            not_categories=None,
            neutral_categories=None,
            extra_categories=None,
            neutral_category_name=None,
            extra_category_name=None,
            background_scorer=None,
            include_term_category_counts=False):
    '''Build the chart JSON structure, including docs, optional term counts,
    and per-term metadata.

    Parameters
    ----------
    category : str
        Category to annotate.  Exact value of category.
    category_name : str, optional
        Name of category which will appear on web site. Default None is same as category.
    not_category_name : str, optional
        Name of ~category which will appear on web site. Default None is same as "not " + category.
    scores : np.array, optional
        Scores to use for coloring.  Defaults to None, or RankDifference scores.
    metadata : None or array-like, optional
        List of metadata for each document.  Defaults to a list of blank strings.
    max_docs_per_category : None or int, optional
        Maximum number of documents to store per category.  Defaults to 4.
    transform : function, optional
        Function for ranking terms.  Defaults to scattertext.Scalers.percentile_lexicographic.
    alternative_text_field : str or None, optional
        Field in dataframe used to make corpus to display in place of
        parsed text.  Only can be used if corpus is a ParsedCorpus instance.
    title_case_names : bool, default False
        Should the program title-case the category and not-category names?
    not_categories : list, optional
        List of categories to use as "not category".  Defaults to all others.
    neutral_categories : list, optional
        List of categories to use as neutral.  Defaults to [].
    extra_categories : list, optional
        List of categories to use as extra.  Defaults to [].
    neutral_category_name : str
        "Neutral" by default. Only active if show_neutral is True.
        Name of the neutral column.
    extra_category_name : str
        "Extra" by default. Only active if show_neutral and show_extra are
        true.  Name of the extra column.
    background_scorer : CharacteristicScorer, optional
        Used for bg scores.
    include_term_category_counts : bool, default False
        Includes term-category counts keyed off 'termCounts'.
        If use_non_text_features, use metadata counts instead.

    Returns
    -------
    dictionary
        {info: {'category_name': full category name, ...},
         docs: {'texts': [doc1text, ...],
                'labels': [1, 0, ...],
                'meta': ['<b>blah</b>', '<b>blah</b>']},
         // if include_term_category_counts
         termCounts: [term num -> [total occurrences, total documents, variance],
                      ... for the number of categories],
         data: {term:term, x:frequency [0-1], y:frequency [0-1],
                s: score, bg: background score, as: association score,
                cat25k: freq per 25k in category, cat: count in category,
                ncat: count in non-category,
                catdocs: [docnum, ...], ncatdocs: [docnum, ...],
                ncat25k: freq per 25k in non-category,
                etc: term-specific dictionary (if inject_term_metadata is
                     called and contains terms)}}
    '''
    # Base class builds the term chart.
    json_data = ScatterChart.to_dict(self,
                                     category,
                                     category_name=category_name,
                                     not_category_name=not_category_name,
                                     scores=scores,
                                     transform=transform,
                                     title_case_names=title_case_names,
                                     not_categories=not_categories,
                                     neutral_categories=neutral_categories,
                                     extra_categories=extra_categories,
                                     background_scorer=background_scorer)
    docs_getter = self._make_docs_getter(max_docs_per_category, alternative_text_field)
    # Default display names for the optional neutral/extra columns.
    if neutral_category_name is None:
        neutral_category_name = 'Neutral'
    if extra_category_name is None:
        extra_category_name = 'Extra'
    json_data['docs'] = self._get_docs_structure(docs_getter, metadata)
    json_data['info']['neutral_category_name'] = neutral_category_name
    json_data['info']['extra_category_name'] = extra_category_name
    if include_term_category_counts:
        terms = np.array([term_struct['term'] for term_struct in json_data['data']])
        json_data['termCounts'] = self._get_term_doc_counts(terms)
    return json_data