def test_keep_only_these_categories(self):
    """Verify that ``keep_only_these_categories`` restricts a corpus to the given categories.

    The test checks that:

    * the filtered corpus reports exactly ``['hamlet', 'swift']`` as its categories,
    * the filtered corpus has strictly fewer terms than the full corpus,
    * asking for a category that is not in the corpus raises ``AssertionError``,
    * the same request does not raise when ``True`` is passed as the second
      positional argument.
    """
    # ``pd.np`` was deprecated in pandas 1.0 and removed in pandas 2.0, so the
    # fixture is built with numpy directly.
    df = pd.DataFrame(
        data=np.array(get_docs_categories_semiotic()).T,
        columns=['category', 'text'],
    )
    corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()

    hamlet_swift_corpus = corpus.keep_only_these_categories(['hamlet', 'swift'])
    self.assertEqual(hamlet_swift_corpus.get_categories(), ['hamlet', 'swift'])
    self.assertGreater(len(corpus.get_terms()),
                       len(hamlet_swift_corpus.get_terms()))

    # An unknown category name must be rejected by default.
    with self.assertRaises(AssertionError):
        corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'])

    # NOTE(review): the second positional argument is presumably an
    # "ignore unknown categories" flag -- confirm against the method signature.
    # The call only has to complete without raising.
    corpus.keep_only_these_categories(['hamlet', 'swift', 'asdjklasfd'], True)
def test_remove_categories(self):
    """Verify that ``remove_categories(['swift'])`` matches a corpus built without 'swift'.

    The corpus with 'swift' removed is compared against a reference corpus
    constructed from the same data frame with the 'swift' rows filtered out.
    Labels, matrix shape, term set and texts must all agree.
    """
    # ``pd.np`` was deprecated in pandas 1.0 and removed in pandas 2.0, so the
    # fixture is built with numpy directly.
    df = pd.DataFrame(
        data=np.array(get_docs_categories_semiotic()).T,
        columns=['category', 'text'],
    )
    corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
    swiftless = corpus.remove_categories(['swift'])
    swiftless_constructed = CorpusFromPandas(
        df[df['category'] != 'swift'], 'category', 'text', nlp=whitespace_nlp
    ).build()

    # Look up the label index of 'swift' once instead of once per document.
    swift_idx = corpus.get_categories().index('swift')
    expected_y = [label for label in corpus._y if label != swift_idx]
    np.testing.assert_equal(expected_y, swiftless._y)

    # One label per document row in the term-document matrix.
    self.assertEqual(swiftless._y.shape[0], swiftless._X.shape[0])
    self.assertEqual(swiftless_constructed._X.shape, swiftless._X.shape)
    self.assertEqual(set(swiftless_constructed.get_terms()),
                     set(swiftless.get_terms()))
    pd.testing.assert_series_equal(swiftless_constructed.get_texts(),
                                   swiftless.get_texts())
def test_remove_categories(self):
    """Check ``remove_categories`` against a corpus rebuilt from filtered input.

    Removing 'swift' from the full corpus must give the same document labels
    (minus the 'swift' ones), the same matrix shape, the same set of terms and
    the same texts as a corpus constructed from the data frame with the
    'swift' rows dropped beforehand.
    """
    # ``pandas.np`` no longer exists in pandas >= 2.0 (deprecated since 1.0);
    # numpy is used directly to build the (category, text) table.
    docs_and_categories = np.array(get_docs_categories_semiotic()).T
    df = pd.DataFrame(data=docs_and_categories, columns=['category', 'text'])

    corpus = CorpusFromPandas(df, 'category', 'text', nlp=whitespace_nlp).build()
    swiftless = corpus.remove_categories(['swift'])

    without_swift_df = df[df['category'] != 'swift']
    swiftless_constructed = CorpusFromPandas(
        without_swift_df, 'category', 'text', nlp=whitespace_nlp
    ).build()

    # The index of 'swift' is loop-invariant, so resolve it a single time.
    swift_label = corpus.get_categories().index('swift')
    np.testing.assert_equal(
        [y for y in corpus._y if y != swift_label],
        swiftless._y,
    )

    # Label vector and matrix must have the same number of documents.
    self.assertEqual(swiftless._y.shape[0], swiftless._X.shape[0])
    self.assertEqual(swiftless_constructed._X.shape, swiftless._X.shape)
    self.assertEqual(set(swiftless_constructed.get_terms()),
                     set(swiftless.get_terms()))
    pd.testing.assert_series_equal(swiftless_constructed.get_texts(),
                                   swiftless.get_texts())
not_category_name='Republican', minimum_term_frequency=5, width_in_pixels=1000, x_coords=frequencies_scaled, y_coords=zeta_scaled_for_charting, scores=zeta_i_j, sort_by_dist=False, metadata=convention_df['speaker'], x_label='Log Frequency', y_label='Log Odds Ratio w/ Prior (a_w=0.01)') ''' bg_df = (corpus.get_term_and_background_counts().where( lambda x: x.corpus > 0).dropna()) bg_df.background += bg_df.corpus corpus_bg = corpus.remove_terms(set(corpus.get_terms()) - set(bg_df.index)) priors = (corpus_bg.get_term_and_background_counts().reindex( corpus_bg.get_terms())['background']) term_scorer = LogOddsRatioInformativeDirichletPrior(priors.values, 10) tooltip_context = '''(function(d) { return d.term+"<br/>Count ratio (per 25k): "+d.cat25k+":"+d.ncat25k+"<br/>Z-score: "+ Number(Math.round(d.os+'e3')+'e-3'); })''' html = produce_fightin_words_explorer(corpus_bg, category='democrat', category_name='Democratic', not_category_name='Republican', minimum_term_frequency=5, get_tooltip_content=tooltip_context, term_scorer=term_scorer)