def test_filter_terms_by_df_min_df(vectorizer_and_dtm):
    """Filter with ``min_df=2`` and check the resulting shape and vocabulary keys.

    Expects an 8 x 7 matrix and exactly the seven listed terms (sorted).
    """
    vec, full_dtm = vectorizer_and_dtm
    expected_terms = ['-PRON-', 'child', 'lamb', 'love', 'mary', 'school', 'teacher']

    filtered_dtm, filtered_vocab = vsm.filter_terms_by_df(
        full_dtm,
        vec.vocabulary_terms,
        max_df=1.0,
        min_df=2,
        max_n_terms=None,
    )

    assert filtered_dtm.shape == (8, 7)
    assert sorted(filtered_vocab.keys()) == expected_terms
def test_filter_terms_by_df_min_df(self):
    """Filter with ``min_df=2`` and check the shape and the id-to-term values.

    Expects an 8 x 6 matrix and exactly the six listed terms (sorted).
    """
    expected_terms = ['child', 'lamb', 'love', 'mary', 'school', 'teacher']

    filtered_dtm, filtered_i2t = vsm.filter_terms_by_df(
        self.doc_term_matrix,
        self.id_to_term,
        max_df=1.0,
        min_df=2,
        max_n_terms=None,
    )

    self.assertEqual(filtered_dtm.shape, (8, 6))
    self.assertEqual(sorted(filtered_i2t.values()), expected_terms)
def test_filter_terms_by_df_max_n_terms(vectorizer_and_dtm):
    """Filter with ``max_n_terms=2`` and check that only two terms remain.

    Expects an 8 x 2 matrix whose vocabulary keys are 'lamb' and 'mary'.
    """
    vec, full_dtm = vectorizer_and_dtm
    expected_terms = ['lamb', 'mary']

    filtered_dtm, filtered_vocab = vsm.filter_terms_by_df(
        full_dtm,
        vec.vocabulary_terms,
        max_df=1.0,
        min_df=1,
        max_n_terms=2,
    )

    assert filtered_dtm.shape == (8, 2)
    assert sorted(filtered_vocab.keys()) == expected_terms
def test_filter_terms_by_df_identity(vectorizer_and_dtm):
    """Filter with ``max_df=1.0``, ``min_df=1`` and no term cap.

    Expects the returned matrix shape and vocabulary to equal the inputs.
    """
    vec, full_dtm = vectorizer_and_dtm

    filtered_dtm, filtered_vocab = vsm.filter_terms_by_df(
        full_dtm,
        vec.vocabulary_terms,
        max_df=1.0,
        min_df=1,
        max_n_terms=None,
    )

    assert filtered_dtm.shape == full_dtm.shape
    assert filtered_vocab == vec.vocabulary_terms
def test_filter_terms_by_df_identity(self):
    """Filter with ``max_df=1.0``, ``min_df=1`` and no term cap.

    Expects the returned matrix shape and id-to-term mapping to equal the inputs.
    """
    filtered_dtm, filtered_i2t = vsm.filter_terms_by_df(
        self.doc_term_matrix,
        self.id_to_term,
        max_df=1.0,
        min_df=1,
        max_n_terms=None,
    )

    self.assertEqual(filtered_dtm.shape, self.doc_term_matrix.shape)
    self.assertEqual(filtered_i2t, self.id_to_term)
def test_filter_terms_by_df_max_n_terms(self):
    """Filter with ``max_n_terms=2`` and check that only two terms remain.

    Expects an 8 x 2 matrix whose id-to-term values are 'lamb' and 'mary'.
    """
    expected_terms = ['lamb', 'mary']

    filtered_dtm, filtered_i2t = vsm.filter_terms_by_df(
        self.doc_term_matrix,
        self.id_to_term,
        max_df=1.0,
        min_df=1,
        max_n_terms=2,
    )

    self.assertEqual(filtered_dtm.shape, (8, 2))
    self.assertEqual(sorted(filtered_i2t.values()), expected_terms)
def test_filter_terms_by_df_exception(vectorizer_and_dtm):
    """Calling the filter with ``min_df=6`` on the fixture data must raise ValueError."""
    vec, full_dtm = vectorizer_and_dtm

    with pytest.raises(ValueError):
        # The return value is irrelevant here; only the raised exception is checked.
        vsm.filter_terms_by_df(
            full_dtm,
            vec.vocabulary_terms,
            max_df=1.0,
            min_df=6,
            max_n_terms=None,
        )
def test_filter_terms_by_df_max_n_terms(self):
    """Filter with ``max_n_terms=2`` and check that only two terms remain.

    Expects an 8 x 2 matrix whose vocabulary keys are 'lamb' and 'mary'.
    """
    expected_terms = ['lamb', 'mary']

    filtered_dtm, filtered_vocab = vsm.filter_terms_by_df(
        self.doc_term_matrix,
        self.vectorizer.vocabulary,
        max_df=1.0,
        min_df=1,
        max_n_terms=2,
    )

    self.assertEqual(filtered_dtm.shape, (8, 2))
    self.assertEqual(sorted(filtered_vocab.keys()), expected_terms)
def test_filter_terms_by_df_identity(self):
    """Filter with ``max_df=1.0``, ``min_df=1`` and no term cap.

    Expects the returned matrix shape and vocabulary to equal the inputs.
    """
    filtered_dtm, filtered_vocab = vsm.filter_terms_by_df(
        self.doc_term_matrix,
        self.vectorizer.vocabulary,
        max_df=1.0,
        min_df=1,
        max_n_terms=None,
    )

    self.assertEqual(filtered_dtm.shape, self.doc_term_matrix.shape)
    self.assertEqual(filtered_vocab, self.vectorizer.vocabulary)
def test_filter_terms_by_df_min_df(self):
    """Filter with ``min_df=2`` and check the resulting shape and vocabulary keys.

    Expects an 8 x 7 matrix and exactly the seven listed terms (sorted).
    """
    expected_terms = ['-PRON-', 'child', 'lamb', 'love', 'mary', 'school', 'teacher']

    filtered_dtm, filtered_vocab = vsm.filter_terms_by_df(
        self.doc_term_matrix,
        self.vectorizer.vocabulary,
        max_df=1.0,
        min_df=2,
        max_n_terms=None,
    )

    self.assertEqual(filtered_dtm.shape, (8, 7))
    self.assertEqual(sorted(filtered_vocab.keys()), expected_terms)
def test_filter_terms_by_df_min_df(vectorizer_and_dtm):
    """Filter with ``min_df=2`` and check the result relative to the input matrix.

    The row count must be unchanged, the column count must shrink, and each
    of the listed terms must still be present in the returned vocabulary.
    """
    vec, full_dtm = vectorizer_and_dtm
    required_terms = ("children", "lamb", "mary", "school", "teacher")

    filtered_dtm, filtered_vocab = vsm.filter_terms_by_df(
        full_dtm,
        vec.vocabulary_terms,
        max_df=1.0,
        min_df=2,
        max_n_terms=None,
    )

    assert filtered_dtm.shape[0] == full_dtm.shape[0]
    assert filtered_dtm.shape[1] < full_dtm.shape[1]
    assert all(term in filtered_vocab for term in required_terms)