Example #1
0
def test_filter_terms_by_df_min_df(vectorizer_and_dtm):
    """Filter with ``min_df=2`` and check the resulting shape and vocabulary keys.

    The filtered matrix is expected to have shape ``(8, 7)`` and the returned
    vocabulary's sorted keys must equal the seven expected terms.
    """
    vectorizer, doc_term_matrix = vectorizer_and_dtm
    expected_terms = ['-PRON-', 'child', 'lamb', 'love', 'mary', 'school', 'teacher']
    filtered_dtm, filtered_vocab = vsm.filter_terms_by_df(
        doc_term_matrix,
        vectorizer.vocabulary_terms,
        max_df=1.0,
        min_df=2,
        max_n_terms=None,
    )
    assert filtered_dtm.shape == (8, 7)
    assert sorted(filtered_vocab.keys()) == expected_terms
Example #2
0
 def test_filter_terms_by_df_min_df(self):
     """Filter with ``min_df=2`` and check the shape and id-to-term values.

     The filtered matrix is expected to have shape ``(8, 6)`` and the sorted
     values of the returned mapping must equal the six expected terms.
     """
     expected_terms = ['child', 'lamb', 'love', 'mary', 'school', 'teacher']
     filtered_dtm, filtered_i2t = vsm.filter_terms_by_df(
         self.doc_term_matrix,
         self.id_to_term,
         max_df=1.0,
         min_df=2,
         max_n_terms=None)
     self.assertEqual(filtered_dtm.shape, (8, 6))
     self.assertEqual(sorted(filtered_i2t.values()), expected_terms)
Example #3
0
def test_filter_terms_by_df_max_n_terms(vectorizer_and_dtm):
    """Filter with ``max_n_terms=2`` and check the resulting shape and vocabulary keys.

    The filtered matrix is expected to have shape ``(8, 2)`` and the returned
    vocabulary's sorted keys must be ``['lamb', 'mary']``.
    """
    vectorizer, doc_term_matrix = vectorizer_and_dtm
    filter_options = {"max_df": 1.0, "min_df": 1, "max_n_terms": 2}
    filtered_dtm, filtered_vocab = vsm.filter_terms_by_df(
        doc_term_matrix, vectorizer.vocabulary_terms, **filter_options
    )
    kept_terms = sorted(filtered_vocab.keys())
    assert filtered_dtm.shape == (8, 2)
    assert kept_terms == ['lamb', 'mary']
Example #4
0
def test_filter_terms_by_df_identity(vectorizer_and_dtm):
    """Filter with ``max_df=1.0``, ``min_df=1`` and no term cap; expect no change.

    The returned matrix must keep the input's shape and the returned
    vocabulary must compare equal to ``vectorizer.vocabulary_terms``.
    """
    vectorizer, doc_term_matrix = vectorizer_and_dtm
    original_vocab = vectorizer.vocabulary_terms
    filtered_dtm, filtered_vocab = vsm.filter_terms_by_df(
        doc_term_matrix,
        original_vocab,
        max_df=1.0,
        min_df=1,
        max_n_terms=None,
    )
    assert filtered_dtm.shape == doc_term_matrix.shape
    assert filtered_vocab == vectorizer.vocabulary_terms
Example #5
0
 def test_filter_terms_by_df_identity(self):
     """Filter with ``max_df=1.0``, ``min_df=1`` and no term cap; expect no change.

     The returned matrix must keep the shape of ``self.doc_term_matrix`` and
     the returned mapping must compare equal to ``self.id_to_term``.
     """
     filtered_dtm, filtered_i2t = vsm.filter_terms_by_df(
         self.doc_term_matrix, self.id_to_term,
         max_df=1.0, min_df=1, max_n_terms=None)
     self.assertEqual(filtered_dtm.shape, self.doc_term_matrix.shape)
     self.assertEqual(filtered_i2t, self.id_to_term)
Example #6
0
 def test_filter_terms_by_df_max_n_terms(self):
     """Filter with ``max_n_terms=2`` and check the shape and id-to-term values.

     The filtered matrix is expected to have shape ``(8, 2)`` and the sorted
     values of the returned mapping must be ``['lamb', 'mary']``.
     """
     filtered_dtm, filtered_i2t = vsm.filter_terms_by_df(
         self.doc_term_matrix, self.id_to_term,
         max_df=1.0, min_df=1, max_n_terms=2)
     kept_terms = sorted(filtered_i2t.values())
     self.assertEqual(filtered_dtm.shape, (8, 2))
     self.assertEqual(kept_terms, ['lamb', 'mary'])
Example #7
0
def test_filter_terms_by_df_exception(vectorizer_and_dtm):
    """Expect ``ValueError`` when filtering with ``min_df=6`` on the fixture matrix."""
    vectorizer, doc_term_matrix = vectorizer_and_dtm
    with pytest.raises(ValueError):
        # The return value is irrelevant here; only the raised error matters.
        vsm.filter_terms_by_df(
            doc_term_matrix,
            vectorizer.vocabulary_terms,
            max_df=1.0,
            min_df=6,
            max_n_terms=None,
        )
Example #8
0
 def test_filter_terms_by_df_max_n_terms(self):
     """Filter with ``max_n_terms=2`` and check the shape and vocabulary keys.

     The filtered matrix is expected to have shape ``(8, 2)`` and the sorted
     keys of the returned vocabulary must be ``['lamb', 'mary']``.
     """
     filtered_dtm, filtered_vocab = vsm.filter_terms_by_df(
         self.doc_term_matrix, self.vectorizer.vocabulary,
         max_df=1.0, min_df=1, max_n_terms=2)
     kept_terms = sorted(filtered_vocab.keys())
     self.assertEqual(filtered_dtm.shape, (8, 2))
     self.assertEqual(kept_terms, ['lamb', 'mary'])
Example #9
0
 def test_filter_terms_by_df_identity(self):
     """Filter with ``max_df=1.0``, ``min_df=1`` and no term cap; expect no change.

     The returned matrix must keep the shape of ``self.doc_term_matrix`` and
     the returned vocabulary must compare equal to ``self.vectorizer.vocabulary``.
     """
     filtered_dtm, filtered_vocab = vsm.filter_terms_by_df(
         self.doc_term_matrix, self.vectorizer.vocabulary,
         max_df=1.0, min_df=1, max_n_terms=None)
     self.assertEqual(filtered_dtm.shape, self.doc_term_matrix.shape)
     self.assertEqual(filtered_vocab, self.vectorizer.vocabulary)
Example #10
0
 def test_filter_terms_by_df_min_df(self):
     """Filter with ``min_df=2`` and check the shape and id-to-term values.

     The filtered matrix is expected to have shape ``(8, 6)`` and the sorted
     values of the returned mapping must equal the six expected terms.
     """
     expected_terms = ['child', 'lamb', 'love', 'mary', 'school', 'teacher']
     filtered_dtm, filtered_i2t = vsm.filter_terms_by_df(
         self.doc_term_matrix, self.id_to_term,
         max_df=1.0, min_df=2, max_n_terms=None)
     self.assertEqual(filtered_dtm.shape, (8, 6))
     self.assertEqual(sorted(filtered_i2t.values()), expected_terms)
Example #11
0
 def test_filter_terms_by_df_min_df(self):
     """Filter with ``min_df=2`` and check the shape and vocabulary keys.

     The filtered matrix is expected to have shape ``(8, 7)`` and the sorted
     keys of the returned vocabulary must equal the seven expected terms.
     """
     expected_terms = ['-PRON-', 'child', 'lamb', 'love', 'mary', 'school', 'teacher']
     filtered_dtm, filtered_vocab = vsm.filter_terms_by_df(
         self.doc_term_matrix, self.vectorizer.vocabulary,
         max_df=1.0, min_df=2, max_n_terms=None)
     self.assertEqual(filtered_dtm.shape, (8, 7))
     self.assertEqual(sorted(filtered_vocab.keys()), expected_terms)
Example #12
0
def test_filter_terms_by_df_min_df(vectorizer_and_dtm):
    """Filter with ``min_df=2`` and check the result against the input matrix.

    The row count must be unchanged, the column count must be strictly
    smaller, and each of the listed terms must still be in the vocabulary.
    """
    vectorizer, doc_term_matrix = vectorizer_and_dtm
    filter_options = {"max_df": 1.0, "min_df": 2, "max_n_terms": None}
    filtered_dtm, filtered_vocab = vsm.filter_terms_by_df(
        doc_term_matrix, vectorizer.vocabulary_terms, **filter_options
    )
    assert filtered_dtm.shape[0] == doc_term_matrix.shape[0]
    assert filtered_dtm.shape[1] < doc_term_matrix.shape[1]
    for term in ("children", "lamb", "mary", "school", "teacher"):
        assert term in filtered_vocab
Example #13
0
 def test_filter_terms_by_df_max_n_terms(self):
     """Filter with ``max_n_terms=2`` and check the shape and id-to-term values.

     The filtered matrix is expected to have shape ``(8, 2)`` and the sorted
     values of the returned mapping must be ``['lamb', 'mary']``.
     """
     filtered_dtm, filtered_i2t = vsm.filter_terms_by_df(
         self.doc_term_matrix,
         self.id_to_term,
         max_df=1.0,
         min_df=1,
         max_n_terms=2)
     kept_terms = sorted(filtered_i2t.values())
     self.assertEqual(filtered_dtm.shape, (8, 2))
     self.assertEqual(kept_terms, ['lamb', 'mary'])
Example #14
0
 def test_filter_terms_by_df_identity(self):
     """Filter with ``max_df=1.0``, ``min_df=1`` and no term cap; expect no change.

     The returned matrix must keep the shape of ``self.doc_term_matrix`` and
     the returned mapping must compare equal to ``self.id_to_term``.
     """
     filtered_dtm, filtered_i2t = vsm.filter_terms_by_df(
         self.doc_term_matrix,
         self.id_to_term,
         max_df=1.0,
         min_df=1,
         max_n_terms=None)
     self.assertEqual(filtered_dtm.shape, self.doc_term_matrix.shape)
     self.assertEqual(filtered_i2t, self.id_to_term)