def test_get_term_freqs_sublinear(vectorizer_and_dtm, lamb_and_child_idxs):
    """Check 'sqrt' and 'log' term frequencies against the 'linear' ones.

    Uses the ``vectorizer_and_dtm`` and ``lamb_and_child_idxs`` fixtures;
    only the doc-term matrix of the former is needed here.
    """
    _vectorizer, dtm = vectorizer_and_dtm
    lamb_idx, child_idx = lamb_and_child_idxs
    tol = 1e-3

    linear = vsm.get_term_freqs(dtm, type_="linear")
    rooted = vsm.get_term_freqs(dtm, type_="sqrt")
    logged = vsm.get_term_freqs(dtm, type_="log")

    # Every variant must yield one value per column of the matrix.
    n_terms = dtm.shape[1]
    for freqs in (linear, rooted, logged):
        assert len(freqs) == n_terms

    # Spot-check the log-scaled values on the fixture data.
    assert logged.max() == pytest.approx(2.60943, abs=tol)
    assert logged.min() == pytest.approx(1.0, abs=tol)
    assert logged[lamb_idx] == pytest.approx(2.60943, abs=tol)
    assert logged[child_idx] == pytest.approx(1.69314, abs=tol)

    # The sublinear variants are expected to be exact element-wise
    # transforms of the linear frequencies.
    sqrt_matches = rooted == np.sqrt(linear)
    assert sqrt_matches.all()
    log_matches = logged == np.log(linear) + 1.0
    assert log_matches.all()
def test_get_term_freqs_normalized(self):
    """Check normalized term frequencies computed from ``self.doc_term_matrix``."""
    dtm = self.doc_term_matrix
    freqs = vsm.get_term_freqs(dtm, normalized=True)

    # One frequency per column of the matrix.
    self.assertEqual(len(freqs), dtm.shape[1])

    # Extremes of the normalized values, to four decimal places.
    highest = freqs.max()
    self.assertAlmostEqual(highest, 0.19230, places=4)
    lowest = freqs.min()
    self.assertAlmostEqual(lowest, 0.03846, places=4)

    # Values at the two known term indices.
    lamb_freq = freqs[self.idx_lamb]
    self.assertAlmostEqual(lamb_freq, 0.1923, places=4)
    child_freq = freqs[self.idx_child]
    self.assertAlmostEqual(child_freq, 0.07692, places=4)
def test_get_term_freqs(self):
    """Check un-normalized term frequencies computed from ``self.doc_term_matrix``."""
    dtm = self.doc_term_matrix
    counts = vsm.get_term_freqs(dtm, normalized=False)

    # One frequency per column of the matrix.
    self.assertEqual(len(counts), dtm.shape[1])

    # Smallest and largest values on the fixture data.
    smallest = counts.min()
    self.assertEqual(smallest, 1)
    largest = counts.max()
    self.assertEqual(largest, 5)

    # Values at the two known term indices.
    lamb_count = counts[self.idx_lamb]
    self.assertEqual(lamb_count, 5)
    child_count = counts[self.idx_child]
    self.assertEqual(child_count, 2)
def test_get_term_freqs(vectorizer_and_dtm, lamb_and_child_idxs):
    """Check 'linear' term frequencies on the fixture doc-term matrix.

    Uses the ``vectorizer_and_dtm`` and ``lamb_and_child_idxs`` fixtures;
    only the doc-term matrix of the former is needed here.
    """
    _vectorizer, dtm = vectorizer_and_dtm
    lamb_idx, child_idx = lamb_and_child_idxs

    counts = vsm.get_term_freqs(dtm, type_="linear")

    # One frequency per column of the matrix.
    n_terms = dtm.shape[1]
    assert len(counts) == n_terms

    # Smallest and largest values on the fixture data.
    smallest = counts.min()
    assert smallest == 1
    largest = counts.max()
    assert largest == 5

    # Values at the two known term indices.
    assert counts[lamb_idx] == 5
    assert counts[child_idx] == 2
def test_get_term_freqs_normalized(vectorizer_and_dtm, lamb_and_child_idxs):
    """Check normalized term frequencies on the fixture doc-term matrix.

    Uses the ``vectorizer_and_dtm`` and ``lamb_and_child_idxs`` fixtures;
    only the doc-term matrix of the former is needed here.
    """
    _vectorizer, dtm = vectorizer_and_dtm
    lamb_idx, child_idx = lamb_and_child_idxs
    tol = 1e-3

    freqs = vsm.get_term_freqs(dtm, normalized=True)

    # One frequency per column of the matrix.
    n_terms = dtm.shape[1]
    assert len(freqs) == n_terms

    # Extremes of the normalized values, within the absolute tolerance.
    assert freqs.max() == pytest.approx(0.15625, abs=tol)
    assert freqs.min() == pytest.approx(0.03125, abs=tol)

    # Values at the two known term indices.
    assert freqs[lamb_idx] == pytest.approx(0.15625, abs=tol)
    assert freqs[child_idx] == pytest.approx(0.06250, abs=tol)
def test_get_term_freqs_exception():
    """Check that ``vsm.get_term_freqs`` raises ValueError for an empty 1x1 matrix."""
    # A 1x1 sparse matrix built from a shape tuple only, converted to CSR.
    empty_dtm = coo_matrix((1, 1)).tocsr()
    with pytest.raises(ValueError):
        vsm.get_term_freqs(empty_dtm)