def test_dense_tf_idf():
    """Fit a dense linear SVC on hashed TF-IDF rows and check two held-out rows.

    Both corpora are fed to one ``HashingVectorizer``; the first and last
    TF-IDF rows are kept out of training and used as the prediction checks.
    """
    vectorizer = HashingVectorizer(dim=1000, probes=3)
    for corpus in (JUNK_FOOD_DOCS, NOTJUNK_FOOD_DOCS):
        vectorizer.vectorize(corpus)

    # pull out the accumulated TF-IDF matrix
    tfidf = vectorizer.get_tfidf()
    assert_equal(tfidf.shape, (11, 1000))

    # junk food rows (the first six) are labelled -1, everything else +1
    labels = np.ones(tfidf.shape[0])
    labels[:6] = -1

    # train on the interior rows only, then predict the two excluded rows
    train_rows, train_labels = tfidf[1:-1], labels[1:-1]
    clf = DenseLinearSVC(C=10).fit(train_rows, train_labels)
    assert_equal(clf.predict([tfidf[0]]), [-1])
    assert_equal(clf.predict([tfidf[-1]]), [1])
def test_dense_sparse_idf_sanity():
    """Check that dense and sparse hashing vectorizers yield matching TF-IDF.

    The comparison is made after each corpus is vectorized, so both the
    initial estimate and the incrementally updated one are covered.
    """
    dense_vectorizer = HashingVectorizer(dim=100, probes=3)
    sparse_vectorizer = SparseHashingVectorizer(dim=100, probes=3)

    # first pass checks the running TF-IDF estimates agree; second pass
    # checks the incremental behaviour stays the same
    for corpus in (JUNK_FOOD_DOCS, NOTJUNK_FOOD_DOCS):
        dense_vectorizer.vectorize(corpus)
        sparse_vectorizer.vectorize(corpus)

        from_dense = dense_vectorizer.get_tfidf()
        from_sparse = sparse_vectorizer.get_tfidf().todense()
        assert_array_almost_equal(from_dense, from_sparse)