def test_sparse_tf_idf():
    """Hash the junk-food / not-junk-food corpora into sparse TF-IDF
    vectors and check that a linear SVM trained on the interior rows
    correctly classifies the two rows held out at either end."""
    vectorizer = SparseHashingVectorizer(dim=1000000, probes=3)
    vectorizer.vectorize(JUNK_FOOD_DOCS)
    vectorizer.vectorize(NOTJUNK_FOOD_DOCS)

    # pull the TF-IDF matrix accumulated by the two vectorize() calls
    tfidf = vectorizer.get_tfidf()
    assert_equal(tfidf.shape, (11, 1000000))

    # first 6 rows are junk food (-1), the remaining rows are +1
    labels = np.ones(tfidf.shape[0])
    labels[:6] = -1

    # fit on everything except the first and last rows, then predict them
    classifier = SVC(kernel='linear', C=10).fit(tfidf[1:-1], labels[1:-1])
    assert_equal(classifier.predict(tfidf[0, :]), [-1])
    assert_equal(classifier.predict(tfidf[-1, :]), [1])
# Inspect the training data loaded above; assumes news_train.data is a
# scipy sparse matrix of hashed features -- TODO confirm against loader.
print "news_train.data is sparse: ",
print sp.issparse(news_train.data)

# The documents have been hashed into TF-IDF (Term Frequencies times Inverse
# Document Frequencies) vectors of a fixed dimension.
print "n_samples: %d, n_features: %d" % news_train.data.shape

print "Training a linear SVM (hinge loss and L2 regularizer)..."
parameters = {
    'kernel': 'linear',
    'C': 10,
}
print "parameters:", parameters
t0 = time()  # wall-clock timer around the fit
clf = SVC(**parameters).fit(news_train.data, news_train.target)
print "done in %fs" % (time() - t0)
# fraction of non-zero entries in the learned coefficient matrix, reported
# as a percentage (sparsity of the linear model)
print "Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100)

print "Loading 20 newsgroups test set... "
t0 = time()
news_test = load_mlcomp('20news-18828', 'test', sparse=True)
print "done in %fs" % (time() - t0)

print "Predicting the labels of the test set..."
t0 = time()
pred = clf.predict(news_test.data)
print "done in %fs" % (time() - t0)
# mean of exact label matches, scaled to a percentage before formatting
print "Classification accuracy: %f" % (np.mean(pred == news_test.target) * 100)

# confusion matrix of true vs predicted labels; presumably displayed or
# summarized further down in the file -- not visible in this chunk
cm = confusion_matrix(news_test.target, pred)