def test_sparse_tf_idf():
    """Check TF-IDF features from the sparse hashing vectorizer end to end.

    Both toy corpora are fed to one ``SparseHashingVectorizer``, the
    resulting TF-IDF matrix is checked for its expected shape, and a sparse
    linear SVM trained on all rows but the first and the last must classify
    those two held-out rows correctly.
    """
    vectorizer = SparseHashingVectorizer(dim=1000000, probes=3)
    for corpus in (JUNK_FOOD_DOCS, NOTJUNK_FOOD_DOCS):
        vectorizer.vectorize(corpus)

    # extract the TF-IDF data
    tfidf = vectorizer.get_tfidf()
    assert_equal(tfidf.shape, (11, 1000000))

    # label junk food as -1, the others as +1
    # (the junk food corpus is vectorized first; presumably it fills the
    # first 6 rows -- confirm against JUNK_FOOD_DOCS)
    labels = np.ones(tfidf.shape[0])
    labels[:6] = -1

    # train on the inner rows, keeping the first and last rows for testing
    train_features = tfidf[1:-1]
    train_labels = labels[1:-1]
    classifier = SparseLinearSVC(C=10).fit(train_features, train_labels)

    assert_equal(classifier.predict(tfidf[0, :]), [-1])
    assert_equal(classifier.predict(tfidf[-1, :]), [1])
class SVM:
    """Text classifier: a ``CountVectorizer`` feeding a fitted ``LinearSVC``.

    The model is trained once, at construction time, from three stored
    inputs read with ``load`` (presumably ``numpy.load`` -- confirm against
    the file's imports).
    """

    def __init__(self, training, classes, vocabulary):
        """Load the stored data and fit the classifier.

        Parameters
        ----------
        training : argument for ``load``
            Source of the training samples; the loaded object is converted
            with ``.tolist()`` before fitting.
        classes : argument for ``load``
            Source of the target labels for the training samples.
        vocabulary : argument for ``load``
            Source of the fixed vocabulary; converted with ``.tolist()`` and
            handed to ``CountVectorizer``.
        """
        vocabulary_terms = load(vocabulary).tolist()
        self.cv = CountVectorizer(vocabulary=vocabulary_terms)

        self.samples = load(training).tolist()
        self.classes = load(classes)

        # Fit eagerly so that classify() can be called right away.
        self.classifier = LinearSVC()
        self.classifier.fit(self.samples, self.classes)

    def classify(self, text):
        """Return the predicted class for a single piece of ``text``."""
        # The vectorizer works on a collection of documents, so the text is
        # wrapped in a one-element list and the single result is unwrapped.
        document_features = self.cv.transform([text])
        predictions = self.classifier.predict(document_features)
        return predictions[0]
print("done in %fs" % (time() - t0))

# ---------------------------------------------------------------------------
# Evaluate the fitted classifier on the test set.
# ---------------------------------------------------------------------------
print("Predicting the labels of the test set...")
print("%d documents" % len(news_test.filenames))
print("%d categories" % len(news_test.target_names))

# Reuse the vectorizer that produced the training features so that the test
# matrix has the same columns; documents are read lazily, one file at a time.
print("Extracting features from the dataset using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(open(f).read() for f in news_test.filenames)
y_test = news_test.target
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_test.shape)

print("Predicting the outcomes of the testing set")
t0 = time()
pred = clf.predict(X_test)
print("done in %fs" % (time() - t0))

# An unresolved merge conflict used to sit here and made the script
# unparseable. It is resolved in favour of the incoming change: a single
# classification report instead of three separate precision / recall /
# f1_score prints. NOTE(review): confirm classification_report is imported
# at the top of the file.
print("Classification report on test set:")
print(classification_report(y_test, pred,
                            class_names=news_test.target_names))

cm = confusion_matrix(y_test, pred)
print("Confusion matrix:")
'C': 10, 'dual': False, 'eps': 1e-4, } print "parameters:", parameters t0 = time() clf = LinearSVC(**parameters).fit(news_train.data, news_train.target) print "done in %fs" % (time() - t0) print "Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100) print "Loading 20 newsgroups test set... " t0 = time() news_test = load_mlcomp('20news-18828', 'test', sparse=True) print "done in %fs" % (time() - t0) print "Predicting the labels of the test set..." t0 = time() pred = clf.predict(news_test.data) print "done in %fs" % (time() - t0) print "Classification accuracy: %f" % (np.mean(pred == news_test.target) * 100) cm = confusion_matrix(news_test.target, pred) print "Confusion matrix:" print cm # Show confusion matrix pl.matshow(cm) pl.title('Confusion matrix') pl.colorbar() pl.show()
print("done in %fs" % (time() - t0))

# ---------------------------------------------------------------------------
# Evaluate the fitted classifier on the test set.
# ---------------------------------------------------------------------------
print("Predicting the labels of the test set...")
print("%d documents" % len(news_test.filenames))
print("%d categories" % len(news_test.target_names))

# The same vectorizer is applied to the test documents; each file is read
# lazily as the vectorizer consumes the generator.
print("Extracting features from the dataset using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(open(f).read() for f in news_test.filenames)
y_test = news_test.target
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_test.shape)

print("Predicting the outcomes of the testing set")
t0 = time()
pred = clf.predict(X_test)
print("done in %fs" % (time() - t0))

# Print the classifier itself, a blank line, then the report.
print("Classification report on test set for classifier:")
print(clf)
print("")
print(classification_report(y_test, pred,
                            class_names=news_test.target_names))

cm = confusion_matrix(y_test, pred)
print("Confusion matrix:")
print(cm)

# Show confusion matrix
pl.matshow(cm)
pl.title("Confusion matrix")
pl.colorbar()