# Elapsed time of the step that started above this section (t0 is set there).
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
# The rest of the script relies on X_train being sparse per sp.issparse.
assert sp.issparse(X_train)
y_train = news_train.target

# Fit an SGDClassifier on the training matrix with the settings below.
print("Training a linear classifier...")
parameters = {
    'loss': 'hinge',
    'penalty': 'l2',
    'n_iter': 50,
    'alpha': 0.00001,
    'fit_intercept': True,
}
# Single-argument print(...) emits the same text under the Python 2 print
# statement as the original two-item form did.
print("%s %s" % ("parameters:", parameters))
t0 = time()
clf = SGDClassifier(**parameters).fit(X_train, y_train)
print("done in %fs" % (time() - t0))
# Fraction of entries of clf.coef_ that are non-zero, as a percentage.
print("Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100))

print("Loading 20 newsgroups test set... ")
# Start the clock *before* loading: starting it after load_mlcomp() returned
# made the "done in" line below always report roughly zero seconds.
t0 = time()
news_test = load_mlcomp('20news-18828', 'test')
print("done in %fs" % (time() - t0))

print("Predicting the labels of the test set...")
print("%d documents" % len(news_test.filenames))
print("%d categories" % len(news_test.target_names))

print("Extracting features from the dataset using the same vectorizer")
t0 = time()
# The generator hands the content of each test file to the vectorizer one
# document at a time instead of building a list of all documents first.
X_test = vectorizer.transform((open(f).read() for f in news_test.filenames))
return score, train_time, test_time for clf, name in ((RidgeClassifier(tol=1e-1), "Ridge Classifier"), (NeighborsClassifier(n_neighbors=10), "kNN")): print 80*'=' print name results = benchmark(clf) for penalty in ["l2", "l1"]: print 80 * '=' print "%s penalty" % penalty.upper() # Train Liblinear model liblinear_results = benchmark(LinearSVC(loss='l2', penalty=penalty, C=1000, dual=False, tol=1e-3)) # Train SGD model sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty)) # Train SGD with Elastic Net penalty print 80 * '=' print "Elastic-Net penalty" sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")) # Train sparse Naive Bayes classifiers print 80 * '=' print "Naive Bayes" mnnb_results = benchmark(MultinomialNB(alpha=.01)) bnb_result = benchmark(BernoulliNB(alpha=.01))
# Announce which categories are requested, then load the training subset.
print("Loading 20 newsgroups dataset for categories:")
print(categories)

data = load_20newsgroups(subset='train', categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print("")  # blank separator line (same output as a bare Python 2 `print`)

################################################################################
# Chain a text feature extractor (counts, then tf-idf weighting) with a
# simple classifier; the step names are the prefixes used in `parameters`.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
]
pipeline = Pipeline(pipeline_steps)

# Candidate values, keyed as '<step name>__<parameter name>'.
# Uncommenting more entries explores more settings, but processing time
# grows combinatorially with every additional entry.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__analyzer__max_n': (1, 2),  # words or bigrams
    # 'tfidf__use_idf': (True, False),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__n_iter': (10, 50, 80),
}

# find the best parameters for both the feature extraction and the
if print_cm: print "confusion matrix:" print metrics.confusion_matrix(y_test, pred) print return score, train_time, test_time for clf, name in ((RidgeClassifier(), "Ridge Classifier"), ): print 80 * '=' print name results = benchmark(clf) for penalty in ["l2", "l1"]: print 80 * '=' print "%s penalty" % penalty.upper() # Train Liblinear model liblinear_results = benchmark( LinearSVC(loss='l2', penalty=penalty, C=1000, dual=False, tol=1e-3)) # Train SGD model sgd_results = benchmark( SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty)) # Train SGD with Elastic Net penalty print 80 * '=' print "Elastic-Net penalty" sgd_results = benchmark( SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"))