for clf, name in ((RidgeClassifier(tol=1e-1), "Ridge Classifier"), (KNeighborsClassifier(n_neighbors=10), "kNN")): print 80 * '=' print name results = benchmark(clf) for penalty in ["l2", "l1"]: print 80 * '=' print "%s penalty" % penalty.upper() # Train Liblinear model liblinear_results = benchmark( LinearSVC(loss='l2', penalty=penalty, C=1000, dual=False, tol=1e-3)) # Train SGD model sgd_results = benchmark( SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty)) # Train SGD with Elastic Net penalty print 80 * '=' print "Elastic-Net penalty" sgd_results = benchmark( SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")) # Train sparse Naive Bayes classifiers print 80 * '=' print "Naive Bayes" mnnb_results = benchmark(MultinomialNB(alpha=.01)) bnb_result = benchmark(BernoulliNB(alpha=.01)) class L1LinearSVC(LinearSVC):
for clf, name in ((RidgeClassifier(tol=1e-1), "Ridge Classifier"), (KNeighborsClassifier(n_neighbors=10), "kNN")): print 80 * '=' print name results = benchmark(clf) for penalty in ["l2", "l1"]: print 80 * '=' print "%s penalty" % penalty.upper() # Train Liblinear model liblinear_results = benchmark(LinearSVC(loss='l2', penalty=penalty, C=1000, dual=False, tol=1e-3)) # Train SGD model sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty)) # Train SGD with Elastic Net penalty print 80 * '=' print "Elastic-Net penalty" sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")) # Train sparse Naive Bayes classifiers print 80 * '=' print "Naive Bayes" mnnb_results = benchmark(MultinomialNB(alpha=.01)) bnb_result = benchmark(BernoulliNB(alpha=.01)) class L1LinearSVC(LinearSVC):
# Load the 20 newsgroups training subset for the selected categories, then
# define the text-classification pipeline and the grid-search parameter space.
# NOTE(review): this chunk arrived collapsed onto a single source line and has
# been re-indented.  print(...) prints identically under Python 2 and also
# runs under Python 3.
print("Loading 20 newsgroups dataset for categories:")
print(categories)

data = fetch_20newsgroups(subset='train', categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print("")

###############################################################################
# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

parameters = {
    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    # NOTE(review): 'analyzer__max_n' is an old CountVectorizer parameter
    # (modern scikit-learn uses 'ngram_range') -- confirm against the
    # installed version.
    'vect__analyzer__max_n': (1, 2),  # words or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__n_iter': (10, 50, 80),
}
def find_best_sgd(**params):
    """Build a grid search over SGDClassifier regularization settings.

    Parameters
    ----------
    **params : dict
        Keyword arguments forwarded verbatim to ``SGDClassifier``.

    Returns
    -------
    GridSearchCV
        An unfitted grid search over ``alpha`` and ``rho``; call ``fit``
        on the result to run the search.
    """
    # NOTE(review): 'rho' was renamed 'l1_ratio' in later scikit-learn
    # releases -- confirm the installed version still accepts it before
    # changing the key.
    search_space = {
        'alpha': [0.0001, 0.0005, 0.001],
        'rho': [0.80, 0.85, 0.95],
    }
    return GridSearchCV(SGDClassifier(**params), search_space)