def fit(self, X, y):
    """Fit an L1-penalised ``LinearSVC`` first, then fit this estimator.

    The auxiliary L1 model is stored on ``self.transformer_`` and its
    ``fit_transform`` output replaces ``X`` before ``LinearSVC.fit`` is
    invoked on ``self``.

    Returns whatever ``LinearSVC.fit`` returns.
    """
    # A smaller C means stronger regularization, and stronger
    # regularization means a sparser solution.
    selector = LinearSVC(penalty="l1", C=1000, tol=1e-3, dual=False)
    self.transformer_ = selector
    X_reduced = selector.fit_transform(X, y)
    return LinearSVC.fit(self, X_reduced, y)
def test_dense_vectorizer_pipeline_grid_selection():
    """Grid-search a CountVectorizer + LinearSVC pipeline on the toy corpus.

    Every document except the first and the last is used for fitting; those
    two held-out documents are then predicted and compared with their labels.
    """
    # Raw documents.
    corpus = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # Wrap the documents in iterators to simulate generic iterables.
    train_iter = iter(corpus[1:-1])
    test_iter = iter([corpus[0], corpus[-1]])

    # Junk food is labelled -1 (the first six entries), the others +1.
    labels = np.ones(len(corpus))
    labels[:6] = -1
    labels_train = labels[1:-1]
    labels_test = np.array([labels[0], labels[-1]])

    steps = [('vect', CountVectorizer()), ('svc', LinearSVC())]
    param_grid = {
        'vect__analyzer__max_n': (1, 2),
        'svc__loss': ('l1', 'l2'),
    }

    # Search jointly over the feature-extraction and classifier parameters.
    search = GridSearchCV(Pipeline(steps), param_grid, n_jobs=1)

    # Cross-validation needs to know the length of the data, so the
    # iterators are materialised into lists before being handed over.
    train_docs = list(train_iter)
    fitted = search.fit(train_docs, labels_train)
    test_docs = list(test_iter)
    predicted = fitted.predict(test_docs)
    assert_array_equal(predicted, labels_test)

    # The search is expected to reach a perfect score on this toy dataset,
    # and the retained vectorizer is expected to use max_n == 1.
    assert_equal(search.best_score, 1.0)
    chosen_vectorizer = search.best_estimator.named_steps['vect']
    assert_equal(chosen_vectorizer.analyzer.max_n, 1)
def train(labeled_featuresets, C=1e5): """ :param labeled_featuresets: A list of classified featuresets, i.e., a list of tuples ``(featureset, label)``. """ feat = [featureset for featureset, label in labeled_featuresets] feature_vectorizer = MVectorizer.DictsVectorizer() X = feature_vectorizer.fit_transform(feat) X = Normalizer().fit_transform(X) label_set = set( [label for featureset, label in labeled_featuresets] ) label_vectorizer = dict( [(label,num) for num,label in enumerate(label_set)] ) y = numpy.array([label_vectorizer[label] for featureset, label in labeled_featuresets]) print "Training on %d examples with %d features..."%(X.shape[0],X.shape[1]), classifier = OneVsRestClassifier(LinearSVC(loss='l2', penalty='l2', dual=True, tol=1e-5, C=C, scale_C=True)) classifier.fit(X,y) print "done" return scikit_classifier(feature_vectorizer,label_vectorizer,classifier)
print return score, train_time, test_time for clf, name in ((RidgeClassifier(tol=1e-1), "Ridge Classifier"), (KNeighborsClassifier(n_neighbors=10), "kNN")): print 80 * '=' print name results = benchmark(clf) for penalty in ["l2", "l1"]: print 80 * '=' print "%s penalty" % penalty.upper() # Train Liblinear model liblinear_results = benchmark( LinearSVC(loss='l2', penalty=penalty, C=1000, dual=False, tol=1e-3)) # Train SGD model sgd_results = benchmark( SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty)) # Train SGD with Elastic Net penalty print 80 * '=' print "Elastic-Net penalty" sgd_results = benchmark( SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")) # Train sparse Naive Bayes classifiers print 80 * '=' print "Naive Bayes" mnnb_results = benchmark(MultinomialNB(alpha=.01))
from preprocess import get_clf, load_data, preprocess_data from sklearn.metrics import classification_report from sklearn.cross_validation import KFold, LeaveOneOut from sklearn.grid_search import GridSearchCV if __name__ == '__main__': filename = 'inf-all-labeled.txt' X, y = load_data(filename) n = len(X) scores = np.empty((5, 2, 2), dtype=np.float) best_C = np.empty((5, 2, 2), dtype=np.float) for i, ngrams in enumerate((2, 3, 4, 5, 6)): for j, suffix in enumerate(('', '$')): for k, binarize in enumerate((True, False)): print "ngrams=%d, suffix=%s, binarize=%s" % (ngrams, suffix, binarize) X_new = preprocess_data(X, n=ngrams, suffix=suffix, binarize=binarize) grid = GridSearchCV( estimator=LinearSVC(), n_jobs=4, verbose=False, param_grid={'C': (0.01, 0.03, 0.1, 0.3, 1, 1.3)}, cv=LeaveOneOut(n, indices=True)) grid.fit(X_new, y) scores[i, j, k] = grid.best_score best_C[i, j, k] = grid.best_estimator.C
print # # Feature selection for the L1 dataset # select_chi2 = 1000 # print ("Extracting %d best features by a chi-squared test" % select_chi2) # t0 = time() # ch2 = SelectKBest(chi2, k = select_chi2) # X_L1 = ch2.fit_transform(X_L1, y_L1) # print "Done in %fs" % (time() - t0) # print "L1: n_samples: %d, n_features: %d" % X_L1.shape # print # Train L1 classifier print "Training L1 Classifier..." t0 = time() clf = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3) print clf clf.fit(X_L1, y_L1) train_time = time() - t0 print "Train time: %0.3fs" % train_time print # Train L2 classifiers print "Training L2 Classifiers..." t0 = time() # comment out all linearSVC # clf_ca = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3) # clf_collect = LinearSVC(loss='l2', penalty='l2', C=256, dual=False, tol=1e-2) # clf_cookies = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3) # clf_share = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
print metrics.confusion_matrix(y_test, pred) print return score, train_time, test_time for clf, name in ((RidgeClassifier(tol=1e-1), "Ridge Classifier"), (KNeighborsClassifier(n_neighbors=10), "kNN")): print 80 * '=' print name results = benchmark(clf) for penalty in ["l2", "l1"]: print 80 * '=' print "%s penalty" % penalty.upper() # Train Liblinear model liblinear_results = benchmark(LinearSVC(loss='l2', penalty=penalty, C=1000, dual=False, tol=1e-3)) # Train SGD model sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty)) # Train SGD with Elastic Net penalty print 80 * '=' print "Elastic-Net penalty" sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")) # Train sparse Naive Bayes classifiers print 80 * '=' print "Naive Bayes" mnnb_results = benchmark(MultinomialNB(alpha=.01))
categories = ['HUM', 'LOC', 'NUM', 'ENTY', 'DESC', 'ABBR'] train = load_files('coarse/', categories=categories, shuffle=True, random_state=42) # save train pickle filehandler = open('pickle_training_coarse.pkl', 'wb') pickle.dump(train, filehandler) filehandler.close() text_clf = Pipeline([ ('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', LinearSVC()), ]) _ = text_clf.fit(train.data, train.target) # save text_clf pickle filehandler = open('pickle_clf_coarse.pkl', 'wb') pickle.dump(text_clf, filehandler) filehandler.close() #new = ['Where is the Amazon river located?', # 'Where can I get a good sandwhich', # 'In what state was Columbus born?', # 'What is the best cheese?'] text = """
def find_best_lsvc(**params):
    """Return a ``GridSearchCV`` over ``C`` wrapping a ``LinearSVC``.

    The keyword arguments are forwarded unchanged to ``LinearSVC``. The
    returned search object is not fitted here; the caller is expected to
    call ``fit`` on it.
    """
    base_estimator = LinearSVC(**params)
    # Candidate values for C, spanning 0.01 to 1000 in powers of ten.
    c_grid = dict(C=[0.01, 0.1, 1, 10, 100, 1000])
    return GridSearchCV(base_estimator, c_grid)
for i, n in enumerate((2, 3, 4, 5, 6)): for j, suffix in enumerate(('', '$')): for k, binarize in enumerate((True, False)): print "%d-%d-%d out of 411" % (i, j, k) X_sg_p, v_sg = preprocess.preprocess_data(X_sg, suffix=suffix, n=n, return_vect=True, binarize=binarize) X_pl_p, v_pl = preprocess.preprocess_data(X_pl, suffix=suffix, n=n, return_vect=True, binarize=binarize) grid1 = GridSearchCV(estimator=LinearSVC(), n_jobs=-1, verbose=True, param_grid={'C': np.logspace(-2, 2, 5)}, cv=KFold(len(X_sg), k=10, indices=True)) grid1.fit(X_sg_p, y_sg) scores_sg[i, j, k] = grid1.best_score best_C_sg = grid1.best_estimator.C clf = grid1.best_estimator X_sg_n_p = v_sg.transform(X_sg_n) y_sg_n = clf.predict(X_sg_n_p) predict_sg[i, j, k] = (y_sg_n == 0).mean() grid2 = GridSearchCV(estimator=LinearSVC(), n_jobs=-1,
return unicode_content.lower() def __repr__(self): return "LowerCasePreprocessor()" analyzer1 = CharNGramAnalyzer( min_n=1, max_n=3, preprocessor=LowerCasePreprocessor(), ) # Build a vectorizer / classifier pipeline using the previous analyzer clf = Pipeline([ ('vec', CountVectorizer(analyzer=analyzer1)), ('tfidf', TfidfTransformer(use_idf=False)), ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)), ]) # Fit the pipeline on the training set clf.fit(twenty_train.data,twenty_train.target) # Predict the outcome on the testing set y_predicted = clf.predict(doc_test) # Predict the result on some short new sentences: sentences = [ u'This is a language detection test.', u'Ceci est un test de d\xe9tection de la langue.', u'Dies ist ein Test, um die Sprache zu erkennen.', ]