def Train(self, colNames, nValidation, labels, values, fout=None, callback=None):
    '''
    Train a SVM model using optimized C and Gamma parameters and a training set.

    The training data (labels, values) is first converted to SVM format, the
    best C/gamma pair for an RBF-kernel C-SVM is located by grid search
    (nValidation-fold when nValidation > 1, otherwise the search's default
    fold count), and the final ANOVA-filter + SVC pipeline is fitted and
    stored on self.model.
    '''
    # Bring the supplied problem into SVM format before anything else.
    self.TranslateTrainingSet(labels, values)

    # Locate the best C and gamma for C-SVM classification via grid search;
    # only forward nValidation when the caller asked for real cross-validation.
    if nValidation > 1:
        bestC, bestGamma = self.ParameterGridSearch(callback, nValidation)
    else:
        bestC, bestGamma = self.ParameterGridSearch(callback)

    # Build and fit the final classifier with the parameters found above:
    # an ANOVA percentile feature filter feeding an RBF-kernel SVC.
    anova_step = ('anova', feature_selection.SelectPercentile(
        feature_selection.f_classif, percentile=self.percentile))
    svc_step = ('svc', SVC(kernel='rbf', C=bestC, gamma=bestGamma, tol=0.1))
    self.model = Pipeline([anova_step, svc_step])
    self.model.fit(self.svm_train_values, self.svm_train_labels)
def train_svpipe(trainX, trainY, params):
    """
    Fit a single-step Pipeline wrapping an SVC (default RBF kernel) on the
    given training data and return the fitted pipeline.

    Any keyword arguments in `params` are forwarded verbatim to the
    pipeline's fit call.
    """
    pipeline = Pipeline([('rbfsvm', SVC())])
    fitted = pipeline.fit(trainX, trainY, **params)
    return fitted
def bench_scikit(X, Y):
    """
    Time one fit+predict cycle of an RBF-kernel SVC (scikit-learn's libsvm
    bindings) on (X, Y) and append the elapsed seconds to the module-level
    `scikit_results` list.
    """
    import scikits.learn
    from scikits.learn.svm import SVC

    # Collect garbage up front so allocation noise does not pollute the timing.
    gc.collect()

    start = datetime.now()
    classifier = SVC(kernel='rbf')
    classifier.fit(X, Y).predict(X)
    elapsed = datetime.now() - start

    # Convert the timedelta to seconds (mu_second is presumably 1e6 — the
    # microseconds-per-second factor; defined elsewhere in the file).
    scikit_results.append(elapsed.seconds + elapsed.microseconds / mu_second)
def bench_scikit(X, Y):
    """
    Benchmark scikit-learn's libsvm-backed SVC: fit on (X, Y), predict the
    same X, and record the wall-clock duration in `scikit_results`.
    """
    import scikits.learn
    from scikits.learn.svm import SVC

    gc.collect()  # minimize GC interference with the measurement

    t_begin = datetime.now()
    svc_model = SVC(kernel='rbf')
    svc_model.fit(X, Y).predict(X)
    t_delta = datetime.now() - t_begin

    # seconds + fractional part; mu_second is the module-level
    # microseconds-per-second conversion factor.
    duration = t_delta.seconds + t_delta.microseconds / mu_second
    scikit_results.append(duration)
def test_SVMModelField():
    """
    Round-trip an SVC classifier through the SVM document model: fit,
    persist, reload, and check the reloaded classifier predicts the same.
    """
    features = [[0, 0], [1, 1]]
    targets = [0, 1]

    # Train a small classifier and remember its prediction.
    document = SVM()
    estimator = SVC()
    estimator.fit(features, targets)
    before = estimator.predict([[2., 2.]])

    # Persist the classifier inside the document model.
    document.classifier = estimator
    document.save(safe=True)

    # Reload from the store and verify the prediction is unchanged.
    stored = SVM.objects.first()
    after = stored.classifier.predict([[2., 2.]])
    assert before == after
def ParameterGridSearch(self, callback=None, nValidation=5):
    '''
    Grid search for the best C and gamma parameters for the RBF kernel.

    Each candidate pair is scored by nValidation-fold stratified
    cross-validation over the already-translated training data
    (self.svm_train_values / self.svm_train_labels). Returns the
    (bestC, bestGamma) pair with the highest cross-validation precision.
    '''
    from scikits.learn.grid_search import GridSearchCV
    from scikits.learn.metrics import precision_score
    from scikits.learn.cross_val import StratifiedKFold

    # XXX: program crashes with >1 worker when running cpa.py, but not when
    # running from classifier.py (reason unknown), so parallel search via
    # multiprocessing.cpu_count() is disabled and a single worker is used.
    n_workers = 1

    # Exponential grids for C and gamma (powers of two), the standard
    # search ranges for RBF C-SVM.
    search_grid = {
        'C': 2**np.arange(-5, 11, 2, dtype=float),
        'gamma': 2**np.arange(3, -11, -2, dtype=float)
    }
    searcher = GridSearchCV(SVC(kernel='rbf'), search_grid,
                            n_jobs=n_workers, score_func=precision_score)
    searcher.fit(self.svm_train_values, self.svm_train_labels,
                 cv=StratifiedKFold(self.svm_train_labels, nValidation))

    # The winning entry is the one with the maximum cross-validation score;
    # each grid_scores_ entry is (params_dict, score).
    winner = max(searcher.grid_scores_, key=lambda entry: entry[1])
    bestC = winner[0]['C']
    bestGamma = winner[0]['gamma']
    logging.info('Optimal values: C=%s g=%s rate=%s' %
                 (bestC, bestGamma, winner[1]))
    return bestC, bestGamma
def do_grid_search(X, Y, gs_params=None):
    """
    Cross-validated grid search over an RBF-kernel SVC pipeline on (X, Y).

    `gs_params` maps pipeline parameter names (step 'rbfsvm') to candidate
    values; when omitted or empty, a default C/gamma grid is used. Logs the
    winning parameters and score, and returns the best parameter dict.
    """
    pipeline = Pipeline([('rbfsvm', SVC())])

    # Fall back to a default search grid when none was supplied.
    if not gs_params:
        gs_params = {
            'rbfsvm__C': (1.5, 2, 5, 10, 20),
            'rbfsvm__gamma': (0.01, 0.1, 0.3, 0.6, 1, 1.5, 2, 5),
        }

    search = GridSearchCV(pipeline, gs_params, n_jobs=-1)
    search = search.fit(X, Y)

    # grid_scores_ holds (params, score) pairs; keep the highest-scoring one.
    best_parameters, score = max(search.grid_scores_, key=lambda pair: pair[1])
    logger.info("best_parameters: " + str(best_parameters))
    logger.info("expected score: " + str(score))
    return best_parameters
############################################################################## # Loading a dataset iris = datasets.load_iris() X = iris.data y = iris.target n_classes = np.unique(y).size # Some noisy data not correlated random = np.random.RandomState(seed=0) E = random.normal(size=(len(X), 2200)) # Add noisy data to the informative features for make the task harder X = np.c_[X, E] svm = SVC(kernel='linear') cv = StratifiedKFold(y, 2) score, permutation_scores, pvalue = permutation_test_score(svm, X, y, zero_one_score, cv=cv, n_permutations=100, n_jobs=1) print "Classification score %s (pvalue : %s)" % (score, pvalue) ############################################################################### # View histogram of permutation scores pl.hist(permutation_scores, label='Permutation scores')
# along with this program. If not, see <http://www.gnu.org/licenses/>. from __future__ import division import os import logging import pickle import numpy as np from scikits.learn.svm import SVC from string import punctuation from operator import itemgetter logging.basicConfig(level=logging.DEBUG) lab_train, vec_train, lab_test, vec_test = [ pickle.load(open(file)) for file in [ 'labels_training.pik', 'vectors_training.pik', 'labels_test.pik', 'vectors_test.pik' ] ] logging.info("Data loaded") cat_train = list(set(lab_train)) cat_test = list(set(lab_test)) assert cat_test == cat_train lab_train = [cat_train.index(l) for l in lab_train] lab_test = [cat_test.index(l) for l in lab_test] clf = SVC(kernel='rbf') clf.fit(vec_train, lab_train) pickle.dump(clf, open('classifier.pik', 'wb'))
from scikits.learn import datasets

iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features for visualization
y = iris.target

n_features = X.shape[1]

C = 1.0  # shared regularization strength for all classifiers below

# Create different classifiers. The logistic regression cannot do
# multiclass out of the box.
classifiers = {
    'L1 logistic': LogisticRegression(C=C, penalty='l1'),
    'L2 logistic': LogisticRegression(C=C, penalty='l2'),
    'Linear SVC': SVC(kernel='linear', C=C, probability=True),
}
n_classifiers = len(classifiers)

pl.figure(figsize=(3*2, n_classifiers*2))
pl.subplots_adjust(bottom=.2, top=.95)

# Fit each classifier on the full dataset and report its training accuracy.
for index, (name, classifier) in enumerate(classifiers.iteritems()):
    classifier.fit(X, y)

    y_pred = classifier.predict(X)
    classif_rate = np.mean(y_pred.ravel() == y.ravel()) * 100
    print "classif_rate for %s : %f " % (name, classif_rate)

    # View probabilities
    # NOTE(review): the loop body appears truncated at this chunk boundary —
    # the probability-plotting code presumably continues in the next chunk.
print "Extracting the top %d eigenfaces" % n_components
# PCA with the fast randomized SVD solver of this scikits.learn version.
pca = PCA(n_comp=n_components, do_fast_svd=True).fit(X_train)

# Reshape the principal components back into 64x64 face images
# (assumes the input images are 64x64 — TODO confirm upstream).
eigenfaces = pca.components_.T.reshape((n_components, 64, 64))

# project the input data on the eigenfaces orthonormal basis
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

################################################################################
# Train a SVM classification model

print "Fitting the classifier to the training set"
# class_weight="auto" rebalances the classes during fit (old fit-time API).
clf = SVC(C=100).fit(X_train_pca, y_train, class_weight="auto")

################################################################################
# Quantitative evaluation of the model quality on the test set

y_pred = clf.predict(X_test_pca)
print classification_report(y_test, y_pred, labels=selected_target,
                            class_names=category_names[selected_target])
print confusion_matrix(y_test, y_pred, labels=selected_target)

################################################################################
# Qualitative evaluation of the predictions using matplotlib
from scikits.learn.svm import SVC
from string import punctuation
from operator import itemgetter

logging.basicConfig(level=logging.DEBUG)

# Load the pickled label/vector pairs for the training and test splits.
# (Loop variable renamed from `file` — it shadowed the builtin.)
lab_train, vec_train, lab_test, vec_test = [
    pickle.load(open(path)) for path in [
        'labels_training.pik', 'vectors_training.pik',
        'labels_test.pik', 'vectors_test.pik'
    ]
]
logging.info("Data loaded")

# Build the category vocabularies; both splits must expose the same classes.
cat_train = list(set(lab_train))
cat_test = list(set(lab_test))
assert cat_test == cat_train

# Map string labels to integer class indices.
lab_train = [cat_train.index(l) for l in lab_train]
lab_test = [cat_test.index(l) for l in lab_test]

# Train an RBF-kernel SVM on the training vectors and persist it.
clf = SVC(kernel='rbf')
clf.fit(vec_train, lab_train)

pickle.dump(clf, open('classifier.pik', 'wb'))
print "Projecting the input data on the eigenfaces orthonormal basis"
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print "done in %0.3fs" % (time() - t0)

################################################################################
# Train a SVM classification model

print "Fitting the classifier to the training set"
t0 = time()
# Exhaustive grid over C and gamma for the RBF kernel; class_weight='auto'
# is forwarded to each fit to rebalance the (presumably skewed) classes.
param_grid = {
    'C': [1, 5, 10, 50, 100],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'})
clf = clf.fit(X_train_pca, y_train)
print "done in %0.3fs" % (time() - t0)
print "Best estimator found by grid search:"
print clf.best_estimator

################################################################################
# Quantitative evaluation of the model quality on the test set

print "Predicting the people names on the testing set"
t0 = time()
y_pred = clf.predict(X_test_pca)
print "done in %0.3fs" % (time() - t0)
# Split into train/test folds; `train`/`test` index arrays come from a
# cross-validation splitter defined earlier in the file.
X_train, X_test = X[train], X[test]
y_train, y_test = y[train], y[test]

# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150

pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
# Components reshaped to image dimensions h x w (defined earlier) for display.
eigenfaces = pca.components_.reshape((n_components, h, w))

X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

# Train a SVM classification model: grid search over C/gamma for the RBF
# kernel, with class rebalancing forwarded to each fit.
param_grid = dict(C=[1, 5, 10, 50, 100],
                  gamma=[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1])
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'}, verbose=1)
clf = clf.fit(X_train_pca, y_train)
print clf.best_estimator

# Quantitative evaluation of the model quality on the test set
from scikits.learn import metrics
y_pred = clf.predict(X_test_pca)
print metrics.classification_report(y_test, y_pred, target_names=target_names)
print metrics.confusion_matrix(y_test, y_pred, labels=range(len(target_names)))

# Plot the results
import pylab as pl
# Two candidate families: an RBF kernel with its own gamma grid, and a
# linear kernel searched over C only.
tuned_parameters = [{
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4],
    'C': [1, 10, 100, 1000]
}, {
    'kernel': ['linear'],
    'C': [1, 10, 100, 1000]
}]

# Run the search once per scoring function, so the model is tuned separately
# for precision and for recall.
scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]

for score_name, score_func in scores:
    # `X`, `y`, `train`, `test` are defined earlier in the file.
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_true, y_pred = y[test], clf.predict(X[test])

    print "Classification report for the best estimator: "
    print clf.best_estimator
    print "Tuned for '%s' with optimal value: %0.3f" % (
        score_name, score_func(y_true, y_pred))
    print classification_report(y_true, y_pred)
    print "Grid scores:"
    pprint(clf.grid_scores_)
    print

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality
import numpy as np X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) y = np.array([1, 1, 2, 2]) from scikits.learn.svm import SVC clf = SVC() clf.fit(X, y) print clf.predict([[-0.8, -1]])
def XValidate(self, nPermutations):
    '''
    Estimate classification accuracy by repeated cross-validation.

    Finds optimal C/gamma by grid search on the whole training set, then runs
    nPermutations rounds of 4/5-vs-1/5 cross-validation over random subset
    splits, recording every misclassified prediction per object. Returns a
    list (one entry per training object) of the wrong labels predicted for
    that object, or None if the training set could not be updated. Progress
    is shown via wx dialogs; pressing Cancel raises StopCalculating.
    '''
    # Make sure all data is available in the training set
    if not self.classifier.UpdateTrainingSet():
        return

    # Progress callback: updates the dialog and aborts on Cancel.
    def cb(frac):
        cont, skip = dlg.Update(int(frac * 100.), '%d%% Complete' % (frac * 100.))
        if not cont:  # Cancel was pressed
            dlg.Destroy()
            raise StopCalculating()

    dlg = wx.ProgressDialog(
        'Performing grid search for optimal parameters...', '0% Complete', 100,
        self.classifier, wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME |
        wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)

    # Cross validation parameters: 5 groups, train on 4, test on 1.
    totalGroups = 5
    trainingGroups = 4

    # Convert the training set into SVM format and search for optimal parameters
    # C and gamma using 5-fold cross-validation
    logging.info(
        'Performing grid search for parameters C and gamma on entire training set...'
    )
    self.TranslateTrainingSet(self.classifier.trainingSet.label_matrix,
                              self.classifier.trainingSet.values)
    C, gamma = self.ParameterGridSearch(callback=cb)
    dlg.Destroy()
    logging.info(
        'Grid search completed. Found optimal C=%d and gamma=%f.'
        % (C, gamma))

    # Build the classifier with the tuned parameters and prepare per-object
    # misclassification storage.
    classifier = Pipeline([
        ('anova', feature_selection.SelectPercentile(feature_selection.f_classif,
                                                     percentile=self.percentile)),
        ('svc', SVC(kernel='rbf', C=C, gamma=gamma, eps=0.1))
    ])
    nObjects = self.classifier.trainingSet.label_matrix.shape[0]
    # NOTE(review): np.ceil returns a float; it is later used as a slice
    # bound, which only works on Python 2 / old numpy — confirm if porting.
    subsetSize = np.ceil(nObjects / float(totalGroups))
    indices = np.arange(nObjects)
    misclassifications = [[] for i in range(nObjects)]

    # All C(5,4) combinations of training groups, materialized as structured
    # records (dtype 'i,i,i,i') so membership tests work on the tuples.
    dt = ','.join('i' * trainingGroups)
    trainingTotalGroups = list(
        np.fromiter(combinations(range(totalGroups), trainingGroups),
                    dtype=dt, count=-1))
    #trainingTotalGroups = list(combinations(range(totalGroups), trainingGroups))
    allLabels = np.array(self.svm_train_labels)
    allValues = np.array(self.svm_train_values)

    # For all permutations of the subsets train the classifier on 4 totalGroups and
    # classify the remaining group for a number of random subsets
    logging.info('Calculating average classification accuracy %d times over a ' \
                 '%0.1f%%/%0.1f%% cross-validation process' % \
                 (nPermutations, trainingGroups/float(totalGroups)*100, \
                  (1-trainingGroups/float(totalGroups))*100))
    dlg = wx.ProgressDialog(
        'Calculating average cross-validation accuracy...', '0% Complete', 100,
        self.classifier, wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME |
        wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)
    nTrainingTotalGroups = len(trainingTotalGroups)
    nOperations = float(nPermutations * nTrainingTotalGroups)
    for per in range(nPermutations):
        # Split the training set into subsets: shuffle, then cut into 4 equal
        # groups plus a remainder group.
        np.random.shuffle(indices)
        lastGroupStart = (totalGroups - 1) * subsetSize
        subsets = np.hsplit(indices[0:lastGroupStart], (totalGroups - 1))
        subsets.append(indices[lastGroupStart:], )

        for index, group in enumerate(trainingTotalGroups):
            # Retrieve indices of all objects in the training set
            trainingSet = np.hstack(
                [subsets[i] for i in range(totalGroups) if i in group])

            # Train a classifier on the subset
            classifier.fit(allValues[trainingSet], allLabels[trainingSet])

            # Predict the test set using the trained classifier
            testSet = np.hstack(
                [subsets[i] for i in range(totalGroups) if i not in group])
            testLabels = classifier.predict(allValues[testSet])

            # Store all misclassifications (list comprehension used purely
            # for its append side effect).
            [misclassifications[testSet[i]].append(testLabels[i]) \
                for i in range(len(testLabels)) \
                if testLabels[i] != allLabels[testSet][i]]

            # Update progress dialog
            cb((nTrainingTotalGroups * per + index) / nOperations)

    # Calculate average classification accuracy: 1 minus the fraction of all
    # predictions (nObjects * nPermutations) that were misclassified.
    dlg.Destroy()
    logging.info('Average Classification Accuracy: %f%%' % \
                 ((1-len([item for sublist in misclassifications for item in sublist]) /\
                   float(nObjects * nPermutations))*100))
    return misclassifications
################################################################################
# Loading the Digits dataset
digits = datasets.load_digits()

# To apply an classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

################################################################################
# Create the RFE object and compute a cross-validated score, compared to an
# unvariate feature selection
# (A stray merge-conflict marker '<<<<<<< HEAD' was removed here; the HEAD
# side of the hunk is the content that follows.)
rfe = RFE(estimator=SVC(kernel="linear", C=1), n_features=10, percentage=0.1)
anova_filter = UnivariateFilter(SelectKBest(k=10), f_classif)
clf = SVC(kernel="linear", C=1)

# Collect predictions from the RFE-selected and univariate-selected models.
y_pred_rfe = []
y_pred_univ = []
y_true = []
for train, test in StratifiedKFold(y, 2):
    Xtrain, ytrain, Xtest, ytest = X[train], y[train], X[test], y[test]

    ### Fit and predict rfe
    support = rfe.fit(X[train], y[train]).support_
    y_pred_rfe.append(clf.fit(X[train, support], y[train]).predict(
        X[test, support]))
    # NOTE(review): the loop body appears to continue in a later chunk
    # (univariate branch and y_true collection).
from scikits.learn.svm import SVC
from scikits.learn import datasets
from scikits.learn.feature_selection import RFE

################################################################################
# Loading the Digits dataset
digits = datasets.load_digits()

# To apply an classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

################################################################################
# Create the RFE object and compute a cross-validated score
linear_svc = SVC(kernel="linear", C=1)
selector = RFE(estimator=linear_svc, n_features=1, percentage=0.1)
selector.fit(X, y)

# Fold the per-pixel ranking back into the original image shape for display.
ranking_image = selector.ranking_.reshape(digits.images[0].shape)

import pylab as pl
pl.matshow(ranking_image)
pl.colorbar()
pl.title('Ranking of pixels with RFE')
pl.show()
        # NOTE(review): this chunk begins mid-expression — the enclosing
        # method (apparently a GridSearchCV-style fit) starts before this
        # view. 'orignal_params' looks like a typo for 'original_params',
        # but it must match the parameter name at the (unseen) call target.
                delayed(fit_grid_point)(X, y, klass, orignal_params,
                                        clf_params, cv, self.loss_func,
                                        **self.fit_params)
                for clf_params in grid)

        # Out is a list of pairs: estimator, score
        key = lambda pair: pair[1]
        # Lowest loss wins; expose the winner and its predict method.
        best_estimator = min(out, key=key)[0]
        self.best_estimator = best_estimator
        self.predict = best_estimator.predict
        return self


if __name__ == '__main__':
    # Demo: grid-search a linear SVC on the iris dataset using a custom
    # zero-one loss, then predict a single point.
    from scikits.learn.svm import SVC
    from scikits.learn import datasets
    iris = datasets.load_iris()

    # Add the noisy data to the informative features
    X = iris.data
    y = iris.target

    svc = SVC(kernel='linear')

    def loss_func(y1, y2):
        return np.mean(y1 != y2)

    clf = GridSearchCV(svc, {'C': [1, 10]}, loss_func, n_jobs=2)
    print clf.fit(X, y).predict([[-0.8, -1]])
################################################################################ # Loading the Digits dataset digits = datasets.load_digits() # To apply an classifier on this data, we need to flatten the image, to # turn the data in a (samples, feature) matrix: n_samples = len(digits.images) X = digits.images.reshape((n_samples, -1)) y = digits.target ################################################################################ # Create the RFE object and compute a cross-validated score, compared to an # unvariate feature selection svc = SVC(kernel="linear", C=1) anova_filter = UnivariateFilter(SelectKBest(k=10), f_classif) clf = SVC(kernel="linear",C=1) <<<<<<< REMOTE ======= y_pred_rfe = [] >>>>>>> LOCAL <<<<<<< REMOTE import pylab as pl ======= y_pred_univ = [] >>>>>>> LOCAL <<<<<<< REMOTE pl.matshow(image_support_) =======