def Train(self, colNames, nValidation, labels, values, fout=None,
          callback=None):
        '''
        Fit the final classifier on the supplied training set.

        The labels and values are first converted through
        TranslateTrainingSet, then C and gamma are chosen by
        ParameterGridSearch, and finally an ANOVA feature-selection + RBF
        SVC pipeline is fitted and stored in self.model.

        nValidation is forwarded to the grid search only when it is greater
        than 1; otherwise the grid search uses its own default.
        colNames and fout are accepted but not used by this method.
        '''
        # Bring the supplied problem into the SVM representation
        self.TranslateTrainingSet(labels, values)

        # Grid-search C and gamma for C-SVM classification
        search_args = [callback]
        if nValidation > 1:
            search_args.append(nValidation)
        best_c, best_gamma = self.ParameterGridSearch(*search_args)

        # Build and fit the final pipeline with the selected parameters
        selector = feature_selection.SelectPercentile(
            feature_selection.f_classif, percentile=self.percentile)
        svc = SVC(kernel='rbf', C=best_c, gamma=best_gamma, tol=0.1)
        self.model = Pipeline([('anova', selector), ('svc', svc)])
        self.model.fit(self.svm_train_values, self.svm_train_labels)
def train_svpipe(trainX, trainY, params):
    """ Fit a one-step pipeline (step name 'rbfsvm') wrapping an SVC.

        The entries of params are unpacked as extra keyword arguments
        to the pipeline's fit call; the value returned by fit is returned.
        """
    pipeline = Pipeline([('rbfsvm', SVC())])
    return pipeline.fit(trainX, trainY, **params)
Example #3
0
def bench_scikit(X, Y):
    """
    Time an RBF-kernel SVC fit followed by a predict on the same data.

    The measurement (whole seconds plus microseconds divided by the
    module-level ``mu_second``) is appended to ``scikit_results``.
    """
    import scikits.learn
    from scikits.learn.svm import SVC

    # Collect garbage before the timed section starts
    gc.collect()

    started = datetime.now()
    classifier = SVC(kernel='rbf')
    classifier.fit(X, Y).predict(X)
    elapsed = datetime.now() - started

    timing = elapsed.seconds + elapsed.microseconds / mu_second
    scikit_results.append(timing)
Example #4
0
def bench_scikit(X, Y):
    """
    Benchmark the scikit-learn libsvm bindings: fit an RBF SVC and predict.

    Appends the measured duration (seconds + microseconds / ``mu_second``)
    to the module-level ``scikit_results`` list.
    """
    import scikits.learn
    from scikits.learn.svm import SVC

    # Run a garbage collection so it does not land inside the timed region
    gc.collect()

    t_begin = datetime.now()
    model = SVC(kernel='rbf')
    model.fit(X, Y).predict(X)
    duration = datetime.now() - t_begin

    measured = duration.seconds + duration.microseconds/mu_second
    scikit_results.append(measured)
def test_SVMModelField():
    """Store a fitted SVC on an SVM document, reload it, and compare predictions."""
    samples = [[0, 0], [1, 1]]
    targets = [0, 1]
    probe = [[2., 2.]]

    document = SVM()
    fitted = SVC()
    fitted.fit(samples, targets)
    expected = fitted.predict(probe)

    # Persist the classifier through the document's classifier field
    document.classifier = fitted
    document.save(safe=True)

    # Read the first stored document back and predict with its classifier
    reloaded = SVM.objects.first()
    actual = reloaded.classifier.predict(probe)

    assert expected == actual
    def ParameterGridSearch(self, callback=None, nValidation=5):
        '''
        Grid search for the best C and gamma parameters of the RBF kernel.

        Each (C, gamma) pair is scored with precision_score using a
        stratified nValidation-fold split of the stored training data
        (self.svm_train_values / self.svm_train_labels).

        The search currently runs with a single worker (see the note
        below). callback is accepted but not used by this method.

        Returns the tuple (bestC, bestGamma).
        '''
        from scikits.learn.grid_search import GridSearchCV
        from scikits.learn.metrics import precision_score
        from scikits.learn.cross_val import StratifiedKFold

        # XXX: program crashes with >1 worker when running cpa.py, but not
        #      when running from classifier.py (reason unknown), so the
        #      worker count is pinned to 1 instead of the CPU count.
        n_workers = 1

        # Candidate values: powers of two for both C and gamma
        c_candidates = 2**np.arange(-5, 11, 2, dtype=float)
        gamma_candidates = 2**np.arange(3, -11, -2, dtype=float)
        search = GridSearchCV(SVC(kernel='rbf'),
                              {'C': c_candidates, 'gamma': gamma_candidates},
                              n_jobs=n_workers,
                              score_func=precision_score)

        folds = StratifiedKFold(self.svm_train_labels, nValidation)
        search.fit(self.svm_train_values, self.svm_train_labels, cv=folds)

        # The winning grid point is the one with the highest score (index 1)
        winner = max(search.grid_scores_, key=lambda entry: entry[1])
        winning_params = winner[0]
        bestC = winning_params['C']
        bestGamma = winning_params['gamma']
        logging.info('Optimal values: C=%s g=%s rate=%s' %
                     (bestC, bestGamma, winner[1]))
        return bestC, bestGamma
def do_grid_search(X, Y, gs_params=None):
    """ Grid-search a one-step SVC pipeline (step name 'rbfsvm') on (X, Y).

        gs_params maps pipeline parameter names to candidate values; when it
        is missing or empty a built-in grid over C and gamma is used.
        Logs and returns the parameters of the highest-scoring grid point.
        """
    default_grid = {
        'rbfsvm__C': (1.5, 2, 5, 10, 20),
        'rbfsvm__gamma': (0.01, 0.1, 0.3, 0.6, 1, 1.5, 2, 5),
    }
    pipeline = Pipeline([('rbfsvm', SVC())])

    # n_jobs=-1 is passed straight through to GridSearchCV
    search = GridSearchCV(pipeline, gs_params or default_grid, n_jobs=-1)
    search = search.fit(X, Y)

    # Each grid score entry is unpacked as (parameters, score)
    winner = max(search.grid_scores_, key=lambda entry: entry[1])
    best_parameters, score = winner
    logger.info("best_parameters: " + str(best_parameters))
    logger.info("expected score: " + str(score))

    return best_parameters
##############################################################################
# Loading a dataset
# Load the iris dataset and keep its samples and targets
iris = datasets.load_iris()
X = iris.data
y = iris.target
n_classes = np.unique(y).size

# Generate 2200 columns of random noise, uncorrelated with the targets
# (fixed seed so the run is reproducible)
random = np.random.RandomState(seed=0)
E = random.normal(size=(len(X), 2200))

# Append the noise columns to the informative features to make the task harder
X = np.c_[X, E]

# Linear SVM evaluated with a 2-fold stratified split
svm = SVC(kernel='linear')
cv = StratifiedKFold(y, 2)

# Score the classifier and compare against 100 label permutations
score, permutation_scores, pvalue = permutation_test_score(svm,
                                                           X,
                                                           y,
                                                           zero_one_score,
                                                           cv=cv,
                                                           n_permutations=100,
                                                           n_jobs=1)

print "Classification score %s (pvalue : %s)" % (score, pvalue)

###############################################################################
# View histogram of permutation scores
pl.hist(permutation_scores, label='Permutation scores')
Example #9
0
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import division
import os
import logging
import pickle
import numpy as np
from scikits.learn.svm import SVC
from string import punctuation
from operator import itemgetter

logging.basicConfig(level=logging.DEBUG)


def _load_pickle(path):
    """Unpickle one object from *path* and close the file afterwards."""
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # from a trusted source. The default open mode of the original script is
    # kept; switch to 'rb' if the inputs were written in binary mode.
    with open(path) as handle:
        return pickle.load(handle)


# Load the training/test labels and vectors (one pickle file each)
lab_train, vec_train, lab_test, vec_test = [
    _load_pickle(name) for name in [
        'labels_training.pik', 'vectors_training.pik', 'labels_test.pik',
        'vectors_test.pik'
    ]
]
logging.info("Data loaded")

cat_train = list(set(lab_train))
cat_test = list(set(lab_test))
# Compare as sets: two lists built from equal sets are not guaranteed to
# come out in the same order, so a list comparison could fail spuriously.
assert set(cat_test) == set(cat_train)

# Use one shared category -> integer mapping for both splits so that the
# same category always gets the same integer label.
cat_index = dict((cat, i) for i, cat in enumerate(cat_train))
lab_train = [cat_index[l] for l in lab_train]
lab_test = [cat_index[l] for l in lab_test]

# Fit an RBF-kernel SVC on the training vectors
clf = SVC(kernel='rbf')
clf.fit(vec_train, lab_train)

# Persist the fitted classifier, making sure the file is flushed and closed
with open('classifier.pik', 'wb') as out:
    pickle.dump(clf, out)
Example #10
0
from scikits.learn import datasets

# Load iris and keep only two feature columns
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features for visualization
y = iris.target

n_features = X.shape[1]

# Shared C value passed to every classifier below
C = 1.0

# Create different classifiers. The logistic regression cannot do
# multiclass out of the box.
classifiers = {
                'L1 logistic': LogisticRegression(C=C, penalty='l1'),
                'L2 logistic': LogisticRegression(C=C, penalty='l2'),
                'Linear SVC': SVC(kernel='linear', C=C, probability=True),
              }

n_classifiers = len(classifiers)

# Figure height scales with the number of classifiers
pl.figure(figsize=(3*2, n_classifiers*2))
pl.subplots_adjust(bottom=.2, top=.95)

for index, (name, classifier) in enumerate(classifiers.iteritems()):
    classifier.fit(X, y)

    # Accuracy on the training data itself, as a percentage
    y_pred = classifier.predict(X)
    classif_rate = np.mean(y_pred.ravel() == y.ravel()) * 100
    print  "classif_rate for %s : %f " % (name, classif_rate)

    # View probabilities=
# Fit a PCA with n_components components on the training data
print "Extracting the top %d eigenfaces" % n_components
pca = PCA(n_comp=n_components, do_fast_svd=True).fit(X_train)

# Reshape the transposed components into n_components images of 64x64
eigenfaces = pca.components_.T.reshape((n_components, 64, 64))

# project the input data on the eigenfaces orthonormal basis
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)


################################################################################
# Train a SVM classification model

print "Fitting the classifier to the training set"
# class_weight is passed as a fit-time argument here
clf = SVC(C=100).fit(X_train_pca, y_train, class_weight="auto")


################################################################################
# Quantitative evaluation of the model quality on the test set

y_pred = clf.predict(X_test_pca)
print classification_report(y_test, y_pred, labels=selected_target,
                            class_names=category_names[selected_target])

print confusion_matrix(y_test, y_pred, labels=selected_target)


################################################################################
# Qualitative evaluation of the predictions using matplotlib
from scikits.learn.svm import SVC
from string import punctuation
from operator import itemgetter

logging.basicConfig(level=logging.DEBUG)


def _load_pickle(path):
    """Unpickle one object from *path* and close the file afterwards."""
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # from a trusted source. The default open mode of the original script is
    # kept; switch to 'rb' if the inputs were written in binary mode.
    with open(path) as handle:
        return pickle.load(handle)


# Load the training/test labels and vectors (one pickle file each)
lab_train, vec_train, lab_test, vec_test = [
    _load_pickle(name)
    for name
    in ['labels_training.pik',
        'vectors_training.pik',
        'labels_test.pik',
        'vectors_test.pik']]
logging.info("Data loaded")

cat_train = list(set(lab_train))
cat_test = list(set(lab_test))
# Compare as sets: two lists built from equal sets are not guaranteed to
# come out in the same order, so a list comparison could fail spuriously.
assert set(cat_test) == set(cat_train)

# Use one shared category -> integer mapping for both splits so that the
# same category always gets the same integer label.
cat_index = dict((cat, i) for i, cat in enumerate(cat_train))
lab_train = [cat_index[l] for l in lab_train]
lab_test = [cat_index[l] for l in lab_test]

# Fit an RBF-kernel SVC on the training vectors
clf = SVC(kernel='rbf')
clf.fit(vec_train, lab_train)

# Persist the fitted classifier, making sure the file is flushed and closed
with open('classifier.pik', 'wb') as out:
    pickle.dump(clf, out)






Example #13
0
# Transform both splits with the already-fitted PCA and report the time taken
print "Projecting the input data on the eigenfaces orthonormal basis"
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print "done in %0.3fs" % (time() - t0)

################################################################################
# Train a SVM classification model

print "Fitting the classifier to the training set"
t0 = time()
# Candidate values searched for the RBF SVC
param_grid = {
    'C': [1, 5, 10, 50, 100],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# class_weight is supplied through fit_params rather than the constructor
clf = GridSearchCV(SVC(kernel='rbf'),
                   param_grid,
                   fit_params={'class_weight': 'auto'})
clf = clf.fit(X_train_pca, y_train)
print "done in %0.3fs" % (time() - t0)
print "Best estimator found by grid search:"
print clf.best_estimator

################################################################################
# Quantitative evaluation of the model quality on the test set

print "Predicting the people names on the testing set"
t0 = time()
y_pred = clf.predict(X_test_pca)
print "done in %0.3fs" % (time() - t0)
Example #14
0
# Split samples and targets using the train/test index sets
X_train, X_test = X[train], X[test]
y_train, y_test = y[train], y[test]

# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 150
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
# One image of h x w per component
eigenfaces = pca.components_.reshape((n_components, h, w))

# Project both splits with the PCA fitted on the training split
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

# Train a SVM classification model
param_grid = dict(C=[1, 5, 10, 50, 100],
                  gamma=[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1])
# class_weight is supplied through fit_params rather than the constructor
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'},
                   verbose=1)
clf = clf.fit(X_train_pca, y_train)
print clf.best_estimator

# Quantitative evaluation of the model quality on the test set
from scikits.learn import metrics
y_pred = clf.predict(X_test_pca)
print metrics.classification_report(y_test, y_pred, target_names=target_names)
print metrics.confusion_matrix(y_test, y_pred,
                               labels=range(len(target_names)))


# Plot the results
import pylab as pl
Example #15
0
# Two parameter grids: one for the RBF kernel (gamma and C) and one for the
# linear kernel (C only)
tuned_parameters = [{
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4],
    'C': [1, 10, 100, 1000]
}, {
    'kernel': ['linear'],
    'C': [1, 10, 100, 1000]
}]

# (display name, scoring function) pairs; one grid search is run per pair
scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]

for score_name, score_func in scores:
    # Search on the training split with 5-fold stratified cross-validation
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    # Evaluate on the held-out test split
    y_true, y_pred = y[test], clf.predict(X[test])

    print "Classification report for the best estimator: "
    print clf.best_estimator
    print "Tuned for '%s' with optimal value: %0.3f" % (
        score_name, score_func(y_true, y_pred))
    print classification_report(y_true, y_pred)
    print "Grid scores:"
    pprint(clf.grid_scores_)
    print

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality
Example #16
0
File: 2.py Project: Yinhai/HandReco
# Minimal SVC example: four 2-D samples in two classes
import numpy as np
X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])
from scikits.learn.svm import SVC
# Fit with default parameters and predict the class of one new sample
clf = SVC()
clf.fit(X, y)
print clf.predict([[-0.8, -1]])
    def XValidate(self, nPermutations):
        '''
        Estimate classification accuracy by repeated 5-group cross-validation.

        C and gamma are first chosen with ParameterGridSearch on the whole
        training set. Then, nPermutations times, the training set is
        shuffled and split into 5 groups; for every choice of 4 groups the
        pipeline is fitted on them and the remaining group is predicted.

        Returns a list with one entry per training object; each entry is
        the list of wrong labels predicted for that object. Returns None
        when self.classifier.UpdateTrainingSet() reports failure.

        Raises StopCalculating when the user cancels a progress dialog.
        '''
        # Make sure all data is available in the training set
        if not self.classifier.UpdateTrainingSet():
            return

        # Progress callback; `dlg` is resolved at call time, so the same
        # callback serves both progress dialogs created below.
        def cb(frac):
            cont, skip = dlg.Update(int(frac * 100.),
                                    '%d%% Complete' % (frac * 100.))
            if not cont:  # Cancel was pressed
                dlg.Destroy()
                raise StopCalculating()

        dlg = wx.ProgressDialog(
            'Performing grid search for optimal parameters...', '0% Complete',
            100, self.classifier, wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME
            | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)

        # Define cross validation parameters
        totalGroups = 5
        trainingGroups = 4

        # Convert the training set into SVM format and search for optimal parameters
        # C and gamma using 5-fold cross-validation
        logging.info(
            'Performing grid search for parameters C and gamma on entire training set...'
        )
        self.TranslateTrainingSet(self.classifier.trainingSet.label_matrix,
                                  self.classifier.trainingSet.values)
        C, gamma = self.ParameterGridSearch(callback=cb)
        dlg.Destroy()
        # %g instead of %d: C is a float and may be below 1, which %d would
        # have reported as 0.
        logging.info(
            'Grid search completed. Found optimal C=%g and gamma=%f.' %
            (C, gamma))

        # Create the classifier and initialize misclassification storage
        # NOTE(review): the SVC tolerance is passed as `eps` here; confirm
        # the keyword matches the installed library version.
        classifier = Pipeline([
            ('anova',
             feature_selection.SelectPercentile(feature_selection.f_classif,
                                                percentile=self.percentile)),
            ('svc', SVC(kernel='rbf', C=C, gamma=gamma, eps=0.1))
        ])
        nObjects = self.classifier.trainingSet.label_matrix.shape[0]
        # np.ceil returns a float; convert to int because the value is used
        # as a slice bound below.
        subsetSize = int(np.ceil(nObjects / float(totalGroups)))
        indices = np.arange(nObjects)
        misclassifications = [[] for i in range(nObjects)]

        # Create group combinations and arrays of all labels and values
        dt = ','.join('i' * trainingGroups)
        trainingTotalGroups = list(
            np.fromiter(combinations(range(totalGroups), trainingGroups),
                        dtype=dt,
                        count=-1))
        #trainingTotalGroups = list(combinations(range(totalGroups), trainingGroups))
        allLabels = np.array(self.svm_train_labels)
        allValues = np.array(self.svm_train_values)

        # For all permutations of the subsets train the classifier on 4 totalGroups and
        # classify the remaining group for a number of random subsets
        logging.info('Calculating average classification accuracy %d times over a ' \
                     '%0.1f%%/%0.1f%% cross-validation process' % \
                     (nPermutations, trainingGroups/float(totalGroups)*100, \
                     (1-trainingGroups/float(totalGroups))*100))
        dlg = wx.ProgressDialog(
            'Calculating average cross-validation accuracy...', '0% Complete',
            100, self.classifier, wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME
            | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)
        nTrainingTotalGroups = len(trainingTotalGroups)
        nOperations = float(nPermutations * nTrainingTotalGroups)
        # The first (totalGroups - 1) groups have subsetSize elements each;
        # the last group takes whatever remains.
        lastGroupStart = (totalGroups - 1) * subsetSize
        for per in range(nPermutations):
            # Split the training set into subsets
            np.random.shuffle(indices)
            subsets = np.hsplit(indices[0:lastGroupStart], (totalGroups - 1))
            subsets.append(indices[lastGroupStart:], )

            for index, group in enumerate(trainingTotalGroups):
                # Retrieve indices of all objects in the training set
                trainingSet = np.hstack(
                    [subsets[i] for i in range(totalGroups) if i in group])

                # Train a classifier on the subset
                classifier.fit(allValues[trainingSet], allLabels[trainingSet])

                # Predict the test set using the trained classifier
                testSet = np.hstack(
                    [subsets[i] for i in range(totalGroups) if i not in group])
                testLabels = classifier.predict(allValues[testSet])

                # Store all misclassifications (the true labels of the test
                # set are looked up once instead of once per element)
                expectedLabels = allLabels[testSet]
                for i, predicted in enumerate(testLabels):
                    if predicted != expectedLabels[i]:
                        misclassifications[testSet[i]].append(predicted)

                # Update progress dialog
                cb((nTrainingTotalGroups * per + index) / nOperations)

        # Calculate average classification accuracy
        dlg.Destroy()
        nMisclassified = sum(len(wrong) for wrong in misclassifications)
        logging.info('Average Classification Accuracy: %f%%' % \
                     ((1 - nMisclassified / float(nObjects * nPermutations)) * 100))

        return misclassifications
################################################################################
# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, features) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

################################################################################
# Create the RFE object and compute a cross-validated score, compared to an
# unvariate feature selection

<<<<<<< HEAD
# Recursive feature elimination around a linear SVC, plus a univariate
# (ANOVA, k=10) filter and a separate linear SVC used for prediction
rfe = RFE(estimator = SVC(kernel="linear",C=1), n_features = 10, percentage =
0.1)
anova_filter = UnivariateFilter(SelectKBest(k=10), f_classif)
clf = SVC(kernel="linear",C=1)

# Accumulators for per-fold predictions and true targets
y_pred_rfe = []
y_pred_univ = []
y_true = []
for train, test in StratifiedKFold(y, 2):
    Xtrain, ytrain, Xtest, ytest = X[train], y[train], X[test], y[test]

    ### Fit and predict rfe
    # Fit RFE on the training fold, then fit/predict the SVC using only the
    # features RFE marked in its support_ attribute
    support = rfe.fit(X[train], y[train]).support_
    y_pred_rfe.append(clf.fit(X[train,support],y[train]).predict(
          X[test,support]))
Example #19
0
from scikits.learn.svm import SVC
from scikits.learn import datasets
from scikits.learn.feature_selection import RFE

################################################################################
# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, features) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

################################################################################
# Create the RFE object and rank every pixel

svc = SVC(kernel="linear", C=1)
rfe = RFE(estimator=svc, n_features=1, percentage=0.1)
rfe.fit(X, y)

# Give the per-feature ranking the shape of one digit image
image_ranking_ = rfe.ranking_.reshape(digits.images[0].shape)

import pylab as pl

# Show the ranking as an image with a colour bar
pl.matshow(image_ranking_)
pl.colorbar()
pl.title('Ranking of pixels with RFE')
pl.show()
Example #20
0
            delayed(fit_grid_point)(X, y, klass, orignal_params, clf_params,
                                    cv, self.loss_func, **self.fit_params)
            for clf_params in grid)

        # Out is a list of pairs: estimator, score
        key = lambda pair: pair[1]
        best_estimator = min(out, key=key)[0]

        self.best_estimator = best_estimator
        self.predict = best_estimator.predict

        return self


# Small demo, run only when the module is executed as a script
if __name__ == '__main__':
    from scikits.learn.svm import SVC
    from scikits.learn import datasets
    iris = datasets.load_iris()

    # Use the iris samples and targets as they are
    X = iris.data
    y = iris.target

    svc = SVC(kernel='linear')

    def loss_func(y1, y2):
        # Fraction of positions where the two label arrays differ
        return np.mean(y1 != y2)

    # Search over two values of C using the loss function above
    clf = GridSearchCV(svc, {'C': [1, 10]}, loss_func, n_jobs=2)
    # NOTE(review): the probe sample has two features; confirm this matches
    # the number of columns in iris.data.
    print clf.fit(X, y).predict([[-0.8, -1]])

################################################################################
# Loading the Digits dataset
digits = datasets.load_digits()

# To apply a classifier on this data, we need to flatten the images, to
# turn the data into a (samples, features) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

################################################################################
# Create the estimators: a linear SVC, a univariate (ANOVA, k=10) filter,
# and a second linear SVC
svc = SVC(kernel="linear", C=1)
anova_filter = UnivariateFilter(SelectKBest(k=10), f_classif)
clf = SVC(kernel="linear",C=1)
<<<<<<< REMOTE

=======
y_pred_rfe = []
>>>>>>> LOCAL
<<<<<<< REMOTE
import pylab as pl
=======
y_pred_univ = []
>>>>>>> LOCAL
<<<<<<< REMOTE
pl.matshow(image_support_)
=======