def Train(self, colNames, nValidation, labels, values, fout=None,
          callback=None):
        '''
        Fit the final SVM classifier and store it in self.model.

        Steps: convert the supplied labels/values with TranslateTrainingSet,
        obtain C and gamma from ParameterGridSearch, then fit an
        ANOVA-percentile feature selector followed by an RBF-kernel SVC.

        colNames and fout are accepted but not used by this method.
        nValidation is forwarded to the grid search only when it is > 1.
        callback is passed through to ParameterGridSearch.
        '''
        # The problem has to be in SVM format before anything else happens
        self.TranslateTrainingSet(labels, values)

        # Grid-search C and gamma for C-SVM classification; nValidation is
        # only handed over when it is larger than 1, otherwise the search is
        # called without that argument
        if nValidation > 1:
            search_args = (callback, nValidation)
        else:
            search_args = (callback, )
        C, gamma = self.ParameterGridSearch(*search_args)

        # Assemble the final classifier from the tuned parameters and fit it
        # on the stored SVM-format training data
        selector = feature_selection.SelectPercentile(
            feature_selection.f_classif, percentile=self.percentile)
        svc = SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1)
        self.model = Pipeline([('anova', selector), ('svc', svc)])
        self.model.fit(self.svm_train_values, self.svm_train_labels)
# Example no. 2
# 0
import pylab as pl
from scikits.learn import svm, datasets, feature_selection, cross_val
from scikits.learn.pipeline import Pipeline

# ------------------------------------------------------------------------------
# Load the digits dataset and flatten every sample into a single feature row
digits = datasets.load_digits()
y = digits.target
n_samples = len(y)
X = digits.data.reshape(n_samples, -1)

# ------------------------------------------------------------------------------
# Build the estimator: a percentile-based ANOVA feature selection step chained
# with a support vector classifier

transform = feature_selection.SelectPercentile(feature_selection.f_classif)

clf = Pipeline([transform], svm.SVC())

# ------------------------------------------------------------------------------
# Collect mean and standard deviation of the cross-validation score for each
# percentile of retained features (inputs for a later plot)
score_means, score_stds = [], []
percentiles = tuple(range(10, 101, 10))

for percentile in percentiles:
    # Reconfigure the selector in place; clf holds the same object
    transform._set_params(percentile=percentile)
    this_scores = cross_val.cross_val_score(clf, X, y)
    fold_mean, fold_std = this_scores.mean(), this_scores.std()
    score_means.append(fold_mean)
    score_stds.append(fold_std)
# NOTE(review): this fragment starts mid-example. `A`, `mem`, `ridge`, `cv`,
# `size`, `X`, `y`, `WardAgglomeration` and `GridSearchCV` are not defined in
# the visible part of the file and must come from earlier code/imports.

# Ward agglomeration followed by the `ridge` estimator
ward = WardAgglomeration(n_clusters=10,
                         connectivity=A,
                         memory=mem,
                         n_components=1)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Candidate values for the 'ward' step's n_clusters parameter
parameters = {'ward__n_clusters': [10, 20, 30]}
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, parameters, n_jobs=1)
clf.fit(X, y, cv=cv)  # set the best parameters
# Take the coefficients of the last pipeline step of the best estimator ...
coef_ = clf.best_estimator.steps[-1][1].coef_
# ... and map them back through the first step's inverse transform
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
# NOTE(review): `ridge` is created outside this fragment; the BayesianRidge
# claim above comes from the original comment and is not verifiable here.
f_regression = mem.cache(feature_selection.f_regression)  # caching function
anova = feature_selection.SelectPercentile(f_regression)
clf = Pipeline([('anova', anova), ('ridge', ridge)])
# Candidate values for the 'anova' step's percentile parameter
parameters = {'anova__percentile': [5, 10, 20]}
# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, parameters)
clf.fit(X, y, cv=cv)  # set the best parameters
# Same extraction as above: last-step coefficients, inverse-transformed by
# the first step, then reshaped to (size, size)
coef_ = clf.best_estimator.steps[-1][1].coef_
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
coef_selection_ = coef_.reshape(size, size)

###############################################################################
# Invert the transformation to plot the results on an image
pl.close('all')
pl.figure(figsize=(7.3, 2.7))
# First panel of a 1x3 subplot grid
pl.subplot(1, 3, 1)
# NOTE(review): `coef` (no trailing underscore) is not assigned in the visible
# fragment; presumably defined earlier in the example - confirm.
pl.imshow(coef, interpolation="nearest", cmap=pl.cm.RdBu_r)
    def XValidate(self, nPermutations):
        '''
        Estimate classification accuracy by repeated 5-fold cross-validation.

        C and gamma are first determined with ParameterGridSearch on the whole
        training set. Then, nPermutations times, the objects are shuffled and
        split into 5 folds; for every choice of 4 training folds the
        ANOVA + RBF-SVC pipeline is fitted and the remaining fold predicted.

        nPermutations -- number of random reshuffles of the training set.

        Returns None when self.classifier.UpdateTrainingSet() reports failure.
        Otherwise returns a list with one entry per training object; each
        entry is the list of wrongly predicted labels recorded for that
        object.

        The progress callback raises StopCalculating when the progress dialog
        reports that the user cancelled.
        '''
        # Make sure all data is available in the training set
        if not self.classifier.UpdateTrainingSet():
            return

        # Progress callback; `dlg` is looked up at call time, so the same
        # callback serves both the grid-search and the cross-validation dialog
        def cb(frac):
            cont, skip = dlg.Update(int(frac * 100.),
                                    '%d%% Complete' % (frac * 100.))
            if not cont:  # Cancel was pressed
                dlg.Destroy()
                raise StopCalculating()

        dialogStyle = (wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME
                       | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)
        dlg = wx.ProgressDialog(
            'Performing grid search for optimal parameters...', '0% Complete',
            100, self.classifier, dialogStyle)

        # Define cross validation parameters
        totalGroups = 5
        trainingGroups = 4

        # Convert the training set into SVM format and search for optimal
        # parameters C and gamma
        logging.info(
            'Performing grid search for parameters C and gamma on entire training set...'
        )
        self.TranslateTrainingSet(self.classifier.trainingSet.label_matrix,
                                  self.classifier.trainingSet.values)
        C, gamma = self.ParameterGridSearch(callback=cb)
        dlg.Destroy()
        # %g instead of %d so that fractional C values are not logged as 0
        logging.info(
            'Grid search completed. Found optimal C=%g and gamma=%f.' %
            (C, gamma))

        # Create the classifier and initialize misclassification storage.
        # The SVC tolerance keyword is `tol`, matching the pipeline in Train.
        classifier = Pipeline([
            ('anova',
             feature_selection.SelectPercentile(feature_selection.f_classif,
                                                percentile=self.percentile)),
            ('svc', SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1))
        ])
        nObjects = self.classifier.trainingSet.label_matrix.shape[0]
        indices = np.arange(nObjects)
        misclassifications = [[] for _ in range(nObjects)]

        # All ways of picking the training folds, plus arrays of all labels
        # and values for fancy indexing
        trainingTotalGroups = list(
            combinations(range(totalGroups), trainingGroups))
        allLabels = np.array(self.svm_train_labels)
        allValues = np.array(self.svm_train_values)

        # For all combinations of the subsets train the classifier on 4 groups
        # and classify the remaining group, for a number of random shuffles
        logging.info('Calculating average classification accuracy %d times over a '
                     '%0.1f%%/%0.1f%% cross-validation process' %
                     (nPermutations,
                      trainingGroups / float(totalGroups) * 100,
                      (1 - trainingGroups / float(totalGroups)) * 100))
        dlg = wx.ProgressDialog(
            'Calculating average cross-validation accuracy...', '0% Complete',
            100, self.classifier, dialogStyle)
        nTrainingTotalGroups = len(trainingTotalGroups)
        nOperations = float(nPermutations * nTrainingTotalGroups)
        for per in range(nPermutations):
            # Split the shuffled indices into totalGroups folds. array_split
            # uses integer bounds and yields near-equal folds even when
            # nObjects is not a multiple of totalGroups, so no fold ends up
            # empty as long as nObjects >= totalGroups.
            np.random.shuffle(indices)
            subsets = np.array_split(indices, totalGroups)

            for index, group in enumerate(trainingTotalGroups):
                # Retrieve indices of all objects in the training folds
                trainingSet = np.hstack(
                    [subsets[i] for i in range(totalGroups) if i in group])

                # Train a classifier on the subset
                classifier.fit(allValues[trainingSet], allLabels[trainingSet])

                # Predict the held-out fold using the trained classifier
                testSet = np.hstack(
                    [subsets[i] for i in range(totalGroups) if i not in group])
                predictedLabels = classifier.predict(allValues[testSet])

                # Store all misclassifications under the object's own index
                for objIndex, predicted, actual in zip(testSet,
                                                       predictedLabels,
                                                       allLabels[testSet]):
                    if predicted != actual:
                        misclassifications[objIndex].append(predicted)

                # Update progress dialog
                cb((nTrainingTotalGroups * per + index) / nOperations)

        # Calculate average classification accuracy
        dlg.Destroy()
        nMisclassified = sum(len(wrong) for wrong in misclassifications)
        logging.info('Average Classification Accuracy: %f%%' %
                     ((1 - nMisclassified /
                       float(nObjects * nPermutations)) * 100))

        return misclassifications