max_n=3, preprocessor=LowerCasePreprocessor(), ) # Build a vectorizer / classifier pipeline using the previous analyzer clf = Pipeline([ ('vec', CountVectorizer(analyzer=analyzer)), ('tfidf', TfidfTransformer(use_idf=False)), ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)), ]) # Fit the pipeline on the training set clf.fit(docs_train, y_train) # Predict the outcome on the testing set y_predicted = clf.predict(docs_test) # Print the classification report print metrics.classification_report(y_test, y_predicted, class_names=dataset.target_names) # Plot the confusion matrix cm = metrics.confusion_matrix(y_test, y_predicted) print cm # import pylab as pl #pl.matshow(cm) #pl.show() # Predict the result on some short new sentences: sentences = [
""" ================== Pipeline Anova SVM ================== Simple usage of Pipeline that runs successively a univariate feature selection with anova and then a C-SVM of the selected features. """ print __doc__ from scikits.learn import svm from scikits.learn.datasets import samples_generator from scikits.learn.feature_selection import SelectKBest, f_regression from scikits.learn.pipeline import Pipeline # import some data to play with X, y = samples_generator.make_classification( n_features=20, n_informative=3, n_redundant=0, n_classes=4, n_clusters_per_class=2) # ANOVA SVM-C # 1) anova filter, take 3 best ranked features anova_filter = SelectKBest(f_regression, k=3) # 2) svm clf = svm.SVC(kernel='linear') anova_svm = Pipeline([('anova', anova_filter), ('svm', clf)]) anova_svm.fit(X, y) anova_svm.predict(X)
def XValidate(self, nPermutations):
    """Estimate classification accuracy by repeated 5-fold cross-validation.

    A grid search (``self.ParameterGridSearch``) first picks the SVM
    parameters C and gamma on the whole training set.  An ANOVA-percentile
    feature selector followed by an RBF SVC is then built with those
    parameters.  For each of ``nPermutations`` random shuffles the objects
    are split into 5 groups; the pipeline is trained on every combination of
    4 groups and used to predict the remaining group.  Progress is reported
    through wx progress dialogs.

    Parameters
    ----------
    nPermutations : int
        Number of random re-partitionings of the training set to evaluate.

    Returns
    -------
    list of list, or None
        One list per training object, indexed by the object's position in
        the training set, holding every wrong label predicted for it.
        ``None`` is returned when ``UpdateTrainingSet()`` reports failure.

    Raises
    ------
    StopCalculating
        If the user presses Cancel in one of the progress dialogs.
    """
    # Make sure all data is available in the training set
    if not self.classifier.UpdateTrainingSet():
        return

    # Progress callback. It looks up ``dlg`` when called, so it always
    # drives whichever dialog is currently bound to that name.
    def cb(frac):
        cont, skip = dlg.Update(int(frac * 100.), '%d%% Complete'%(frac * 100.))
        if not cont:  # Cancel was pressed
            dlg.Destroy()
            raise StopCalculating()

    dlg = wx.ProgressDialog('Performing grid search for optimal parameters...',
                            '0% Complete', 100, self.classifier,
                            wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME |
                            wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)

    # Define cross validation parameters
    totalGroups = 5
    trainingGroups = 4

    # Convert the training set into SVM format and search for optimal
    # parameters C and gamma using 5-fold cross-validation
    logging.info('Performing grid search for parameters C and gamma on entire training set...')
    self.TranslateTrainingSet(self.classifier.trainingSet.label_matrix,
                              self.classifier.trainingSet.values)
    C, gamma = self.ParameterGridSearch(callback=cb)
    dlg.Destroy()
    logging.info('Grid search completed. Found optimal C=%d and gamma=%f.' % (C, gamma))

    # Create the classifier and initialize misclassification storage
    classifier = Pipeline([
        ('anova', feature_selection.SelectPercentile(feature_selection.f_classif,
                                                     percentile=self.percentile)),
        ('svc', SVC(kernel='rbf', C=C, gamma=gamma, eps=0.1)),
    ])
    nObjects = self.classifier.trainingSet.label_matrix.shape[0]
    indices = np.arange(nObjects)
    misclassifications = [[] for i in range(nObjects)]

    # Create group combinations and arrays of all labels and values.
    # NOTE(review): a plain list(combinations(...)) looks equivalent here;
    # the structured-array form is kept unchanged.
    dt = ','.join('i'*trainingGroups)
    trainingTotalGroups = list(np.fromiter(combinations(range(totalGroups), trainingGroups),
                                           dtype=dt, count=-1))
    allLabels = np.array(self.svm_train_labels)
    allValues = np.array(self.svm_train_values)

    # For all permutations of the subsets train the classifier on 4
    # totalGroups and classify the remaining group for a number of random
    # subsets
    logging.info('Calculating average classification accuracy %d times over a ' \
                 '%0.1f%%/%0.1f%% cross-validation process' % \
                 (nPermutations, trainingGroups/float(totalGroups)*100, \
                 (1-trainingGroups/float(totalGroups))*100))
    dlg = wx.ProgressDialog('Calculating average cross-validation accuracy...',
                            '0% Complete', 100, self.classifier,
                            wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME |
                            wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)
    nTrainingTotalGroups = len(trainingTotalGroups)
    nOperations = float(nPermutations * nTrainingTotalGroups)
    for per in range(nPermutations):
        # Split the shuffled training set into near-equal subsets.
        # array_split accepts any object count, whereas slicing with a
        # float bound and hsplit-ing the first four groups fails when the
        # count does not divide evenly (e.g. 6, 7 or 11 objects).
        np.random.shuffle(indices)
        subsets = np.array_split(indices, totalGroups)

        for index, group in enumerate(trainingTotalGroups):
            # Retrieve indices of all objects in the training set
            trainingSet = np.hstack([subsets[i] for i in range(totalGroups) if i in group])

            # Train a classifier on the subset
            classifier.fit(allValues[trainingSet], allLabels[trainingSet])

            # Predict the test set using the trained classifier
            testSet = np.hstack([subsets[i] for i in range(totalGroups) if i not in group])
            testLabels = classifier.predict(allValues[testSet])

            # Store all misclassifications; the true labels are gathered
            # once per fold instead of once per object
            trueLabels = allLabels[testSet]
            for objIndex, predicted, actual in zip(testSet, testLabels, trueLabels):
                if predicted != actual:
                    misclassifications[objIndex].append(predicted)

            # Update progress dialog
            cb((nTrainingTotalGroups * per + index) / nOperations)

    # Calculate average classification accuracy: every object is tested
    # exactly once per permutation
    dlg.Destroy()
    nMisclassified = sum(len(wrong) for wrong in misclassifications)
    logging.info('Average Classification Accuracy: %f%%' % \
                 ((1 - nMisclassified / float(nObjects * nPermutations))*100))

    return misclassifications
max_n=3, preprocessor=LowerCasePreprocessor(), ) # Build a vectorizer / classifier pipeline using the previous analyzer clf = Pipeline([ ('vec', CountVectorizer(analyzer=analyzer)), ('tfidf', TfidfTransformer()), ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)), ]) # Fit the pipeline on the training set clf.fit(docs_train, y_train) # Predict the outcome on the testing set y_predicted = clf.predict(docs_test) # Print the classification report print metrics.classification_report(y_test, y_predicted, class_names=dataset.target_names) # Plot the confusion matrix cm = metrics.confusion_matrix(y_test, y_predicted) print cm # import pylab as pl #pl.matshow(cm) #pl.show() # Predict the result on some short new sentences:
""" ================== Pipeline Anova SVM ================== Simple usage of Pipeline that runs successively a univariate feature selection with anova and then a C-SVM of the selected features. """ print __doc__ from scikits.learn import svm from scikits.learn.datasets import samples_generator from scikits.learn.feature_selection import SelectKBest, f_regression from scikits.learn.pipeline import Pipeline # import some data to play with X, y = samples_generator.test_dataset_classif(k=5) # ANOVA SVM-C # 1) anova filter, take 5 best ranked features anova_filter = SelectKBest(f_regression, k=5) # 2) svm clf = svm.SVC(kernel='linear') anova_svm = Pipeline([('anova', anova_filter), ('svm', clf)]) anova_svm.fit(X, y) anova_svm.predict(X)
def XValidate(self, nPermutations):
    """Run repeated 5-fold cross-validation and collect misclassifications.

    The SVM parameters C and gamma are first chosen by
    ``self.ParameterGridSearch`` on the complete training set.  A pipeline
    of ANOVA percentile feature selection and an RBF SVC is then trained on
    every combination of 4 out of 5 random groups and evaluated on the
    group left out; this is repeated for ``nPermutations`` random shuffles.
    Two wx progress dialogs report progress and allow cancelling.

    Parameters
    ----------
    nPermutations : int
        How many times the training set is reshuffled and re-partitioned.

    Returns
    -------
    list of list, or None
        ``misclassifications[k]`` holds every wrong label predicted for
        the object at position ``k`` of the training set.  ``None`` is
        returned when ``UpdateTrainingSet()`` reports failure.

    Raises
    ------
    StopCalculating
        If the user presses Cancel in one of the progress dialogs.
    """
    # Make sure all data is available in the training set
    if not self.classifier.UpdateTrainingSet():
        return

    # Progress callback. ``dlg`` is resolved at call time, so the same
    # callback serves both dialogs created below.
    def cb(frac):
        cont, skip = dlg.Update(int(frac * 100.),
                                '%d%% Complete' % (frac * 100.))
        if not cont:  # Cancel was pressed
            dlg.Destroy()
            raise StopCalculating()

    dlg = wx.ProgressDialog(
        'Performing grid search for optimal parameters...', '0% Complete',
        100, self.classifier, wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME
        | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)

    # Define cross validation parameters
    totalGroups = 5
    trainingGroups = 4

    # Convert the training set into SVM format and search for optimal
    # parameters C and gamma using 5-fold cross-validation
    logging.info(
        'Performing grid search for parameters C and gamma on entire training set...'
    )
    self.TranslateTrainingSet(self.classifier.trainingSet.label_matrix,
                              self.classifier.trainingSet.values)
    C, gamma = self.ParameterGridSearch(callback=cb)
    dlg.Destroy()
    logging.info(
        'Grid search completed. Found optimal C=%d and gamma=%f.' %
        (C, gamma))

    # Create the classifier and initialize misclassification storage
    classifier = Pipeline([
        ('anova',
         feature_selection.SelectPercentile(feature_selection.f_classif,
                                            percentile=self.percentile)),
        ('svc', SVC(kernel='rbf', C=C, gamma=gamma, eps=0.1))
    ])
    nObjects = self.classifier.trainingSet.label_matrix.shape[0]
    indices = np.arange(nObjects)
    misclassifications = [[] for i in range(nObjects)]

    # Create group combinations and arrays of all labels and values.
    # NOTE(review): a plain list(combinations(...)) looks equivalent here;
    # the structured-array form is kept unchanged.
    dt = ','.join('i' * trainingGroups)
    trainingTotalGroups = list(
        np.fromiter(combinations(range(totalGroups), trainingGroups),
                    dtype=dt,
                    count=-1))
    allLabels = np.array(self.svm_train_labels)
    allValues = np.array(self.svm_train_values)

    # For all permutations of the subsets train the classifier on 4
    # totalGroups and classify the remaining group for a number of random
    # subsets
    logging.info('Calculating average classification accuracy %d times over a ' \
                 '%0.1f%%/%0.1f%% cross-validation process' % \
                 (nPermutations, trainingGroups/float(totalGroups)*100, \
                 (1-trainingGroups/float(totalGroups))*100))
    dlg = wx.ProgressDialog(
        'Calculating average cross-validation accuracy...', '0% Complete',
        100, self.classifier, wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME
        | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)
    nTrainingTotalGroups = len(trainingTotalGroups)
    nOperations = float(nPermutations * nTrainingTotalGroups)
    for per in range(nPermutations):
        # Split the shuffled training set into near-equal subsets.
        # array_split copes with any object count; the former float slice
        # bound plus hsplit of the first four groups raised whenever the
        # count did not divide evenly (e.g. 6, 7 or 11 objects).
        np.random.shuffle(indices)
        subsets = np.array_split(indices, totalGroups)

        for index, group in enumerate(trainingTotalGroups):
            # Retrieve indices of all objects in the training set
            trainingSet = np.hstack(
                [subsets[i] for i in range(totalGroups) if i in group])

            # Train a classifier on the subset
            classifier.fit(allValues[trainingSet], allLabels[trainingSet])

            # Predict the test set using the trained classifier
            testSet = np.hstack(
                [subsets[i] for i in range(totalGroups) if i not in group])
            testLabels = classifier.predict(allValues[testSet])

            # Store all misclassifications; the expected labels are looked
            # up once per fold rather than once per object
            expectedLabels = allLabels[testSet]
            for position, guess, truth in zip(testSet, testLabels,
                                              expectedLabels):
                if guess != truth:
                    misclassifications[position].append(guess)

            # Update progress dialog
            cb((nTrainingTotalGroups * per + index) / nOperations)

    # Calculate average classification accuracy: each object is tested
    # exactly once per permutation
    dlg.Destroy()
    errorCount = sum(len(sublist) for sublist in misclassifications)
    logging.info('Average Classification Accuracy: %f%%' % \
                 ((1 - errorCount / float(nObjects * nPermutations)) * 100))

    return misclassifications