def train_lrpipe(trainX, trainY, params):
    """
    Fit a one-step pipeline wrapping an L1-penalised LogisticRegression.

    The estimator is created with C=1 and registered under the step name
    'logreg'.  `params` is unpacked as keyword arguments into the
    pipeline's fit() call; its keys presumably follow the
    'logreg__<name>' convention (e.g. 'logreg__C') -- TODO confirm
    against the callers.

    Returns whatever Pipeline.fit returns (assumed to be the fitted
    pipeline itself).
    """
    estimator = LogisticRegression(penalty="l1", C=1)
    pipeline = Pipeline([('logreg', estimator)])
    return pipeline.fit(trainX, trainY, **params)
def Train(self, colNames, nValidation, labels, values, fout=None, callback=None):
    '''
    Fit the final SVM pipeline on the supplied training set.

    The labels and values are first handed to TranslateTrainingSet, then
    ParameterGridSearch supplies C and gamma, and finally an
    ANOVA-selection + RBF-SVC pipeline is fitted on
    self.svm_train_values / self.svm_train_labels and stored in
    self.model.

    colNames and fout are accepted but not used by this method.
    nValidation is only forwarded to the grid search when it is greater
    than one; otherwise the grid search runs with its own default.
    '''
    # First make sure the supplied problem is in SVM format
    self.TranslateTrainingSet(labels, values)

    # Grid-search for the C and gamma parameters of the C-SVM classifier
    if nValidation > 1:
        searchArgs = (callback, nValidation)
    else:
        searchArgs = (callback,)
    C, gamma = self.ParameterGridSearch(*searchArgs)

    # Build and fit the final classifier with the parameters found above
    selector = feature_selection.SelectPercentile(feature_selection.f_classif,
                                                  percentile=self.percentile)
    svc = SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1)
    self.model = Pipeline([('anova', selector), ('svc', svc)])
    self.model.fit(self.svm_train_values, self.svm_train_labels)
def train_svpipe(trainX, trainY, params):
    """
    Fit a one-step pipeline wrapping a default-constructed SVC.

    The estimator is registered under the step name 'rbfsvm' (the name
    suggests the RBF kernel is expected to be SVC's default -- TODO
    confirm for the library version in use).  `params` is unpacked as
    keyword arguments into the pipeline's fit() call; its keys
    presumably follow the 'rbfsvm__<name>' convention (e.g.
    'rbfsvm__C', 'rbfsvm__gamma').  Passing None is treated the same as
    passing an empty dict.

    Returns whatever Pipeline.fit returns (assumed to be the fitted
    pipeline itself).
    """
    svpipe = Pipeline([('rbfsvm', SVC())])
    # ``**None`` raises TypeError, so fall back to "no extra arguments"
    fit_params = {} if params is None else params
    return svpipe.fit(trainX, trainY, **fit_params)
def test_countvectorizer_custom_vocabulary_pipeline():
    # A vocabulary handed to CountVectorizer must still be the
    # vectorizer's vocabulary after the count -> tf-idf pipeline has been
    # fitted, and must determine the number of output columns.
    wanted_terms = ["pizza", "beer"]
    counter = CountVectorizer(vocabulary=wanted_terms)
    pipe = Pipeline([('count', counter), ('tfidf', TfidfTransformer())])

    X = pipe.fit_transform(ALL_FOOD_DOCS)

    fitted_vocabulary = pipe.named_steps['count'].vocabulary
    assert_equal(set(fitted_vocabulary), set(wanted_terms))
    assert_equal(X.shape[1], len(wanted_terms))
def train_svpipe(trainX, trainY, params):
    """
    Train a pipeline whose only step is a default-constructed SVC.

    The step is named 'rbfsvm'.  `params` is forwarded as keyword
    arguments to the pipeline's fit() call, so its keys are presumably
    of the form 'rbfsvm__<name>' (e.g. 'rbfsvm__C') -- TODO confirm
    against the callers.  None is accepted and means "no extra
    arguments".

    Returns the result of Pipeline.fit (assumed to be the fitted
    pipeline).
    """
    if params is None:
        # unpacking None with ** would raise TypeError
        params = {}
    svpipe = Pipeline([('rbfsvm', SVC())])
    svpipe = svpipe.fit(trainX, trainY, **params)
    return svpipe
def train(cls, labeled_featuresets):
    """
    Build and fit a bag-of-words + LinearSVC pipeline.

    `labeled_featuresets` is an iterable of (featureset, label) pairs.
    The distinct labels are sorted and each sample's label is replaced
    by its position in that sorted list before fitting.

    Returns cls(pipeline, target_names), where target_names is the
    sorted list of distinct labels.  (The `cls` first argument suggests
    this is used as a classmethod -- the decorator is not visible here.)
    """
    featuresets, target_labels = zip(*labeled_featuresets)
    target_names = sorted(set(target_labels))

    # Build the label -> position lookup once instead of running a
    # linear list.index() scan for every sample.
    position_of = dict((name, position)
                       for position, name in enumerate(target_names))
    targets = [position_of[label] for label in target_labels]

    pipeline = Pipeline([("bow", BagOfWordsVectorizer()),
                         ("clf", LinearSVC(C=1000))])
    pipeline.fit(featuresets, targets)
    return cls(pipeline, target_names)
def train_lrpipe(trainX, trainY, params):
    """
    Train a pipeline whose only step is an L1-penalised
    LogisticRegression (created with C=1, step name 'logreg').

    `params` is forwarded as keyword arguments to the pipeline's fit()
    call; its keys are presumably of the form 'logreg__<name>' (e.g.
    'logreg__C') -- TODO confirm against the callers.

    Returns the result of Pipeline.fit (assumed to be the fitted
    pipeline).
    """
    steps = [('logreg', LogisticRegression(penalty="l1", C=1))]
    fitted = Pipeline(steps).fit(trainX, trainY, **params)
    return fitted
def train(cls, labeled_featuresets):
    """
    Fit a bag-of-words + LinearSVC pipeline on labelled feature sets.

    `labeled_featuresets` is an iterable of (featureset, label) pairs.
    Labels are encoded as their index in the sorted list of distinct
    labels before the pipeline is fitted.

    Returns cls(pipeline, target_names) with target_names being that
    sorted label list.  (Presumably a classmethod; the decorator is not
    visible in this excerpt.)
    """
    samples, target_labels = zip(*labeled_featuresets)
    target_names = sorted(set(target_labels))

    # One dictionary lookup per sample replaces the original
    # list.index() call, which rescanned target_names for every sample.
    index_by_label = {}
    for index, name in enumerate(target_names):
        index_by_label[name] = index
    targets = [index_by_label[label] for label in target_labels]

    pipeline = Pipeline([
        ('bow', BagOfWordsVectorizer()),
        ('clf', LinearSVC(C=1000)),
    ])
    pipeline.fit(samples, targets)
    return cls(pipeline, target_names)
def test_dense_vectorizer_pipeline_grid_selection():
    # Grid-search a CountVectorizer + LinearSVC pipeline on the toy food
    # corpus and check the predictions, the best score and the selected
    # n-gram size.

    # raw documents
    documents = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # simulate iterables: everything but the first and last document is
    # used for training, those two are held out for testing
    train_data = iter(documents[1:-1])
    test_data = iter([documents[0], documents[-1]])

    # the first six documents are labelled -1, the others +1
    # (presumably the six junk food documents -- TODO confirm length)
    y = np.ones(len(documents))
    y[:6] = -1
    y_train = y[1:-1]
    y_test = np.array([y[0], y[-1]])

    steps = [('vect', CountVectorizer()), ('svc', LinearSVC())]
    pipeline = Pipeline(steps)

    parameters = {
        'vect__analyzer__max_n': (1, 2),
        'svc__loss': ('l1', 'l2'),
    }

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # cross-validation doesn't work if the length of the data is not known,
    # hence use lists instead of iterators
    fitted_search = grid_search.fit(list(train_data), y_train)
    pred = fitted_search.predict(list(test_data))
    assert_array_equal(pred, y_test)

    # on this toy dataset all candidate models reach 100% accuracy, so the
    # best score is exactly 1.0; the assertion below pins which of the tied
    # candidates the grid search is expected to report as best
    assert_equal(grid_search.best_score, 1.0)
    best_vectorizer = grid_search.best_estimator.named_steps['vect']
    assert_equal(best_vectorizer.analyzer.max_n, 1)
def get_clf(n=3, binarize=True):
    """
    Build a character n-gram naive Bayes pipeline.

    The first step is a CountVectorizer driven by a CharNGramAnalyzer
    covering n-gram sizes 1..n.  With binarize=True the counts go through
    an in-place Binarizer and are classified with BernoulliNB; otherwise
    they are fed directly to MultinomialNB.

    Returns the (unfitted) Pipeline.
    """
    analyzer = CharNGramAnalyzer(min_n=1, max_n=n,
                                 preprocessor=SimplePreprocessor())
    head = [('vectorizer', CountVectorizer(analyzer))]

    if binarize:
        tail = [('binarizer', Binarizer(copy=False)),
                ('clf', naive_bayes.BernoulliNB())]
    else:
        tail = [('clf', naive_bayes.MultinomialNB())]

    return Pipeline(head + tail)
def do_grid_search(X, Y, gs_params):
    """
    Grid-search `gs_params` for a one-step LogisticRegression pipeline.

    The estimator is registered under the step name 'logreg', so the
    keys of `gs_params` are presumably of the form 'logreg__<name>'.
    The search runs with n_jobs=-1.  The entry of grid_scores_ with the
    highest score is logged and its parameter setting is returned.
    """
    pipeline = Pipeline([('logreg', LogisticRegression())])
    search = GridSearchCV(pipeline, gs_params, n_jobs=-1)
    search = search.fit(X, Y)

    # each grid_scores_ entry is unpacked as (parameters, score)
    def score_of(entry):
        return entry[1]

    best_parameters, score = max(search.grid_scores_, key=score_of)

    logger.info("best_parameters: " + str(best_parameters))
    logger.info("expected score: " + str(score))
    return best_parameters
def Train(self, colNames, nValidation, labels, values, fout=None, callback = None):
    '''
    Train the SVM model on a training set using grid-searched C and gamma.

    Steps: hand labels/values to TranslateTrainingSet, obtain C and gamma
    from ParameterGridSearch, then fit an ANOVA-selection + RBF-SVC
    pipeline on self.svm_train_values / self.svm_train_labels and keep it
    in self.model.

    colNames and fout are not used in this method.  nValidation is passed
    on to the grid search only when it exceeds one.
    '''
    # First make sure the supplied problem is in SVM format
    self.TranslateTrainingSet(labels, values)

    # Perform a grid-search to obtain the C and gamma parameters for C-SVM
    # classification; without a usable fold count rely on the search's default
    searchKwargs = {}
    useFoldCount = nValidation > 1
    if useFoldCount:
        C, gamma = self.ParameterGridSearch(callback, nValidation, **searchKwargs)
    else:
        C, gamma = self.ParameterGridSearch(callback, **searchKwargs)

    # Train the model using the obtained C and gamma parameters to obtain
    # the final classifier
    steps = [
        ('anova', feature_selection.SelectPercentile(feature_selection.f_classif,
                                                     percentile=self.percentile)),
        ('svc', SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1)),
    ]
    self.model = Pipeline(steps)
    self.model.fit(self.svm_train_values, self.svm_train_labels)
def do_grid_search(X, Y, gs_params=None):
    """
    Grid-search parameters for a one-step SVC pipeline.

    The estimator is registered under the step name 'rbfsvm'.  When
    `gs_params` is missing or empty, a built-in grid over 'rbfsvm__C' and
    'rbfsvm__gamma' is used.  The search runs with n_jobs=-1.  The entry
    of grid_scores_ with the highest score is logged and its parameter
    setting is returned.
    """
    pipeline = Pipeline([('rbfsvm', SVC())])

    # fall back to the default grid for None / empty input
    gs_params = gs_params or {
        'rbfsvm__C': (1.5, 2, 5, 10, 20),
        'rbfsvm__gamma': (0.01, 0.1, 0.3, 0.6, 1, 1.5, 2, 5),
    }

    search = GridSearchCV(pipeline, gs_params, n_jobs=-1)
    search = search.fit(X, Y)

    # each grid_scores_ entry is unpacked as (parameters, score)
    def score_of(entry):
        return entry[1]

    best_parameters, score = max(search.grid_scores_, key=score_of)

    logger.info("best_parameters: " + str(best_parameters))
    logger.info("expected score: " + str(score))
    return best_parameters
from scikits.learn.pipeline import Pipeline

################################################################################
# Load the digits data set and flatten every sample into one feature row
digits = datasets.load_digits()
y = digits.target
n_samples = len(y)
X = digits.data.reshape((n_samples, -1))

################################################################################
# Create a feature-selection transform and an instance of SVM that we
# combine together to have a full-blown estimator
transform = feature_selection.SelectPercentile(feature_selection.f_classif)
clf = Pipeline([transform], svm.SVC())

################################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = []
score_stds = []
percentiles = tuple(range(10, 101, 10))

for percentile in percentiles:
    # the transform object is shared with clf, so updating it here changes
    # the pipeline that is cross-validated below
    transform._set_params(percentile=percentile)
    this_scores = cross_val.cross_val_score(clf, X, y)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

# mean score per percentile with the standard deviation as error bar
pl.errorbar(percentiles, score_means, np.array(score_stds))
def XValidate(self, nPermutations):
    '''
    Estimate classification accuracy by repeated 5-group cross-validation.

    After obtaining C and gamma from ParameterGridSearch, the training
    set is shuffled nPermutations times.  Each time it is split into five
    groups, and for every choice of four training groups an
    ANOVA-selection + RBF-SVC pipeline is fitted and used to predict the
    remaining group.  Two wx progress dialogs report progress.

    Returns a list with one entry per training object; each entry is the
    list of wrong labels predicted for that object.  Returns None when
    self.classifier.UpdateTrainingSet() reports failure.

    Raises StopCalculating when the user cancels a progress dialog.
    '''
    # Make sure all data is available in the training set
    if not self.classifier.UpdateTrainingSet():
        return

    # Progress callback; it looks up `dlg` at call time, so it always
    # updates whichever dialog is currently assigned below
    def cb(frac):
        cont, skip = dlg.Update(int(frac * 100.), '%d%% Complete' % (frac * 100.))
        if not cont:  # Cancel was pressed
            dlg.Destroy()
            raise StopCalculating()

    dlg = wx.ProgressDialog(
        'Performing grid search for optimal parameters...', '0% Complete', 100,
        self.classifier,
        wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME | wx.PD_REMAINING_TIME
        | wx.PD_CAN_ABORT)

    # Define cross validation parameters
    totalGroups = 5
    trainingGroups = 4

    # Convert the training set into SVM format and search for the optimal
    # parameters C and gamma
    logging.info(
        'Performing grid search for parameters C and gamma on entire training set...'
    )
    self.TranslateTrainingSet(self.classifier.trainingSet.label_matrix,
                              self.classifier.trainingSet.values)
    C, gamma = self.ParameterGridSearch(callback=cb)
    dlg.Destroy()
    # %s instead of %d for C: %d would truncate a fractional C to an integer
    logging.info(
        'Grid search completed. Found optimal C=%s and gamma=%f.'
        % (C, gamma))

    # Create the classifier and initialize misclassification storage.
    # `tol` is the keyword Train uses for the same estimator; the former
    # `eps` spelling is not accepted by SVC versions that take `tol`.
    classifier = Pipeline([
        ('anova', feature_selection.SelectPercentile(feature_selection.f_classif,
                                                     percentile=self.percentile)),
        ('svc', SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1))
    ])
    nObjects = self.classifier.trainingSet.label_matrix.shape[0]
    indices = np.arange(nObjects)
    misclassifications = [[] for i in range(nObjects)]

    # Create group combinations and arrays of all labels and values
    dt = ','.join('i' * trainingGroups)
    trainingTotalGroups = list(
        np.fromiter(combinations(range(totalGroups), trainingGroups),
                    dtype=dt, count=-1))
    allLabels = np.array(self.svm_train_labels)
    allValues = np.array(self.svm_train_values)

    # For all permutations of the subsets train the classifier on 4
    # totalGroups and classify the remaining group
    logging.info('Calculating average classification accuracy %d times over a ' \
                 '%0.1f%%/%0.1f%% cross-validation process' % \
                 (nPermutations, trainingGroups/float(totalGroups)*100, \
                 (1-trainingGroups/float(totalGroups))*100))
    dlg = wx.ProgressDialog(
        'Calculating average cross-validation accuracy...', '0% Complete', 100,
        self.classifier,
        wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME | wx.PD_REMAINING_TIME
        | wx.PD_CAN_ABORT)
    nTrainingTotalGroups = len(trainingTotalGroups)
    nOperations = float(nPermutations * nTrainingTotalGroups)

    for per in range(nPermutations):
        # Split the shuffled training set into totalGroups subsets.
        # array_split tolerates sizes that do not divide evenly and only
        # uses integer boundaries (the previous float slice bound derived
        # from np.ceil is rejected by current numpy releases).
        np.random.shuffle(indices)
        subsets = np.array_split(indices, totalGroups)

        for index, group in enumerate(trainingTotalGroups):
            # Retrieve indices of all objects in the training set
            trainingSet = np.hstack(
                [subsets[i] for i in range(totalGroups) if i in group])

            # Train a classifier on the subset
            classifier.fit(allValues[trainingSet], allLabels[trainingSet])

            # Predict the test set using the trained classifier
            testSet = np.hstack(
                [subsets[i] for i in range(totalGroups) if i not in group])
            testLabels = classifier.predict(allValues[testSet])

            # Store all misclassifications
            expectedLabels = allLabels[testSet]
            for objIndex, predicted, expected in zip(testSet, testLabels,
                                                     expectedLabels):
                if predicted != expected:
                    misclassifications[objIndex].append(predicted)

            # Update progress dialog
            cb((nTrainingTotalGroups * per + index) / nOperations)

    # Calculate average classification accuracy
    dlg.Destroy()
    nWrong = sum(len(wrong) for wrong in misclassifications)
    logging.info('Average Classification Accuracy: %f%%' % \
                 ((1 - nWrong / float(nObjects * nPermutations)) * 100))

    return misclassifications
class SupportVectorMachines(object):
    '''
    Class to define a complete support vector machine classifier calculation
    problem: scaling of the training set, grid search for C and gamma,
    training, cross-validation and scoring of objects per image.
    '''
    def __init__(self, classifier = None):
        logging.info('Initialized New Support Vector Machines Classifier')
        self.model = None
        self.classBins = []
        self.classifier = classifier
        # Percentile of features kept by the ANOVA selection step
        self.percentile = 90
        # Set by LoadModel; initialised here so the attribute always exists
        self.bin_labels = None

        # Initialize the total object storage
        self.perClassObjects = {}
        self.feat_min, self.feat_max = None, None
        self.svm_train_labels, self.svm_train_values = None, None

    def CheckProgress(self):
        '''
        Cross-validate the classifier and let the user pick how to view the
        result (confusion matrix or dimension reduction plot).

        Does nothing when the cross-validation is cancelled or could not run.
        '''
        # Calculate cross-validation data
        nPermutations = 10
        try:
            misclassifications = self.XValidate(nPermutations)
        except StopCalculating:
            return
        # XValidate returns None when the training set could not be updated;
        # there is nothing to visualize in that case
        if misclassifications is None:
            return

        def confusionMatrix():
            # Every object was predicted nPermutations times; pad the list of
            # wrong predictions with the true label for the correct ones
            predicted = [misclassifications[i] + [val] * (nPermutations - len(misclassifications[i]))
                         for i, val in enumerate(self.svm_train_labels)]
            matrix, axes = self.ConfusionMatrix(self.svm_train_labels, predicted)
            self.classifier.ShowConfusionMatrix(matrix, axes)

        def dimensionReduction():
            # Initialize PCA/tSNE plot
            pca_main = dr.PlotMain(self.classifier, properties = Properties.getInstance(), loadData = False)
            pca_main.set_data(self.classifier.trainingSet.values,
                              dict(enumerate(self.classifier.trainingSet.get_object_keys())),
                              np.int64(self.classifier.trainingSet.label_matrix > 0),
                              self.classifier.trainingSet.labels,
                              # fraction of the permutations in which each object was misclassified
                              np.array([len(wrong)/float(nPermutations)
                                        for wrong in misclassifications]).round(2))
            pca_main.Show(True)

        # Ask how the user wants to visualize the cross-validation results (either through
        # a confusion matrix or visually in a dimension reductionality plot)
        visualizationChoiceBox(self.classifier, -1, 'Pick cross-validation visualization',
                               confusionMatrix, dimensionReduction)

    def ClearModel(self):
        # Clear all parameters related to the trained classifier
        self.classBins = []
        self.model = None
        self.feat_min, self.feat_max = None, None
        self.svm_train_labels, self.svm_train_values = None, None

    def ComplexityTxt(self):
        return '# of cross-validations: '

    def ConfusionMatrix(self, actual = None, predicted = None):
        '''
        Build a confusion matrix indexed as [predicted class, actual class].

        Without arguments the matrix is computed from the class bins and the
        stored per-class objects.  Otherwise `actual` is a sequence of class
        numbers and `predicted` holds, for each of them, a sequence of
        predicted class numbers.

        Returns (matrix, class labels).
        '''
        # Retrieve the number of classes, their labels and initialize
        # the confusion matrix
        nClasses = len(self.classBins)
        confusionMatrix = np.zeros((nClasses, nClasses), np.int64)
        classLabels = [classBin.label for classBin in self.classBins]

        if actual is None or predicted is None:
            # For each of the objects used to train the classifier, check what
            # class it was predicted to have been by the classifier
            for actualClassNum, classBin in enumerate(self.classBins):
                actualClassObjects = classBin.GetObjectKeys()
                for predictedClassNum, predictedLabel in enumerate(classLabels):
                    predictedObjects = self.perClassObjects[predictedLabel]
                    confusionMatrix[predictedClassNum, actualClassNum] += \
                        len([obj for obj in actualClassObjects if obj in predictedObjects])
        else:
            # Generate the confusion matrix for a list of actual and predicted classes
            for i, actualClass in enumerate(actual):
                actualClass = int(actualClass)
                # Count every prediction made for this object
                for j in predicted[i]:
                    confusionMatrix[int(j), actualClass] += 1

        return confusionMatrix, classLabels

    def ConvertToSVMFormat(self, labels, values):
        '''
        Convert the training set labels to class indices.

        For every row of `labels` the position(s) of the positive entries are
        taken and the result is squeezed (one class index per object when each
        row has exactly one positive entry).  `values` is returned unchanged.
        '''
        labels = np.array([np.nonzero(target > 0) for target in labels]).squeeze()
        return labels, values

    def CreatePerObjectClassTable(self, classes):
        '''
        Saves object keys and classes to a SQL table.

        The table named by the "class_table" property is dropped and
        recreated.  `classes` is not used by this method.  Raises ValueError
        when "class_table" is not set.
        '''
        p = Properties.getInstance()
        if p.class_table is None:
            raise ValueError('"class_table" in properties file is not set.')

        index_cols = dbconnect.UniqueObjectClause()
        class_cols = dbconnect.UniqueObjectClause() + ', class, class_number'
        class_col_defs = dbconnect.object_key_defs() + ', class VARCHAR (%d)'%(max([len(c.label) for c in self.classBins])+1) + ', class_number INT'

        # Drop must be explicitly asked for Classifier.ScoreAll
        db = dbconnect.DBConnect.getInstance()
        db.execute('DROP TABLE IF EXISTS %s'%(p.class_table))
        db.execute('CREATE TABLE %s (%s)'%(p.class_table, class_col_defs))
        db.execute('CREATE INDEX idx_%s ON %s (%s)'%(p.class_table, p.class_table, index_cols))

        # NOTE(review): statements are assembled by string concatenation; class
        # names containing a double quote would break the INSERT below
        for clNum, clName in enumerate(self.perClassObjects.keys()):
            for obj in self.perClassObjects[clName]:
                query = ''.join(['INSERT INTO ',p.class_table,' (',class_cols,') VALUES (',str(obj[0]),', ',str(obj[1]),', "',clName,'", ',str(clNum+1),')'])
                db.execute(query)

        if p.db_type.lower() == 'mysql':
            query = ''.join(['ALTER TABLE ',p.class_table,' ORDER BY ',p.image_id,' ASC, ',p.object_id,' ASC'])
            db.execute(query)
        db.Commit()

    def FilterObjectsFromClassN(self, classN = None, keys = None):
        '''
        Filter the input objects to output the keys of those in classN,
        using a defined SVM model classifier.

        `keys` is either a string handed to the database as-is, or a sequence
        of keys; a sequence as long as the image key columns is treated as an
        image key whose objects are looked up.  With no keys (None or empty)
        or no objects, every class simply gets an empty list.

        Returns a dict {class number: [object keys]} when classN is None,
        otherwise only the list for classN.
        '''
        # Retrieve instance of the database connection
        db = dbconnect.DBConnect.getInstance()
        object_data = {}
        if isinstance(keys, str):
            object_data[0] = db.GetCellDataForClassifier(keys)
        elif keys:
            if len(keys) == len(dbconnect.image_key_columns()):
                # Retrieve instance of the data model and retrieve objects in the requested image
                dm = DataModel.getInstance()
                obKeys = dm.GetObjectsFromImage(keys[0])
            else:
                obKeys = keys
            for key in obKeys:
                object_data[key] = db.GetCellDataForClassifier(key)

        # Group the object keys per class (class numbers start at 1)
        classObjects = {}
        for index in range(1, len(self.classBins)+1):
            classObjects[float(index)] = []

        # Only scale and predict when there is data; an empty array cannot be
        # unpacked into rows and columns by ScaleData
        if object_data:
            sorted_keys = sorted(object_data.keys())
            values_array = np.array([object_data[key] for key in sorted_keys])
            scaled_values = self.ScaleData(values_array)
            pred_labels = self.model.predict(scaled_values)
            for key, label in zip(sorted_keys, pred_labels):
                classObjects[int(label)+1].append(key)

        # Return either a summary of all classes and their corresponding objects
        # or just the objects for a specific class
        if classN is None:
            return classObjects
        else:
            return classObjects[classN]

    def IsTrained(self):
        return self.model is not None

    def LinearScale(self, value, low_lim, up_lim, feat_min, feat_max):
        # Map [feat_min, feat_max] linearly onto [low_lim, up_lim].
        # NOTE(review): feat_max == feat_min divides by zero -- confirm that
        # constant features cannot reach this point
        return low_lim + (up_lim-low_lim)*(value-feat_min) / (feat_max-feat_min)

    def LoadModel(self, model_file_name):
        '''
        Load model, bin labels and feature ranges written by SaveModel.

        Raises TypeError (after resetting those attributes to None) when the
        file does not contain the expected four-element tuple.
        '''
        try:
            import cPickle as pickle
        except ImportError:
            import pickle
        # NOTE: unpickling executes code embedded in the file; only load model
        # files from trusted sources.
        # Pickles are opened in binary mode, matching SaveModel.
        # NOTE(review): files written by earlier versions in text mode on
        # Windows may need to be re-saved.
        with open(model_file_name, 'rb') as fh:
            try:
                self.model, self.bin_labels, self.feat_min, self.feat_max = pickle.load(fh)
            except Exception:
                self.model = None
                self.bin_labels = None
                self.feat_min = None
                self.feat_max = None
                logging.error('The loaded model was not a support vector machines model')
                raise TypeError

    def ParameterGridSearch(self, callback = None, nValidation = 5):
        '''
        Grid search for the best C and gamma parameters for the RBF Kernel.
        The efficiency of the parameters is evaluated using nValidation-fold
        cross-validation of the training data.

        The search currently runs with a single worker (see the note in the
        body).  `callback` is accepted but not used by this method.

        Returns (best C, best gamma).
        '''
        from scikits.learn.grid_search import GridSearchCV
        from scikits.learn.metrics import precision_score
        from scikits.learn.cross_val import StratifiedKFold

        # XXX: program crashes with >1 worker when running cpa.py
        #      No crash when running from classifier.py. Why?
        #      Until that is understood the search is limited to one worker
        #      instead of multiprocessing.cpu_count() workers.
        n_workers = 1

        # Define the parameter ranges for C and gamma and perform a grid search for the optimal setting
        parameters = {'C': 2**np.arange(-5,11,2, dtype=float),
                      'gamma': 2**np.arange(3,-11,-2, dtype=float)}
        clf = GridSearchCV(SVC(kernel='rbf'), parameters, n_jobs=n_workers, score_func=precision_score)
        clf.fit(self.svm_train_values, self.svm_train_labels,
                cv=StratifiedKFold(self.svm_train_labels, nValidation))

        # Pick the best parameters as the ones with the maximum cross-validation rate
        bestParameters = max(clf.grid_scores_, key=lambda a: a[1])
        bestC = bestParameters[0]['C']
        bestGamma = bestParameters[0]['gamma']
        logging.info('Optimal values: C=%s g=%s rate=%s'%
                     (bestC, bestGamma, bestParameters[1]))
        return bestC, bestGamma

    def PerImageCounts(self, filter_name=None, cb=None):
        '''
        Classify the objects of every image and count the hits per class.

        Returns one list per image: the first element of the image key
        followed by the number of objects found for each class bin.  The
        classified object keys are also collected in self.perClassObjects.
        `cb`, when given, is called with the fraction of images processed.
        '''
        # Clear the current perClassObjects storage
        for classBin in self.classBins:
            self.perClassObjects[classBin.label] = []

        # Retrieve a data model instance
        dm = DataModel.getInstance()

        # Retrieve image keys and initialize variables
        imageKeys = dm.GetAllImageKeys(filter_name)
        imageAmount = float(len(imageKeys))
        perImageData = []

        # Process all images
        for k_index, imKey in enumerate(imageKeys):
            try:
                # Retrieve the keys of the objects in the current image
                obKeys = dm.GetObjectsFromImage(imKey)
            except Exception:
                raise ValueError('No such image: %s' % (imKey,))

            # Calculate the amount of hits for each of the classes in the current image
            classHits = {}
            objectCount = [imKey[0]]
            if obKeys:
                classObjects = self.FilterObjectsFromClassN(keys = [imKey])
                for clNum, classBin in enumerate(self.classBins):
                    # Get the objects from the image which belong to the selected class
                    classHits[classBin.label] = classObjects[float(clNum+1)]

                    # Store the total object count of this class for the current image
                    nrHits = len(classHits[classBin.label])
                    objectCount.append(nrHits)

                    # Store the objects for the current class and image grouped
                    # by class if any are found for this class in the selected image
                    if nrHits > 0:
                        self.perClassObjects[classBin.label] += classHits[classBin.label]
            else:
                # If there are no objects in the image, add zeros for all bins
                objectCount.extend([0] * len(self.classBins))

            # Store the results for the current image and update the callback
            # function if available
            perImageData.append(objectCount)
            if cb:
                cb(min(1, k_index/imageAmount))

        return perImageData

    def SaveModel(self, model_file_name, bin_labels):
        '''
        Pickle (model, bin_labels, feat_min, feat_max) to model_file_name.
        '''
        try:
            import cPickle as pickle
        except ImportError:
            import pickle
        # Binary mode keeps the pickle intact on every platform; the with
        # block closes the file even when dumping fails
        with open(model_file_name, 'wb') as fh:
            pickle.dump((self.model, bin_labels, self.feat_min, self.feat_max), fh)

    def ScaleData(self, values, low_lim=0.0, up_lim=1.0):
        '''
        Linearly scale the data to improve the efficiency of the classifier.

        `values` must be two-dimensional; every column j is scaled with
        self.feat_min[j] and self.feat_max[j].
        '''
        row, col = np.shape(values)
        scaled_data = np.zeros((row, col))
        for j in range(col):
            scaled_data[:,j] = self.LinearScale(values[:,j], low_lim, up_lim,
                                                self.feat_min[j], self.feat_max[j])
        return scaled_data

    def ShowModel(self):
        if self.model is not None:
            return 'Trained the following support vector machines classifier:\n%s' % self.model.named_steps['svc']
        else:
            return ''

    def Train(self, colNames, nValidation, labels, values, fout=None, callback = None):
        '''
        Train a SVM model using optimized C and Gamma parameters and a training set.

        colNames and fout are accepted but not used.  nValidation is passed to
        the grid search only when it is greater than one.
        '''
        # First make sure the supplied problem is in SVM format
        self.TranslateTrainingSet(labels, values)

        # Perform a grid-search to obtain the C and gamma parameters for C-SVM
        # classification
        if nValidation > 1:
            C, gamma = self.ParameterGridSearch(callback, nValidation)
        else:
            C, gamma = self.ParameterGridSearch(callback)

        # Train the model using the obtained C and gamma parameters to obtain the final classifier
        self.model = Pipeline([('anova', feature_selection.SelectPercentile(feature_selection.f_classif,
                                                                            percentile=self.percentile)),
                               ('svc', SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1))])
        self.model.fit(self.svm_train_values, self.svm_train_labels)

    def TranslateTrainingSet(self, labels, values):
        '''
        Translate and scale CPAnalyst Classifier training set labels and
        values to the SVM problem format.

        Stores the per-feature minima and maxima used for scaling in
        self.feat_min / self.feat_max and the result in
        self.svm_train_labels / self.svm_train_values.
        '''
        adata = np.nan_to_num(np.array(values))
        self.feat_min = adata.min(axis=0)
        self.feat_max = adata.max(axis=0)
        # NOTE(review): the minimum of the first feature is forced to zero;
        # the reason is not visible here -- confirm before changing
        self.feat_min[0] = 0.0
        values = self.ScaleData(adata)
        self.svm_train_labels, self.svm_train_values = self.ConvertToSVMFormat(labels, values)

    def UpdateBins(self, classBins):
        self.classBins = classBins

        # Reinitialize the objects per class storage
        self.perClassObjects = {}
        for classBin in self.classBins:
            self.perClassObjects[classBin.label] = []

    def XValidate(self, nPermutations):
        '''
        Estimate classification accuracy by repeated 5-group cross-validation.

        After a grid search for C and gamma on the entire training set, the
        training set is shuffled nPermutations times.  Each time it is split
        into five groups, and for every choice of four training groups the
        classifier is fitted and used to predict the remaining group.

        Returns a list with one entry per training object; each entry is the
        list of wrong labels predicted for that object.  Returns None when
        the training set could not be updated.

        Raises StopCalculating when the user cancels a progress dialog.
        '''
        # Make sure all data is available in the training set
        if not self.classifier.UpdateTrainingSet():
            return

        # Progress callback; it looks up `dlg` at call time, so it always
        # updates whichever dialog is currently assigned below
        def cb(frac):
            cont, skip = dlg.Update(int(frac * 100.), '%d%% Complete'%(frac * 100.))
            if not cont: # Cancel was pressed
                dlg.Destroy()
                raise StopCalculating()

        dlg = wx.ProgressDialog('Performing grid search for optimal parameters...', '0% Complete', 100,
                                self.classifier,
                                wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)

        # Define cross validation parameters
        totalGroups = 5
        trainingGroups = 4

        # Convert the training set into SVM format and search for optimal parameters
        # C and gamma using 5-fold cross-validation
        logging.info('Performing grid search for parameters C and gamma on entire training set...')
        self.TranslateTrainingSet(self.classifier.trainingSet.label_matrix,
                                  self.classifier.trainingSet.values)
        C, gamma = self.ParameterGridSearch(callback=cb)
        dlg.Destroy()
        # %s instead of %d for C: the search grid contains fractional values
        # of C, which %d would truncate
        logging.info('Grid search completed. Found optimal C=%s and gamma=%f.' % (C, gamma))

        # Create the classifier and initialize misclassification storage.
        # `tol` matches the keyword used by Train for the same estimator.
        classifier = Pipeline([('anova', feature_selection.SelectPercentile(feature_selection.f_classif,
                                                                            percentile=self.percentile)),
                               ('svc', SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1))])
        nObjects = self.classifier.trainingSet.label_matrix.shape[0]
        indices = np.arange(nObjects)
        misclassifications = [[] for i in range(nObjects)]

        # Create group combinations and arrays of all labels and values
        dt = ','.join('i'*trainingGroups)
        trainingTotalGroups = list(np.fromiter(combinations(range(totalGroups),trainingGroups),
                                               dtype=dt, count=-1))
        allLabels = np.array(self.svm_train_labels)
        allValues = np.array(self.svm_train_values)

        # For all permutations of the subsets train the classifier on 4 totalGroups and
        # classify the remaining group for a number of random subsets
        logging.info('Calculating average classification accuracy %d times over a ' \
                     '%0.1f%%/%0.1f%% cross-validation process' % \
                     (nPermutations, trainingGroups/float(totalGroups)*100, \
                     (1-trainingGroups/float(totalGroups))*100))
        dlg = wx.ProgressDialog('Calculating average cross-validation accuracy...', '0% Complete', 100,
                                self.classifier,
                                wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)
        nTrainingTotalGroups = len(trainingTotalGroups)
        nOperations = float(nPermutations * nTrainingTotalGroups)

        for per in range(nPermutations):
            # Split the shuffled training set into totalGroups subsets.
            # array_split tolerates sizes that do not divide evenly and only
            # uses integer boundaries (the previous float slice bound derived
            # from np.ceil is rejected by current numpy releases).
            np.random.shuffle(indices)
            subsets = np.array_split(indices, totalGroups)

            for index, group in enumerate(trainingTotalGroups):
                # Retrieve indices of all objects in the training set
                trainingSet = np.hstack([subsets[i] for i in range(totalGroups) if i in group])

                # Train a classifier on the subset
                classifier.fit(allValues[trainingSet], allLabels[trainingSet])

                # Predict the test set using the trained classifier
                testSet = np.hstack([subsets[i] for i in range(totalGroups) if i not in group])
                testLabels = classifier.predict(allValues[testSet])

                # Store all misclassifications
                expectedLabels = allLabels[testSet]
                for objIndex, predicted, expected in zip(testSet, testLabels, expectedLabels):
                    if predicted != expected:
                        misclassifications[objIndex].append(predicted)

                # Update progress dialog
                cb((nTrainingTotalGroups * per + index) / nOperations)

        # Calculate average classification accuracy
        dlg.Destroy()
        nWrong = sum(len(wrong) for wrong in misclassifications)
        logging.info('Average Classification Accuracy: %f%%' % \
                     ((1 - nWrong / float(nObjects * nPermutations)) * 100))

        return misclassifications
def XValidate(self, nPermutations):
    '''
    Estimate classification accuracy by repeated 5-group cross-validation.

    After obtaining C and gamma from ParameterGridSearch, the training
    set is shuffled nPermutations times.  Each time it is split into five
    groups, and for every choice of four training groups an
    ANOVA-selection + RBF-SVC pipeline is fitted and used to predict the
    remaining group.  Two wx progress dialogs report progress.

    Returns a list with one entry per training object; each entry is the
    list of wrong labels predicted for that object.  Returns None when
    self.classifier.UpdateTrainingSet() reports failure.

    Raises StopCalculating when the user cancels a progress dialog.
    '''
    # Make sure all data is available in the training set
    if not self.classifier.UpdateTrainingSet():
        return

    # Progress callback; it looks up `dlg` at call time, so it always
    # updates whichever dialog is currently assigned below
    def cb(frac):
        cont, skip = dlg.Update(int(frac * 100.), '%d%% Complete'%(frac * 100.))
        if not cont: # Cancel was pressed
            dlg.Destroy()
            raise StopCalculating()

    dlg = wx.ProgressDialog('Performing grid search for optimal parameters...', '0% Complete', 100,
                            self.classifier,
                            wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)

    # Define cross validation parameters
    totalGroups = 5
    trainingGroups = 4

    # Convert the training set into SVM format and search for the optimal
    # parameters C and gamma
    logging.info('Performing grid search for parameters C and gamma on entire training set...')
    self.TranslateTrainingSet(self.classifier.trainingSet.label_matrix,
                              self.classifier.trainingSet.values)
    C, gamma = self.ParameterGridSearch(callback=cb)
    dlg.Destroy()
    # %s instead of %d for C: %d would truncate a fractional C to an integer
    logging.info('Grid search completed. Found optimal C=%s and gamma=%f.' % (C, gamma))

    # Create the classifier and initialize misclassification storage.
    # `tol` is the keyword Train uses for the same estimator; the former
    # `eps` spelling is not accepted by SVC versions that take `tol`.
    classifier = Pipeline([('anova', feature_selection.SelectPercentile(feature_selection.f_classif,
                                                                        percentile=self.percentile)),
                           ('svc', SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1))])
    nObjects = self.classifier.trainingSet.label_matrix.shape[0]
    indices = np.arange(nObjects)
    misclassifications = [[] for i in range(nObjects)]

    # Create group combinations and arrays of all labels and values
    dt = ','.join('i'*trainingGroups)
    trainingTotalGroups = list(np.fromiter(combinations(range(totalGroups),trainingGroups),
                                           dtype=dt, count=-1))
    allLabels = np.array(self.svm_train_labels)
    allValues = np.array(self.svm_train_values)

    # For all permutations of the subsets train the classifier on 4
    # totalGroups and classify the remaining group
    logging.info('Calculating average classification accuracy %d times over a ' \
                 '%0.1f%%/%0.1f%% cross-validation process' % \
                 (nPermutations, trainingGroups/float(totalGroups)*100, \
                 (1-trainingGroups/float(totalGroups))*100))
    dlg = wx.ProgressDialog('Calculating average cross-validation accuracy...', '0% Complete', 100,
                            self.classifier,
                            wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)
    nTrainingTotalGroups = len(trainingTotalGroups)
    nOperations = float(nPermutations * nTrainingTotalGroups)

    for per in range(nPermutations):
        # Split the shuffled training set into totalGroups subsets.
        # array_split tolerates sizes that do not divide evenly and only
        # uses integer boundaries (the previous float slice bound derived
        # from np.ceil is rejected by current numpy releases).
        np.random.shuffle(indices)
        subsets = np.array_split(indices, totalGroups)

        for index, group in enumerate(trainingTotalGroups):
            # Retrieve indices of all objects in the training set
            trainingSet = np.hstack([subsets[i] for i in range(totalGroups) if i in group])

            # Train a classifier on the subset
            classifier.fit(allValues[trainingSet], allLabels[trainingSet])

            # Predict the test set using the trained classifier
            testSet = np.hstack([subsets[i] for i in range(totalGroups) if i not in group])
            testLabels = classifier.predict(allValues[testSet])

            # Store all misclassifications
            expectedLabels = allLabels[testSet]
            for objIndex, predicted, expected in zip(testSet, testLabels, expectedLabels):
                if predicted != expected:
                    misclassifications[objIndex].append(predicted)

            # Update progress dialog
            cb((nTrainingTotalGroups * per + index) / nOperations)

    # Calculate average classification accuracy
    dlg.Destroy()
    nWrong = sum(len(wrong) for wrong in misclassifications)
    logging.info('Average Classification Accuracy: %f%%' % \
                 ((1 - nWrong / float(nObjects * nPermutations)) * 100))

    return misclassifications
#categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

data = load_20newsgroups(subset='train', categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print("")

################################################################################
# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# Candidate values per pipeline parameter ("<step>__<parameter>").
# Uncommenting more entries gives better exploring power, but the processing
# time grows in a combinatorial way.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
#    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__analyzer__max_n': (1, 2),  # words or bigrams
#    'tfidf__use_idf': (True, False),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
#    'clf__n_iter': (10, 50, 80),
}
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

###############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(len(y), 2)  # cross-validation generator for model selection
ridge = BayesianRidge()
mem = Memory(cachedir='.', verbose=1)

# --- Ward agglomeration followed by BayesianRidge ---------------------------
A = grid_to_graph(n_x=size, n_y=size)
ward = WardAgglomeration(
    n_clusters=10,
    connectivity=A,
    memory=mem,
    n_components=1,
)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
parameters = dict(ward__n_clusters=[10, 20, 30])

# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, parameters, n_jobs=1)
clf.fit(X, y, cv=cv)  # set the best parameters

# Take the ridge coefficients of the best pipeline and map them back from
# cluster space to the original feature grid through the ward step.
coef_ = clf.best_estimator.named_steps['ridge'].coef_
coef_ = clf.best_estimator.named_steps['ward'].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# --- Anova univariate feature selection followed by BayesianRidge -----------
f_regression = mem.cache(feature_selection.f_regression)  # caching function
anova = feature_selection.SelectPercentile(f_regression)
clf = Pipeline([('anova', anova), ('ridge', ridge)])
parameters = dict(anova__percentile=[5, 10, 20])

# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, parameters)
# First half of the labels for training, second half for testing
y_train = dataset.target[:n_samples_total // 2]
y_test = dataset.target[n_samples_total // 2:]

# Build an analyzer that splits strings into sequences of 1 to 3 characters
# after applying the lower-casing preprocessor
analyzer = CharNGramAnalyzer(
    min_n=1, max_n=3, preprocessor=LowerCasePreprocessor())

# Build a vectorizer / classifier pipeline using the previous analyzer
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print(metrics.classification_report(
    y_test, y_predicted, class_names=dataset.target_names))

# Plot the confusion matrix
# First half of the labels for training, second half for testing
y_train = dataset.target[:n_samples_total // 2]
y_test = dataset.target[n_samples_total // 2:]

# Build an analyzer that splits strings into sequences of 1 to 3 characters
# after applying the lower-casing preprocessor
analyzer = CharNGramAnalyzer(
    min_n=1, max_n=3, preprocessor=LowerCasePreprocessor())

# Build a vectorizer / classifier pipeline using the previous analyzer;
# the tf-idf step is configured with use_idf=False
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print(metrics.classification_report(
    y_test, y_predicted, class_names=dataset.target_names))

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
def _read_text(path):
    # Close the handle deterministically instead of leaving it to the
    # garbage collector (one handle per document would otherwise stay open
    # for an unspecified time).
    with open(path) as handle:
        return handle.read()


# split the dataset in training and test set (first 3/4 for training):
n_samples_total = dataset.filenames.shape[0]
split = (n_samples_total * 3) // 4

docs_train = [_read_text(f) for f in dataset.filenames[:split]]
docs_test = [_read_text(f) for f in dataset.filenames[split:]]

y_train = dataset.target[:split]
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100000)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

# Candidate values per pipeline parameter ("<step>__<parameter>")
parameters = {
    'vect__analyzer__max_n': (1, 2),
    'vect__max_df': (.95, ),
}

# Fit the pipeline using grid search for the parameters; only the first 200
# training documents are used for the search itself
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train[:200], y_train[:200])

# Refit the best parameter set on the complete training set
clf = grid_search.best_estimator.fit(docs_train, y_train)
import numpy as np
import pylab as pl

from scikits.learn import linear_model, decomposition, datasets, cross_val
from scikits.learn.pipeline import Pipeline

# PCA feeding a logistic regression, chained into a single estimator
logistic = linear_model.LogisticRegression()
pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

digits = datasets.load_digits()
X_digits, y_digits = digits.data, digits.target

################################################################################
# Plot the PCA spectrum
pca.fit(X_digits)

pl.figure(1, figsize=(4, 3))
pl.clf()
pl.axes([.2, .2, .7, .7])
pl.plot(pca.explained_variance_, linewidth=2)
pl.axis('tight')
pl.xlabel('n_components')
pl.ylabel('explained_variance_')

################################################################################
# Prediction
scores = cross_val.cross_val_score(pipe, X_digits, y_digits, n_jobs=-1)
y = digits.target

# Throw away data, to be in the curse of dimension settings
y = y[:200]
X = digits.data[:200]
n_samples = len(y)
X = X.reshape((n_samples, -1))

# add 200 non-informative features
X = np.hstack((X, 2 * np.random.random((n_samples, 200))))

################################################################################
# Create a feature-selection transform and an instance of SVM that we
# combine together to have a full-blown estimator
transform = feature_selection.SelectPercentile(feature_selection.f_classif)
clf = Pipeline([('anova', transform), ('svc', svm.SVC())])

################################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = []
score_stds = []
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

for percentile in percentiles:
    # Reconfigure the 'anova' step of the pipeline in place
    clf._set_params(anova__percentile=percentile)
    # Compute the cross-validation scores (n_jobs=1)
    this_scores = cross_val.cross_val_score(clf, X, y, n_jobs=1)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

pl.errorbar(percentiles, score_means, np.array(score_stds))
class SupportVectorMachines(object):
    '''
    Class to define a complete support vector machine classifier calculation
    problem.

    The instance stores the scaled training data, the per-feature scaling
    limits (feat_min / feat_max) and, once Train() has run, the fitted
    ANOVA feature selection + RBF-kernel SVC pipeline in self.model.
    '''

    def __init__(self, classifier=None):
        '''
        classifier -- owning classifier object.  In this class it is only
                      used by CheckProgress/XValidate (training set access,
                      dialog parent, result display).
        '''
        logging.info('Initialized New Support Vector Machines Classifier')
        self.model = None
        self.classBins = []
        self.classifier = classifier
        # Percentile handed to SelectPercentile in Train() and XValidate()
        self.percentile = 90

        # Initialize the total object storage
        self.perClassObjects = {}
        self.feat_min, self.feat_max = None, None
        self.svm_train_labels, self.svm_train_values = None, None

    def CheckProgress(self):
        '''
        Cross-validate the training set and let the user choose how to view
        the result: as a confusion matrix or in a dimension reduction plot.
        Returns silently when the calculation is cancelled or the training
        set could not be updated.
        '''
        # Calculate cross-validation data
        nPermutations = 10
        try:
            misclassifications = self.XValidate(nPermutations)
        except StopCalculating:
            return
        # XValidate returns None when the training set could not be updated;
        # there is nothing to visualize in that case.
        if misclassifications is None:
            return

        def confusionMatrix():
            # Every object is predicted nPermutations times; the predictions
            # that are not stored as misclassifications were correct, so pad
            # with the true label.
            predicted = [
                misclassifications[i] +
                [val] * (nPermutations - len(misclassifications[i]))
                for i, val in enumerate(self.svm_train_labels)
            ]
            matrix, axes = self.ConfusionMatrix(self.svm_train_labels,
                                                predicted)
            self.classifier.ShowConfusionMatrix(matrix, axes)

        def dimensionReduction():
            # Initialize PCA/tSNE plot
            pca_main = dr.PlotMain(self.classifier,
                                   properties=Properties.getInstance(),
                                   loadData=False)
            trainingSet = self.classifier.trainingSet
            pca_main.set_data(
                trainingSet.values,
                dict(enumerate(trainingSet.get_object_keys())),
                np.int64(trainingSet.label_matrix > 0),
                trainingSet.labels,
                # Fraction of permutations in which each object was
                # misclassified
                np.array([
                    len(missed) / float(nPermutations)
                    for missed in misclassifications
                ]).round(2))
            pca_main.Show(True)

        # Ask how the user wants to visualize the cross-validation results
        # (either through a confusion matrix or visually in a dimension
        # reduction plot)
        visualizationChoiceBox(self.classifier, -1,
                               'Pick cross-validation visualization',
                               confusionMatrix, dimensionReduction)

    def ClearModel(self):
        ''' Clear all parameters related to the trained classifier. '''
        self.classBins = []
        self.model = None
        self.feat_min, self.feat_max = None, None
        self.svm_train_labels, self.svm_train_values = None, None

    def ComplexityTxt(self):
        ''' Return the label text for the complexity (fold count) input. '''
        return '# of cross-validations: '

    def ConfusionMatrix(self, actual=None, predicted=None):
        '''
        Build a confusion matrix indexed as [predicted class, actual class].

        actual    -- sequence of actual class indices (optional)
        predicted -- for every entry of `actual`, a sequence of predicted
                     class indices (optional)

        When either argument is None the matrix is computed from the class
        bins and self.perClassObjects instead.

        Returns (matrix, classLabels).
        '''
        # Retrieve the number of classes, their labels and initialize
        # the confusion matrix
        nClasses = len(self.classBins)
        confusionMatrix = np.zeros((nClasses, nClasses), np.int64)
        classLabels = [classBin.label for classBin in self.classBins]

        if actual is None or predicted is None:
            # For each of the objects used to train the classifier, check
            # what class it was predicted to have been by the classifier
            for actualClassNum, classBin in enumerate(self.classBins):
                actualClassObjects = classBin.GetObjectKeys()
                for predictedClassNum, label in enumerate(classLabels):
                    predictedObjects = self.perClassObjects[label]
                    confusionMatrix[predictedClassNum, actualClassNum] += sum(
                        1 for obj in actualClassObjects
                        if obj in predictedObjects)
        else:
            # Generate the confusion matrix for a list of actual and
            # predicted classes
            for i, actualClass in enumerate(actual):
                actualClass = int(actualClass)
                for predictedClass in predicted[i]:
                    confusionMatrix[int(predictedClass), actualClass] += 1

        return confusionMatrix, classLabels

    def ConvertToSVMFormat(self, labels, values):
        '''
        Convert the training set labels to class indices.

        labels -- iterable of per-object label rows (arrays); the position
                  of the positive (> 0) entry of a row is the class index
        values -- passed through unchanged

        Returns (class index array, values).
        '''
        # atleast_1d keeps a one-object training set a 1-d array; squeeze()
        # alone would collapse it to a 0-d scalar.
        labels = np.atleast_1d(
            np.array([np.nonzero(target > 0) for target in labels]).squeeze())
        return labels, values

    def CreatePerObjectClassTable(self, classes):
        '''
        Saves object keys and classes to a SQL table.

        classes -- not used by this implementation.

        Raises ValueError when "class_table" is not set in the properties.
        '''
        p = Properties.getInstance()
        if p.class_table is None:
            raise ValueError('"class_table" in properties file is not set.')

        index_cols = dbconnect.UniqueObjectClause()
        class_cols = dbconnect.UniqueObjectClause() + ', class, class_number'
        # The class column is sized to the longest class label plus one
        maxLabelLength = max([len(c.label) for c in self.classBins]) + 1
        class_col_defs = (dbconnect.object_key_defs() +
                          ', class VARCHAR (%d)' % (maxLabelLength) +
                          ', class_number INT')

        # Drop must be explicitly asked for Classifier.ScoreAll
        db = dbconnect.DBConnect.getInstance()
        db.execute('DROP TABLE IF EXISTS %s' % (p.class_table))
        db.execute('CREATE TABLE %s (%s)' % (p.class_table, class_col_defs))
        db.execute('CREATE INDEX idx_%s ON %s (%s)' %
                   (p.class_table, p.class_table, index_cols))

        # Number the classes in bin order (1-based), matching the numbering
        # FilterObjectsFromClassN uses; dictionary key order is arbitrary
        # and must not decide the class number.
        for clNum, classBin in enumerate(self.classBins):
            clName = classBin.label
            for obj in self.perClassObjects.get(clName, []):
                # NOTE(review): the statement is assembled by string
                # concatenation, so a class label containing a double quote
                # breaks it.  Switch to a parameterized query if db.execute
                # supports one.
                query = ''.join([
                    'INSERT INTO ', p.class_table, ' (', class_cols,
                    ') VALUES (',
                    str(obj[0]), ', ',
                    str(obj[1]), ', "', clName, '", ',
                    str(clNum + 1), ')'
                ])
                db.execute(query)

        if p.db_type.lower() == 'mysql':
            query = ''.join([
                'ALTER TABLE ', p.class_table, ' ORDER BY ', p.image_id,
                ' ASC, ', p.object_id, ' ASC'
            ])
            db.execute(query)
            db.Commit()

    def FilterObjectsFromClassN(self, classN=None, keys=None):
        '''
        Filter the input objects to output the keys of those in classN,
        using a defined SVM model classifier.

        classN -- 1-based class number, or None for all classes
        keys   -- a string handed to the database as-is, or a list that is
                  either a single image key or a list of object keys

        Returns the list of object keys predicted as classN, or a dictionary
        {class number: [object keys]} when classN is None.
        '''
        # Retrieve instance of the database connection
        db = dbconnect.DBConnect.getInstance()
        object_data = {}
        if isinstance(keys, str):
            object_data[0] = db.GetCellDataForClassifier(keys)
        elif keys is not None and keys != []:
            if len(keys) == len(dbconnect.image_key_columns()):
                # Retrieve instance of the data model and retrieve objects
                # in the requested image
                dm = DataModel.getInstance()
                obKeys = dm.GetObjectsFromImage(keys[0])
            else:
                obKeys = keys
            for key in obKeys:
                object_data[key] = db.GetCellDataForClassifier(key)

        # Group the object keys per class (class numbers are 1-based)
        classObjects = {}
        for index in range(1, len(self.classBins) + 1):
            classObjects[float(index)] = []

        # Without any object there is nothing to scale or predict
        if object_data:
            sorted_keys = sorted(object_data.keys())
            values_array = np.array([object_data[key] for key in sorted_keys])
            scaled_values = self.ScaleData(values_array)
            pred_labels = self.model.predict(scaled_values)
            for index, label in enumerate(pred_labels):
                classObjects[int(label) + 1].append(sorted_keys[index])

        # Return either a summary of all classes and their corresponding
        # objects or just the objects for a specific class
        if classN is None:
            return classObjects
        return classObjects[classN]

    def IsTrained(self):
        ''' Return True when a model has been trained or loaded. '''
        return self.model is not None

    def LinearScale(self, value, low_lim, up_lim, feat_min, feat_max):
        '''
        Map `value` linearly so that feat_min -> low_lim and
        feat_max -> up_lim.  `value` may be a scalar or an array.
        '''
        span = np.asarray(feat_max - feat_min, dtype=float)
        # A constant feature has a zero span; divide by 1 instead so its
        # training values scale to low_lim rather than to NaN/inf.
        span = np.where(span == 0, 1.0, span)
        return low_lim + (up_lim - low_lim) * (value - feat_min) / span

    @staticmethod
    def _PickleModule():
        ''' Return cPickle where it exists, the plain pickle module otherwise. '''
        try:
            import cPickle as pickle
        except ImportError:
            import pickle
        return pickle

    def LoadModel(self, model_file_name):
        '''
        Load (model, bin_labels, feat_min, feat_max) from a file written by
        SaveModel.

        Raises TypeError when the file does not contain such a model; all
        four attributes are reset to None in that case.
        '''
        pickle = self._PickleModule()
        # NOTE: unpickling executes code embedded in the file; only load
        # model files from a trusted source.
        with open(model_file_name, 'rb') as fh:
            payload = fh.read()
        try:
            try:
                state = pickle.loads(payload)
            except Exception:
                # Earlier versions wrote the pickle through a text-mode
                # file; on Windows that turns every newline into CRLF.
                state = pickle.loads(payload.replace(b'\r\n', b'\n'))
            self.model, self.bin_labels, self.feat_min, self.feat_max = state
        except Exception:
            self.model = None
            self.bin_labels = None
            self.feat_min = None
            self.feat_max = None
            logging.error(
                'The loaded model was not a support vector machines model')
            raise TypeError

    def ParameterGridSearch(self, callback=None, nValidation=5):
        '''
        Grid search for the best C and gamma parameters for the RBF Kernel.
        The efficiency of the parameters is evaluated using nValidation-fold
        cross-validation of the training data.

        callback    -- accepted for interface compatibility; not called here
        nValidation -- number of stratified folds

        The search currently runs with a single worker (see below).

        Returns (bestC, bestGamma).
        '''
        from scikits.learn.grid_search import GridSearchCV
        from scikits.learn.metrics import precision_score
        from scikits.learn.cross_val import StratifiedKFold

        # XXX: program crashes with >1 worker when running cpa.py
        #      No crash when running from classifier.py. Why?
        # Until that is understood the search is not parallelized
        # (multiprocessing.cpu_count() would give the worker count).
        n_workers = 1

        # Define the parameter ranges for C and gamma and perform a grid
        # search for the optimal setting
        parameters = {
            'C': 2**np.arange(-5, 11, 2, dtype=float),
            'gamma': 2**np.arange(3, -11, -2, dtype=float)
        }
        clf = GridSearchCV(SVC(kernel='rbf'),
                           parameters,
                           n_jobs=n_workers,
                           score_func=precision_score)
        clf.fit(self.svm_train_values,
                self.svm_train_labels,
                cv=StratifiedKFold(self.svm_train_labels, nValidation))

        # Pick the best parameters as the ones with the maximum
        # cross-validation rate; grid_scores_ entries are indexed as
        # (parameter dict, score, ...)
        bestParameters = max(clf.grid_scores_, key=lambda a: a[1])
        bestC = bestParameters[0]['C']
        bestGamma = bestParameters[0]['gamma']
        logging.info('Optimal values: C=%s g=%s rate=%s' %
                     (bestC, bestGamma, bestParameters[1]))

        return bestC, bestGamma

    def PerImageCounts(self, filter_name=None, cb=None):
        '''
        Classify the objects of every image and count the hits per class.

        filter_name -- forwarded to DataModel.GetAllImageKeys
        cb          -- optional progress callback, called with a fraction

        Returns a list with one row per image:
        [imKey[0], hits for class 1, hits for class 2, ...].
        Also refills self.perClassObjects with the classified object keys.

        Raises ValueError when the objects of an image cannot be retrieved.
        '''
        # Clear the current perClassObjects storage
        for classBin in self.classBins:
            self.perClassObjects[classBin.label] = []

        # Retrieve a data model instance
        dm = DataModel.getInstance()

        # Retrieve image keys and initialize variables
        imageKeys = dm.GetAllImageKeys(filter_name)
        imageAmount = float(len(imageKeys))
        perImageData = []

        # Process all images
        for k_index, imKey in enumerate(imageKeys):
            try:
                # Retrieve the keys of the objects in the current image
                obKeys = dm.GetObjectsFromImage(imKey)
            except Exception:
                raise ValueError('No such image: %s' % (imKey, ))

            # Calculate the amount of hits for each of the classes in the
            # current image
            objectCount = [imKey[0]]
            if obKeys:
                classObjects = self.FilterObjectsFromClassN(keys=[imKey])
                for clNum, classBin in enumerate(self.classBins):
                    # Get the objects from the image which belong to the
                    # selected class
                    hits = classObjects[float(clNum + 1)]
                    objectCount.append(len(hits))
                    # Store the objects grouped by class if any are found
                    # for this class in the selected image
                    if hits:
                        self.perClassObjects[classBin.label] += hits
            else:
                # If there are no objects in the image, add zeros for all
                # bins
                objectCount.extend([0] * len(self.classBins))

            # Store the results for the current image and update the
            # callback function if available
            perImageData.append(objectCount)
            if cb:
                cb(min(1, k_index / imageAmount))

        return perImageData

    def SaveModel(self, model_file_name, bin_labels):
        '''
        Pickle (model, bin_labels, feat_min, feat_max) to model_file_name.
        '''
        pickle = self._PickleModule()
        # Binary mode keeps the pickle byte-exact on every platform; `with`
        # closes the file even when dumping fails.
        with open(model_file_name, 'wb') as fh:
            pickle.dump(
                (self.model, bin_labels, self.feat_min, self.feat_max), fh)

    def ScaleData(self, values, low_lim=0.0, up_lim=1.0):
        '''
        Linearly scale the data to improve the efficiency of the classifier.

        values -- 2-d array (objects x features); column j is scaled with
                  self.feat_min[j] / self.feat_max[j]

        Returns a new float array of the same shape.
        '''
        row, col = np.shape(values)
        scaled_data = np.zeros((row, col))
        for j in range(col):
            scaled_data[:, j] = self.LinearScale(values[:, j], low_lim, up_lim,
                                                 self.feat_min[j],
                                                 self.feat_max[j])
        return scaled_data

    def ShowModel(self):
        ''' Return a description of the trained SVC, or '' when untrained. '''
        if self.model is not None:
            return ('Trained the following support vector machines '
                    'classifier:\n%s' % self.model.named_steps['svc'])
        return ''

    def Train(self, colNames, nValidation, labels, values, fout=None,
              callback=None):
        '''
        Train a SVM model using optimized C and Gamma parameters and a
        training set.

        colNames, fout -- not used by this implementation
        nValidation    -- number of cross-validation folds for the grid
                          search; values <= 1 fall back to its default
        labels, values -- training set, see TranslateTrainingSet
        callback       -- forwarded to ParameterGridSearch
        '''
        # First make sure the supplied problem is in SVM format
        self.TranslateTrainingSet(labels, values)

        # Perform a grid-search to obtain the C and gamma parameters for
        # C-SVM classification
        if nValidation > 1:
            C, gamma = self.ParameterGridSearch(callback, nValidation)
        else:
            C, gamma = self.ParameterGridSearch(callback)

        # Train the model using the obtained C and gamma parameters to
        # obtain the final classifier
        self.model = Pipeline([
            ('anova',
             feature_selection.SelectPercentile(feature_selection.f_classif,
                                                percentile=self.percentile)),
            ('svc', SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1))
        ])
        self.model.fit(self.svm_train_values, self.svm_train_labels)

    def TranslateTrainingSet(self, labels, values):
        '''
        Translate and scale CPAnalyst Classifier training set labels and
        values to the SVM problem format.

        Stores the result in self.svm_train_labels / self.svm_train_values
        and the scaling limits in self.feat_min / self.feat_max.
        '''
        adata = np.nan_to_num(np.array(values))
        self.feat_min = adata.min(axis=0)
        self.feat_max = adata.max(axis=0)
        # NOTE(review): the lower limit of the first feature is pinned to 0;
        # the reason is not visible in this class - confirm before changing.
        self.feat_min[0] = 0.0
        values = self.ScaleData(adata)
        self.svm_train_labels, self.svm_train_values = \
            self.ConvertToSVMFormat(labels, values)

    def UpdateBins(self, classBins):
        ''' Replace the class bins and reset the per-class object storage. '''
        self.classBins = classBins

        # Reinitialize the objects per class storage
        self.perClassObjects = {}
        for classBin in self.classBins:
            self.perClassObjects[classBin.label] = []

    def XValidate(self, nPermutations):
        '''
        Estimate the classification accuracy by repeated cross-validation.

        The training set is shuffled nPermutations times; each time it is
        split into 5 groups and every group is predicted once by a
        classifier trained on the other 4.

        Returns a list with, for every training object, the wrong labels
        that were predicted for it, or None when the training set could not
        be updated.  Raises StopCalculating when the user cancels.
        '''
        # Make sure all data is available in the training set
        if not self.classifier.UpdateTrainingSet():
            return

        # Initialize process dialog
        def cb(frac):
            # `dlg` is resolved when cb is called, so progress always goes
            # to the dialog that is current at that moment.
            cont, skip = dlg.Update(int(frac * 100.),
                                    '%d%% Complete' % (frac * 100.))
            if not cont:  # Cancel was pressed
                dlg.Destroy()
                raise StopCalculating()

        dialogStyle = (wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME
                       | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)
        dlg = wx.ProgressDialog(
            'Performing grid search for optimal parameters...', '0% Complete',
            100, self.classifier, dialogStyle)

        # Define cross validation parameters
        totalGroups = 5
        trainingGroups = 4

        # Convert the training set into SVM format and search for optimal
        # parameters C and gamma using 5-fold cross-validation
        logging.info(
            'Performing grid search for parameters C and gamma on entire training set...'
        )
        self.TranslateTrainingSet(self.classifier.trainingSet.label_matrix,
                                  self.classifier.trainingSet.values)
        C, gamma = self.ParameterGridSearch(callback=cb)
        dlg.Destroy()
        # %s rather than %d: C can be fractional (down to 2**-5)
        logging.info(
            'Grid search completed. Found optimal C=%s and gamma=%f.' %
            (C, gamma))

        # Create the classifier (same settings as Train) and initialize
        # misclassification storage
        classifier = Pipeline([
            ('anova',
             feature_selection.SelectPercentile(feature_selection.f_classif,
                                                percentile=self.percentile)),
            ('svc', SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1))
        ])
        nObjects = self.classifier.trainingSet.label_matrix.shape[0]
        indices = np.arange(nObjects)
        misclassifications = [[] for i in range(nObjects)]

        # Create group combinations and arrays of all labels and values
        trainingTotalGroups = list(
            combinations(range(totalGroups), trainingGroups))
        allLabels = np.array(self.svm_train_labels)
        allValues = np.array(self.svm_train_values)

        # For all permutations of the subsets train the classifier on 4
        # totalGroups and classify the remaining group for a number of
        # random subsets
        logging.info('Calculating average classification accuracy %d times over a ' \
                     '%0.1f%%/%0.1f%% cross-validation process' % \
                     (nPermutations, trainingGroups/float(totalGroups)*100, \
                     (1-trainingGroups/float(totalGroups))*100))
        dlg = wx.ProgressDialog(
            'Calculating average cross-validation accuracy...', '0% Complete',
            100, self.classifier, dialogStyle)
        nTrainingTotalGroups = len(trainingTotalGroups)
        nOperations = float(nPermutations * nTrainingTotalGroups)
        for per in range(nPermutations):
            # Split the shuffled training set into groups of (nearly) equal
            # size; array_split copes with sizes that are not a multiple of
            # the number of groups.
            np.random.shuffle(indices)
            subsets = np.array_split(indices, totalGroups)

            for index, group in enumerate(trainingTotalGroups):
                # Retrieve indices of all objects in the training set
                trainingSet = np.hstack([subsets[i] for i in group])

                # Train a classifier on the subset
                classifier.fit(allValues[trainingSet], allLabels[trainingSet])

                # Predict the test set using the trained classifier
                testSet = np.hstack([
                    subsets[i] for i in range(totalGroups) if i not in group
                ])
                testLabels = classifier.predict(allValues[testSet])

                # Store all misclassifications
                for objectIndex, predictedLabel, actualLabel in zip(
                        testSet, testLabels, allLabels[testSet]):
                    if predictedLabel != actualLabel:
                        misclassifications[objectIndex].append(predictedLabel)

                # Update progress dialog
                cb((nTrainingTotalGroups * per + index) / nOperations)

        # Calculate average classification accuracy
        dlg.Destroy()
        nMisclassified = sum(len(missed) for missed in misclassifications)
        logging.info('Average Classification Accuracy: %f%%' %
                     ((1 - nMisclassified /
                       float(nObjects * nPermutations)) * 100))

        return misclassifications
y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

###############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(len(y), 2)  # cross-validation generator for model selection
ridge = BayesianRidge()
mem = Memory(cachedir='.', verbose=1)

# --- Ward agglomeration followed by BayesianRidge ---------------------------
A = grid_to_graph(n_x=size, n_y=size)
ward = WardAgglomeration(
    n_clusters=10,
    connectivity=A,
    memory=mem,
    n_components=1,
)
clf = Pipeline([('ward', ward), ('ridge', ridge)])

# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, dict(ward__n_clusters=[10, 20, 30]), n_jobs=1)
clf.fit(X, y, cv=cv)  # set the best parameters

# Take the ridge coefficients of the best pipeline and map them back from
# cluster space to the original feature grid through the ward step.
coef_ = clf.best_estimator.named_steps['ridge'].coef_
coef_ = clf.best_estimator.named_steps['ward'].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# --- Anova univariate feature selection followed by BayesianRidge -----------
f_regression = mem.cache(feature_selection.f_regression)  # caching function
anova = feature_selection.SelectPercentile(f_regression)
clf = Pipeline([('anova', anova), ('ridge', ridge)])

# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, dict(anova__percentile=[5, 10, 20]))
clf.fit(X, y, cv=cv)  # set the best parameters
coef_ = clf.best_estimator.named_steps['ridge'].coef_
""" ================== Pipeline Anova SVM ================== Simple usage of Pipeline that runs successively a univariate feature selection with anova and then a C-SVM of the selected features. """ print __doc__ from scikits.learn import svm from scikits.learn.datasets import samples_generator from scikits.learn.feature_selection import SelectKBest, f_regression from scikits.learn.pipeline import Pipeline # import some data to play with X, y = samples_generator.make_classification( n_features=20, n_informative=3, n_redundant=0, n_classes=4, n_clusters_per_class=2) # ANOVA SVM-C # 1) anova filter, take 3 best ranked features anova_filter = SelectKBest(f_regression, k=3) # 2) svm clf = svm.SVC(kernel='linear') anova_svm = Pipeline([('anova', anova_filter), ('svm', clf)]) anova_svm.fit(X, y) anova_svm.predict(X)
""" ================== Pipeline Anova SVM ================== Simple usage of Pipeline that runs successively a univariate feature selection with anova and then a C-SVM of the selected features. """ print __doc__ from scikits.learn import svm from scikits.learn.datasets import samples_generator from scikits.learn.feature_selection import SelectKBest, f_regression from scikits.learn.pipeline import Pipeline # import some data to play with X, y = samples_generator.test_dataset_classif(k=5) # ANOVA SVM-C # 1) anova filter, take 5 best ranked features anova_filter = SelectKBest(f_regression, k=5) # 2) svm clf = svm.SVC(kernel='linear') anova_svm = Pipeline([('anova', anova_filter), ('svm', clf)]) anova_svm.fit(X, y) anova_svm.predict(X)
y = digits.target

# Throw away data, to be in the curse of dimension settings
y = y[:200]
X = digits.data[:200]
n_samples = len(y)
X = X.reshape((n_samples, -1))

# add 200 non-informative features
X = np.hstack((X, 2 * np.random.random((n_samples, 200))))

################################################################################
# Create a feature-selection transform and an instance of SVM that we
# combine together to have a full-blown estimator
transform = feature_selection.SelectPercentile(feature_selection.f_classif)
clf = Pipeline([('anova', transform), ('svc', svm.SVC())])

################################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = []
score_stds = []
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

for percentile in percentiles:
    # Reconfigure the 'anova' step of the pipeline in place
    clf._set_params(anova__percentile=percentile)
    # Compute the cross-validation scores (n_jobs=1)
    this_scores = cross_val.cross_val_score(clf, X, y, n_jobs=1)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

pl.errorbar(percentiles, score_means, np.array(score_stds))
# time() is called below but not imported anywhere in the visible script;
# importing it here is harmless if an earlier line already does so.
from time import time

import numpy as np
import matplotlib.pyplot as pl

from scikits.learn.decomposition import RandomizedPCA
from scikits.learn.svm import LinearSVC
from scikits.learn.pipeline import Pipeline
from scikits.learn.grid_search import GridSearchCV
from scikits.learn.metrics import classification_report

from preprocess import InfinitivesExtractor, load_data

# Data attributes
targets = [0, 1, 2]
target_names = ["covered", "no alternance", "uncovered"]
target_colors = "rgb"

# Classification settings: feature extraction followed by a linear SVM
pipeline = Pipeline([
    ('extr', InfinitivesExtractor()),
    ('svc', LinearSVC(multi_class=True)),
])

# Candidate values per pipeline parameter ("<step>__<parameter>")
parameters = {
    'extr__count': (True, False),
    'extr__n': (3, 4, 5, 6),
    'svc__C': (1e-1, 1e-2, 1e9),
}

grid_search = GridSearchCV(pipeline, parameters)

print("Loading data...")
X, y = load_data()

print("Searching for the best model...")
t0 = time()
grid_search.fit(X, y)
print("Done in %0.3f" % (time() - t0))
print("Best score: %0.3f" % grid_search.best_score)

clf = grid_search.best_estimator