def train_lrpipe(trainX, trainY, params):
    """ trains LogisiticRegression model with params
        logreg_C specified by params 
        """
    lrpipe = Pipeline([('logreg', LogisticRegression(penalty="l1", C=1))])
    lrpipe = lrpipe.fit(trainX, trainY, **params)
    return lrpipe
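# A minimal, hypothetical usage sketch for train_lrpipe (not from the original
# source). Toy data is generated with numpy; whether the parameter key is
# spelled 'logreg_C' or 'logreg__C' depends on the scikits.learn version the
# snippet above targets, so adjust accordingly.
import numpy as np

rng = np.random.RandomState(0)
X_toy = rng.rand(20, 5)                      # 20 samples, 5 features
y_toy = (X_toy[:, 0] > 0.5).astype(int)      # simple binary labels

lr_model = train_lrpipe(X_toy, y_toy, {'logreg__C': 10.0})
print lr_model.predict(X_toy[:3])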
    def Train(self,
              colNames,
              nValidation,
              labels,
              values,
              fout=None,
              callback=None):
        '''
        Train an SVM model using optimized C and gamma parameters and a training set.
        '''
        # First make sure the supplied problem is in SVM format
        self.TranslateTrainingSet(labels, values)

        # Perform a grid-search to obtain the C and gamma parameters for C-SVM
        # classification
        if nValidation > 1:
            C, gamma = self.ParameterGridSearch(callback, nValidation)
        else:
            C, gamma = self.ParameterGridSearch(callback)

        # Train the model using the obtained C and gamma parameters to obtain the final classifier
        self.model = Pipeline([
            ('anova',
             feature_selection.SelectPercentile(feature_selection.f_classif,
                                                percentile=self.percentile)),
            ('svc', SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1))
        ])
        self.model.fit(self.svm_train_values, self.svm_train_labels)
def train_svpipe(trainX, trainY, params):
    """ trains LogisiticRegression model with params
        logreg_C specified by params 
        """
    svpipe = Pipeline([('rbfsvm', SVC())])
    svpipe = svpipe.fit(trainX, trainY, **params)
    return svpipe
Example #4
def test_countvectorizer_custom_vocabulary_pipeline():
    what_we_like = ["pizza", "beer"]
    pipe = Pipeline([
        ('count', CountVectorizer(vocabulary=what_we_like)),
        ('tfidf', TfidfTransformer())])
    X = pipe.fit_transform(ALL_FOOD_DOCS)
    assert_equal(set(pipe.named_steps['count'].vocabulary), set(what_we_like))
    assert_equal(X.shape[1], len(what_we_like))
def train_svpipe(trainX, trainY,  params ):
    """ trains LogisiticRegression model with params
        logreg_C specified by params 
        """
    svpipe = Pipeline([
        ('rbfsvm',  SVC()  )
        ])
    svpipe = svpipe.fit(trainX,trainY, **params)
    return svpipe
Example #6
    def train(cls, labeled_featuresets):
        train, target_labels = zip(*labeled_featuresets)
        target_names = sorted(set(target_labels))
        targets = [target_names.index(l) for l in target_labels]

        pipeline = Pipeline([("bow", BagOfWordsVectorizer()), ("clf", LinearSVC(C=1000))])

        pipeline.fit(train, targets)
        return cls(pipeline, target_names)
def train_lrpipe(trainX, trainY,  params ):
    """ trains LogisiticRegression model with params
        logreg_C specified by params 
        """
    lrpipe = Pipeline([
        ('logreg',  LogisticRegression(penalty="l1", C=1)  )
        ])
    lrpipe = lrpipe.fit(trainX,trainY, **params)
    return lrpipe
Example #8
	def train(cls, labeled_featuresets):
		train, target_labels = zip(*labeled_featuresets)
		target_names = sorted(set(target_labels))
		targets = [target_names.index(l) for l in target_labels]
		
		pipeline = Pipeline([
			('bow', BagOfWordsVectorizer()),
			('clf', LinearSVC(C=1000)),
		])
		
		pipeline.fit(train, targets)
		return cls(pipeline, target_names)
Example #9
def test_dense_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS
    # simulate iterables
    train_data = iter(data[1:-1])
    test_data = iter([data[0], data[-1]])

    # label junk food as -1, the others as +1
    y = np.ones(len(data))
    y[:6] = -1
    y_train = y[1:-1]
    y_test = np.array([y[0], y[-1]])

    pipeline = Pipeline([('vect', CountVectorizer()), ('svc', LinearSVC())])

    parameters = {'vect__analyzer__max_n': (1, 2), 'svc__loss': ('l1', 'l2')}

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # cross-validation doesn't work if the length of the data is not known,
    # hence use lists instead of iterators
    pred = grid_search.fit(list(train_data), y_train).predict(list(test_data))
    assert_array_equal(pred, y_test)

    # on this toy dataset every candidate converges to a 100% accuracy model,
    # so the grid search simply keeps the first of the tied candidates as the
    # best estimator
    assert_equal(grid_search.best_score, 1.0)
    best_vectorizer = grid_search.best_estimator.named_steps['vect']
    assert_equal(best_vectorizer.analyzer.max_n, 1)
Example #10
def get_clf(n=3, binarize=True):
    steps = [('vectorizer',
              CountVectorizer(
                  CharNGramAnalyzer(min_n=1,
                                    max_n=n,
                                    preprocessor=SimplePreprocessor())))]
    if binarize:
        steps.append(('binarizer', Binarizer(copy=False)))
        steps.append(('clf', naive_bayes.BernoulliNB()))
    else:
        steps.append(('clf', naive_bayes.MultinomialNB()))

    return Pipeline(steps)
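# A hypothetical usage sketch for get_clf (not from the original source).
# CharNGramAnalyzer and SimplePreprocessor come from the surrounding project /
# old scikits.learn API, so this only illustrates the intended call pattern on
# a tiny corpus of raw strings.
docs_toy = ["spam spam spam", "ham and eggs", "more spam", "just ham"]
labels_toy = [1, 0, 1, 0]

char_clf = get_clf(n=3, binarize=True)       # char 1-3 gram + Bernoulli NB
char_clf.fit(docs_toy, labels_toy)
print char_clf.predict(["spam or ham"])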
def do_grid_search(X, Y, gs_params):
    """ Given data (X,Y) will perform a grid search on g_params
        for a LogisticRegression called logreg
        """
    lrpipe = Pipeline([('logreg', LogisticRegression())])
    gs = GridSearchCV(lrpipe, gs_params, n_jobs=-1)
    #print gs
    gs = gs.fit(X, Y)

    best_parameters, score = max(gs.grid_scores_, key=lambda x: x[1])
    logger.info("best_parameters: " + str(best_parameters))
    logger.info("expected score: " + str(score))

    return best_parameters
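# A hypothetical parameter grid for do_grid_search (not from the original
# source). The keys address the 'logreg' step of the internal pipeline,
# following the same "<step>__<param>" convention as the SVC variant below;
# X_train / y_train stand in for the caller's data.
lr_grid = {
    'logreg__C': (0.1, 1.0, 10.0, 100.0),
    'logreg__penalty': ('l1', 'l2'),
}
# best_params = do_grid_search(X_train, y_train, lr_grid)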
    def Train(self, colNames, nValidation, labels, values, fout=None, callback = None):
        '''
        Train an SVM model using optimized C and gamma parameters and a training set.
        '''
        # First make sure the supplied problem is in SVM format
        self.TranslateTrainingSet(labels, values)

        # Perform a grid-search to obtain the C and gamma parameters for C-SVM
        # classification
        if nValidation > 1:
            C, gamma = self.ParameterGridSearch(callback, nValidation)
        else:
            C, gamma = self.ParameterGridSearch(callback)

        # Train the model using the obtained C and gamma parameters to obtain the final classifier
        self.model = Pipeline([('anova', feature_selection.SelectPercentile(feature_selection.f_classif,
                                                                            percentile=self.percentile)),
                               ('svc', SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1))])
        self.model.fit(self.svm_train_values, self.svm_train_labels)
def do_grid_search(X, Y, gs_params=None):
    """ Given data (X,Y) will perform a grid search on g_params
        for a LogisticRegression called logreg
        """
    svpipe = Pipeline([('rbfsvm', SVC())])
    if not gs_params:
        gs_params = {
            'rbfsvm__C': (1.5, 2, 5, 10, 20),
            'rbfsvm__gamma': (0.01, 0.1, 0.3, 0.6, 1, 1.5, 2, 5),
        }
    gs = GridSearchCV(svpipe, gs_params, n_jobs=-1)
    #print gs
    gs = gs.fit(X, Y)

    best_parameters, score = max(gs.grid_scores_, key=lambda x: x[1])
    logger.info("best_parameters: " + str(best_parameters))
    logger.info("expected score: " + str(score))

    return best_parameters
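# A hypothetical end-to-end sketch (not from the original source): grid-search
# the RBF-SVM pipeline on toy data, then retrain via train_svpipe, which
# forwards the best parameters to Pipeline.fit(**params) under the old
# scikits.learn behaviour the functions above rely on. Assumes a module-level
# `logger` is configured, as the functions above do.
import numpy as np

rng = np.random.RandomState(42)
X_toy = rng.rand(40, 4)
y_toy = (X_toy[:, 0] + X_toy[:, 1] > 1.0).astype(int)

best_params = do_grid_search(X_toy, y_toy)          # default C/gamma grid
svm_model = train_svpipe(X_toy, y_toy, best_params)
print svm_model.predict(X_toy[:5])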
Example #14
from scikits.learn.pipeline import Pipeline

################################################################################
# Import some data to play with
digits = datasets.load_digits()
y = digits.target
n_samples = len(y)
X = digits.data.reshape((n_samples, -1))

################################################################################
# Create a feature-selection transform and an instance of SVM that we
# combine to form a full-blown estimator

transform = feature_selection.SelectPercentile(feature_selection.f_classif)

clf = Pipeline([('anova', transform), ('svc', svm.SVC())])

################################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = list()
score_stds = list()
percentiles = (10, 20, 30, 40, 50, 60, 70, 80, 90, 100)

for percentile in percentiles:
    transform._set_params(percentile=percentile)
    this_scores = cross_val.cross_val_score(clf, X, y)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

pl.errorbar(percentiles, score_means, np.array(score_stds))
    def XValidate(self, nPermutations):
        # Make sure all data is available in the training set
        if not self.classifier.UpdateTrainingSet():
            return

        # Initialize process dialog
        def cb(frac):
            cont, skip = dlg.Update(int(frac * 100.),
                                    '%d%% Complete' % (frac * 100.))
            if not cont:  # Cancel was pressed
                dlg.Destroy()
                raise StopCalculating()

        dlg = wx.ProgressDialog(
            'Performing grid search for optimal parameters...', '0% Complete',
            100, self.classifier, wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME
            | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)

        # Define cross validation parameters
        totalGroups = 5
        trainingGroups = 4

        # Convert the training set into SVM format and search for optimal parameters
        # C and gamma using 5-fold cross-validation
        logging.info(
            'Performing grid search for parameters C and gamma on entire training set...'
        )
        self.TranslateTrainingSet(self.classifier.trainingSet.label_matrix,
                                  self.classifier.trainingSet.values)
        C, gamma = self.ParameterGridSearch(callback=cb)
        dlg.Destroy()
        logging.info(
            'Grid search completed. Found optimal C=%d and gamma=%f.' %
            (C, gamma))

        # Create the classifier and initialize misclassification storage
        classifier = Pipeline([
            ('anova',
             feature_selection.SelectPercentile(feature_selection.f_classif,
                                                percentile=self.percentile)),
            ('svc', SVC(kernel='rbf', C=C, gamma=gamma, eps=0.1))
        ])
        nObjects = self.classifier.trainingSet.label_matrix.shape[0]
        subsetSize = np.ceil(nObjects / float(totalGroups))
        indices = np.arange(nObjects)
        misclassifications = [[] for i in range(nObjects)]

        # Create group combinations and arrays of all labels and values
        dt = ','.join('i' * trainingGroups)
        trainingTotalGroups = list(
            np.fromiter(combinations(range(totalGroups), trainingGroups),
                        dtype=dt,
                        count=-1))
        #trainingTotalGroups = list(combinations(range(totalGroups), trainingGroups))
        allLabels = np.array(self.svm_train_labels)
        allValues = np.array(self.svm_train_values)

        # For all permutations of the subsets train the classifier on 4 totalGroups and
        # classify the remaining group for a number of random subsets
        logging.info('Calculating average classification accuracy %d times over a ' \
                     '%0.1f%%/%0.1f%% cross-validation process' % \
                     (nPermutations, trainingGroups/float(totalGroups)*100, \
                     (1-trainingGroups/float(totalGroups))*100))
        dlg = wx.ProgressDialog(
            'Calculating average cross-validation accuracy...', '0% Complete',
            100, self.classifier, wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME
            | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)
        nTrainingTotalGroups = len(trainingTotalGroups)
        nOperations = float(nPermutations * nTrainingTotalGroups)
        for per in range(nPermutations):
            # Split the training set into subsets
            np.random.shuffle(indices)
            lastGroupStart = (totalGroups - 1) * subsetSize
            subsets = np.hsplit(indices[0:lastGroupStart], (totalGroups - 1))
            subsets.append(indices[lastGroupStart:], )

            for index, group in enumerate(trainingTotalGroups):
                # Retrieve indices of all objects in the training set
                trainingSet = np.hstack(
                    [subsets[i] for i in range(totalGroups) if i in group])

                # Train a classifier on the subset
                classifier.fit(allValues[trainingSet], allLabels[trainingSet])

                # Predict the test set using the trained classifier
                testSet = np.hstack(
                    [subsets[i] for i in range(totalGroups) if i not in group])
                testLabels = classifier.predict(allValues[testSet])

                # Store all misclassifications
                [misclassifications[testSet[i]].append(testLabels[i]) \
                    for i in range(len(testLabels)) \
                    if testLabels[i] != allLabels[testSet][i]]

                # Update progress dialog
                cb((nTrainingTotalGroups * per + index) / nOperations)

        # Calculate average classification accuracy
        dlg.Destroy()
        logging.info('Average Classification Accuracy: %f%%' % \
                     ((1-len([item for sublist in misclassifications for item in sublist]) /\
                     float(nObjects * nPermutations))*100))

        return misclassifications
class SupportVectorMachines(object):
    '''
    Class to define a complete support vector machine classifier calculation problem. 
    '''    
    def __init__(self, classifier = None):
        logging.info('Initialized New Support Vector Machines Classifier')
        self.model = None 
        self.classBins = []
        self.classifier = classifier
        self.percentile = 90

        # Initialize the total object storage
        self.perClassObjects = {}
        self.feat_min, self.feat_max = None, None
        self.svm_train_labels, self.svm_train_values = None, None

    def CheckProgress(self):
        # Calculate cross-validation data
        nPermutations = 10
        try:
            misclassifications = self.XValidate(nPermutations)
        except StopCalculating:
            return

        def confusionMatrix():
            # Open confusion matrix
            confusionMatrix, axes = self.ConfusionMatrix(
                self.svm_train_labels,
                [misclassifications[i]+[val]*(nPermutations-len(misclassifications[i]))
                 for i, val in enumerate(self.svm_train_labels)]
            )
            self.classifier.ShowConfusionMatrix(confusionMatrix, axes)

        def dimensionReduction():
            # Initialize PCA/tSNE plot
            pca_main = dr.PlotMain(self.classifier, properties = Properties.getInstance(), loadData = False)
            pca_main.set_data(self.classifier.trainingSet.values,
                              dict([(index, object) for index, object in 
                                    enumerate(self.classifier.trainingSet.get_object_keys())]),
                              np.int64(self.classifier.trainingSet.label_matrix > 0),
                              self.classifier.trainingSet.labels,
                              np.array([len(misclassifications[i])/float(nPermutations) for i in xrange(len(misclassifications))]).round(2))
            pca_main.Show(True)

        # Ask how the user wants to visualize the cross-validation results (either through
        # a confusion matrix or visually in a dimensionality reduction plot)
        visualizationChoiceBox(self.classifier, -1, 'Pick cross-validation visualization', confusionMatrix, dimensionReduction)

    def ClearModel(self):
        # Clear all parameters related to the trained classifier
        self.classBins = []
        self.model = None
        self.feat_min, self.feat_max = None, None
        self.svm_train_labels, self.svm_train_values = None, None

    def ComplexityTxt(self):
        return '# of cross-validations: '

    def ConfusionMatrix(self, actual = None, predicted = None):
        # Retrieve the number of classes, their labels and initialize
        # the confusion matrix
        nClasses = len(self.classBins)
        confusionMatrix = np.zeros((nClasses, nClasses), np.int64)
        classLabels = [bin.label for bin in self.classBins]

        # For each of the objects used to train the classifier, check what class
        # it was predicted to have been by the classifier
        if actual is None or predicted is None:
            for actualClassNum, actualClassObjects in \
                enumerate([bin.GetObjectKeys() for bin in self.classBins]):
                for predictedLabel in [(classLabels[i], i) for i in range(nClasses)]:
                    confusionMatrix[predictedLabel[1], actualClassNum] += \
                        len([obj for obj in actualClassObjects if \
                        obj in self.perClassObjects[predictedLabel[0]]])
        else:
            # Generate the confusion matrix for a list of actual and predicted classes
            for i, actualClass in enumerate(actual):
                # Count the number of correct classifications and store them in the
                # confusion matrix
                actualClass = np.int(actualClass)

                # Count all misclassifications
                for j in predicted[i]:
                    confusionMatrix[np.int(j), actualClass] += 1

        return confusionMatrix, classLabels

    def ConvertToSVMFormat(self, labels, values):
        '''
        Convert the training set data to SVM format
        Format: label feature_1:value feature_2:value feature_3:value ...
        '''
        labels = np.array([np.nonzero(target > 0) for target in labels]).squeeze()
        return labels, values

    def CreatePerObjectClassTable(self, classes):
        '''
    	Saves object keys and classes to a SQL table
    	'''
        p = Properties.getInstance()
        if p.class_table is None:
            raise ValueError('"class_table" in properties file is not set.')

        index_cols = dbconnect.UniqueObjectClause()
        class_cols = dbconnect.UniqueObjectClause() + ', class, class_number'
        class_col_defs = dbconnect.object_key_defs() + ', class VARCHAR (%d)'%(max([len(c.label) for c in self.classBins])+1) + ', class_number INT'

        # Drop must be explicitly asked for Classifier.ScoreAll
        db = dbconnect.DBConnect.getInstance()
        db.execute('DROP TABLE IF EXISTS %s'%(p.class_table))
        db.execute('CREATE TABLE %s (%s)'%(p.class_table, class_col_defs))
        db.execute('CREATE INDEX idx_%s ON %s (%s)'%(p.class_table, p.class_table, index_cols))
        for clNum, clName in enumerate(self.perClassObjects.keys()):
            for obj in self.perClassObjects[clName]:
                query = ''.join(['INSERT INTO ',p.class_table,' (',class_cols,') VALUES (',str(obj[0]),', ',str(obj[1]),', "',clName,'", ',str(clNum+1),')'])
                db.execute(query)

        if p.db_type.lower() == 'mysql':
            query = ''.join(['ALTER TABLE ',p.class_table,' ORDER BY ',p.image_id,' ASC, ',p.object_id,' ASC'])
            db.execute(query)
            db.Commit()

    def FilterObjectsFromClassN(self, classN = None, keys = None):
        '''
    	Filter the input objects to output the keys of those in classN, 
    	using a defined SVM model classifier.
    	'''
        # Retrieve instance of the database connection
        db = dbconnect.DBConnect.getInstance()
        object_data = {}
        if isinstance(keys, str):
            object_data[0] = db.GetCellDataForClassifier(keys)
        elif keys != []:
            if len(keys) == len(dbconnect.image_key_columns()):
                # Retrieve instance of the data model and retrieve objects in the requested image
                dm = DataModel.getInstance()
                obKeys = dm.GetObjectsFromImage(keys[0])
            else:
                obKeys = keys
            for key in obKeys:
                object_data[key] = db.GetCellDataForClassifier(key)

        sorted_keys = sorted(object_data.keys())
        values_array = np.array([object_data[key] for key in sorted_keys])
        scaled_values = self.ScaleData(values_array)
        pred_labels = self.model.predict(scaled_values)

        # Group the object keys per class
        classObjects = {}
        for index in range(1, len(self.classBins)+1):
            classObjects[float(index)] = []
        for index, label in enumerate(pred_labels):
            classObjects[np.int(label)+1].append(sorted_keys[index])

        # Return either a summary of all classes and their corresponding objects
        # or just the objects for a specific class
        if classN is None:
            return classObjects
        else:
            return classObjects[classN]

    def IsTrained(self):
        return self.model is not None

    def LinearScale(self, value, low_lim, up_lim, feat_min, feat_max):
        return low_lim + (up_lim-low_lim)*(value-feat_min) / (feat_max-feat_min)

    def LoadModel(self, model_file_name):
        import cPickle
        fh = open(model_file_name, 'r')
        try:
            self.model, self.bin_labels, self.feat_min, self.feat_max = cPickle.load(fh)
        except:
            self.model = None
            self.bin_labels = None
            self.feat_min = None
            self.feat_max = None
            logging.error('The loaded model was not a support vector machines model')
            raise TypeError
        finally:
            fh.close()

    def ParameterGridSearch(self, callback = None, nValidation = 5):
        '''
        Grid search for the best C and gamma parameters for the RBF Kernel.
        The efficiency of the parameters is evaluated using nValidation-fold
        cross-validation of the training data.
    
        As this process is time consuming and parallelizable, a number of
        threads equal to the number of cores in the computer is used for the
        calculations
        '''
        from scikits.learn.grid_search import GridSearchCV
        from scikits.learn.metrics import precision_score
        from scikits.learn.cross_val import StratifiedKFold
        # 
        # XXX: program crashes with >1 worker when running cpa.py
        #      No crash when running from classifier.py. Why?
        #
        n_workers = 1
        #try:
            #from multiprocessing import cpu_count
            #n_workers = cpu_count()
        #except:
            #n_workers = 1

        # Define the parameter ranges for C and gamma and perform a grid search for the optimal setting
        parameters = {'C': 2**np.arange(-5,11,2, dtype=float),
                      'gamma': 2**np.arange(3,-11,-2, dtype=float)}                
        clf = GridSearchCV(SVC(kernel='rbf'), parameters, n_jobs=n_workers, score_func=precision_score)
        clf.fit(self.svm_train_values, self.svm_train_labels, 
                cv=StratifiedKFold(self.svm_train_labels, nValidation))

        # Pick the best parameters as the ones with the maximum cross-validation rate
        bestParameters = max(clf.grid_scores_, key=lambda a: a[1])
        bestC = bestParameters[0]['C']
        bestGamma = bestParameters[0]['gamma']
        logging.info('Optimal values: C=%s g=%s rate=%s'%
                     (bestC, bestGamma, bestParameters[1]))
        return bestC, bestGamma

    def PerImageCounts(self, filter_name=None, cb=None):
        # Clear the current perClassObjects storage
        for bin in self.classBins:
            self.perClassObjects[bin.label] = []

        # Retrieve a data model instance
        dm = DataModel.getInstance()

        # Retrieve image keys and initialize variables
        imageKeys = dm.GetAllImageKeys(filter_name)
        imageAmount = float(len(imageKeys))
        perImageData = []

        # Process all images
        for k_index, imKey in enumerate(imageKeys):
            try:
                # Retrieve the keys of the objects in the current image
                obKeys = dm.GetObjectsFromImage(imKey)
            except:
                raise ValueError('No such image: %s' % (imKey,))

            # Calculate the amount of hits for each of the classes in the current image
            classHits = {}
            objectCount = [imKey[0]]
            if obKeys:
                classObjects = self.FilterObjectsFromClassN(keys = [imKey])
                for clNum, bin in enumerate(self.classBins):
                    # Get the objects from the image which belong to the selected class
                    classHits[bin.label] = classObjects[float(clNum+1)]

                    # Store the total object count of this class for the current image
                    nrHits = len(classHits[bin.label])
                    objectCount.append(nrHits)

                    # Store the objects for the current class and image grouped
                    # by class if any are found for this class in the selected image
                    if nrHits > 0:
                        self.perClassObjects[bin.label] += classHits[bin.label]
            else:
                # If there are no objects in the image, add zeros for all bins
                [objectCount.append(0) for bin in self.classBins]

            # Store the results for the current image and update the callback
            # function if available
            perImageData.append(objectCount)
            if cb:
                cb(min(1, k_index/imageAmount))

        return perImageData

    def SaveModel(self, model_file_name, bin_labels):       
        import cPickle
        fh = open(model_file_name, 'w')
        cPickle.dump((self.model, bin_labels, self.feat_min, self.feat_max), fh)
        fh.close()

    def ScaleData(self, values, low_lim=0.0, up_lim=1.0):
        '''
    	Linearly scale the data to improve the efficiency of the classifier
    	'''
        row, col = np.shape(values)
        scaled_data = np.zeros((row, col))
        for j in xrange(col):
            scaled_data[:,j] = self.LinearScale(values[:,j], low_lim, up_lim,
                                                self.feat_min[j], self.feat_max[j])
        return scaled_data

    def ShowModel(self):
        if self.model is not None:
            return 'Trained the following support vector machines classifier:\n%s' % self.model.named_steps['svc']
        else:
            return ''

    def Train(self, colNames, nValidation, labels, values, fout=None, callback = None):
        '''
        Train an SVM model using optimized C and gamma parameters and a training set.
        '''
        # First make sure the supplied problem is in SVM format
        self.TranslateTrainingSet(labels, values)

        # Perform a grid-search to obtain the C and gamma parameters for C-SVM
        # classification
        if nValidation > 1:
            C, gamma = self.ParameterGridSearch(callback, nValidation)
        else:
            C, gamma = self.ParameterGridSearch(callback)

        # Train the model using the obtained C and gamma parameters to obtain the final classifier
        self.model = Pipeline([('anova', feature_selection.SelectPercentile(feature_selection.f_classif,
                                                                            percentile=self.percentile)),
                               ('svc', SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1))])
        self.model.fit(self.svm_train_values, self.svm_train_labels)

    def TranslateTrainingSet(self, labels, values):
        '''
    	Translate and scale CPAnalyst Classifier training set labels and values
    	to the SVM problem format.
    	'''
        adata = np.nan_to_num(np.array(values))
        self.feat_min = adata.min(axis=0)
        self.feat_max = adata.max(axis=0)
        self.feat_min[0] = 0.0
        values = self.ScaleData(adata)
        self.svm_train_labels, self.svm_train_values = self.ConvertToSVMFormat(labels, values)

    def UpdateBins(self, classBins):
        self.classBins = classBins

        # Reinitialize the objects per class storage
        self.perClassObjects = {}
        for bin in self.classBins:
            self.perClassObjects[bin.label] = []

    def XValidate(self, nPermutations):
        # Make sure all data is available in the training set
        if not self.classifier.UpdateTrainingSet():
            return

        # Initialize process dialog
        def cb(frac):
            cont, skip = dlg.Update(int(frac * 100.), '%d%% Complete'%(frac * 100.))
            if not cont: # Cancel was pressed
                dlg.Destroy()
                raise StopCalculating()

        dlg = wx.ProgressDialog('Performing grid search for optimal parameters...', '0% Complete', 100,
                                self.classifier, wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME | 
                                wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)

        # Define cross validation parameters
        totalGroups = 5
        trainingGroups = 4

        # Convert the training set into SVM format and search for optimal parameters
        # C and gamma using 5-fold cross-validation
        logging.info('Performing grid search for parameters C and gamma on entire training set...')
        self.TranslateTrainingSet(self.classifier.trainingSet.label_matrix, 
                                  self.classifier.trainingSet.values)
        C, gamma = self.ParameterGridSearch(callback=cb)
        dlg.Destroy()
        logging.info('Grid search completed. Found optimal C=%d and gamma=%f.' % (C, gamma))

        # Create the classifier and initialize misclassification storage
        classifier = Pipeline([('anova', feature_selection.SelectPercentile(feature_selection.f_classif,
                                                                            percentile=self.percentile)),
                               ('svc', SVC(kernel='rbf', C=C, gamma=gamma, eps=0.1))])
        nObjects = self.classifier.trainingSet.label_matrix.shape[0]
        subsetSize = np.ceil(nObjects / float(totalGroups))
        indices = np.arange(nObjects)
        misclassifications = [[] for i in range(nObjects)]

        # Create group combinations and arrays of all labels and values
        dt = ','.join('i'*trainingGroups)
        trainingTotalGroups = list(np.fromiter(combinations(range(totalGroups),trainingGroups), dtype=dt, count=-1))
        #trainingTotalGroups = list(combinations(range(totalGroups), trainingGroups))
        allLabels = np.array(self.svm_train_labels)
        allValues = np.array(self.svm_train_values)

        # For all permutations of the subsets train the classifier on 4 totalGroups and
        # classify the remaining group for a number of random subsets
        logging.info('Calculating average classification accuracy %d times over a ' \
                     '%0.1f%%/%0.1f%% cross-validation process' % \
                     (nPermutations, trainingGroups/float(totalGroups)*100, \
                     (1-trainingGroups/float(totalGroups))*100))
        dlg = wx.ProgressDialog('Calculating average cross-validation accuracy...', '0% Complete', 100,
                                self.classifier, wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME | 
                                wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)
        nTrainingTotalGroups = len(trainingTotalGroups)
        nOperations = float(nPermutations * nTrainingTotalGroups)
        for per in range(nPermutations):
            # Split the training set into subsets
            np.random.shuffle(indices)
            lastGroupStart = (totalGroups-1)*subsetSize
            subsets = np.hsplit(indices[0:lastGroupStart], (totalGroups-1))
            subsets.append(indices[lastGroupStart:],)

            for index, group in enumerate(trainingTotalGroups):
                # Retrieve indices of all objects in the training set
                trainingSet = np.hstack([subsets[i] for i in range(totalGroups) if i in group])

                # Train a classifier on the subset
                classifier.fit(allValues[trainingSet], allLabels[trainingSet])

                # Predict the test set using the trained classifier
                testSet = np.hstack([subsets[i] for i in range(totalGroups) if i not in group])
                testLabels = classifier.predict(allValues[testSet])

                # Store all misclassifications
                [misclassifications[testSet[i]].append(testLabels[i]) \
                    for i in range(len(testLabels)) \
                    if testLabels[i] != allLabels[testSet][i]]

                # Update progress dialog
                cb((nTrainingTotalGroups * per + index) / nOperations)

        # Calculate average classification accuracy
        dlg.Destroy()
        logging.info('Average Classification Accuracy: %f%%' % \
                     ((1-len([item for sublist in misclassifications for item in sublist]) /\
                     float(nObjects * nPermutations))*100))

        return misclassifications
Example #18
#categories = None

print "Loading 20 newsgroups dataset for categories:"
print categories

data = load_20newsgroups(subset='train', categories=categories)
print "%d documents" % len(data.filenames)
print "%d categories" % len(data.target_names)
print

################################################################################
# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

parameters = {
    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    'vect__max_df': (0.5, 0.75, 1.0),
    #    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__analyzer__max_n': (1, 2),  # words or bigrams
    #    'tfidf__use_idf': (True, False),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #    'clf__n_iter': (10, 50, 80),
}
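# A hedged continuation sketch (not part of the original snippet): wire the
# pipeline and parameter grid into GridSearchCV and fit it on the raw
# documents. `data.data` (raw text) and `data.target` are assumed to be
# provided by this old load_20newsgroups API.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(data.data, data.target)
print "best score: %0.3f" % grid_search.best_score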
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

###############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(len(y), 2)  # cross-validation generator for model selection
ridge = BayesianRidge()
mem = Memory(cachedir='.', verbose=1)

# Ward agglomeration followed by BayesianRidge
A = grid_to_graph(n_x=size, n_y=size)
ward = WardAgglomeration(n_clusters=10,
                         connectivity=A,
                         memory=mem,
                         n_components=1)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
parameters = {'ward__n_clusters': [10, 20, 30]}
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, parameters, n_jobs=1)
clf.fit(X, y, cv=cv)  # set the best parameters
coef_ = clf.best_estimator.steps[-1][1].coef_
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression)  # caching function
anova = feature_selection.SelectPercentile(f_regression)
clf = Pipeline([('anova', anova), ('ridge', ridge)])
parameters = {'anova__percentile': [5, 10, 20]}
# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, parameters)
Example #20
y_train = dataset.target[:n_samples_total / 2]
y_test = dataset.target[n_samples_total / 2:]

# Build an analyzer that splits strings into sequences of 1 to 3 characters
# after applying the previous preprocessor
analyzer = CharNGramAnalyzer(
    min_n=1,
    max_n=3,
    preprocessor=LowerCasePreprocessor(),
)

# Build a vectorizer / classifier pipeline using the previous analyzer
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print metrics.classification_report(y_test,
                                    y_predicted,
                                    class_names=dataset.target_names)

# Plot the confusion matrix
y_train = dataset.target[:n_samples_total/2]
y_test = dataset.target[n_samples_total/2:]


# Build an analyzer that splits strings into sequences of 1 to 3 characters
# after applying the previous preprocessor
analyzer = CharNGramAnalyzer(
    min_n=1,
    max_n=3,
    preprocessor=LowerCasePreprocessor(),
)

# Build a vectorizer / classifier pipeline using the previous analyzer
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names)

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
Example #22
# split the dataset in training and test set:
n_samples_total = dataset.filenames.shape[0]

split = (n_samples_total * 3) / 4

docs_train = [open(f).read() for f in dataset.filenames[:split]]
docs_test = [open(f).read() for f in dataset.filenames[split:]]

y_train = dataset.target[:split]
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline using the previous analyzer
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100000)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

parameters = {
    'vect__analyzer__max_n': (1, 2),
    'vect__max_df': (.95, ),
}

# Fit the pipeline on the training set using grid search for the parameters
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train[:200], y_train[:200])

# Refit the best parameter set on the complete training set
clf = grid_search.best_estimator.fit(docs_train, y_train)
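# A hedged follow-up sketch (not part of the original snippet): evaluate the
# refit estimator on the held-out documents with the same metrics module used
# by the character n-gram examples above (assumed to be imported here as well).
y_predicted = clf.predict(docs_test)
print metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names)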
import numpy as np
import pylab as pl

from scikits.learn import linear_model, decomposition, datasets, cross_val

logistic = linear_model.LogisticRegression()

pca = decomposition.PCA()
from scikits.learn.pipeline import Pipeline
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

################################################################################
# Plot the PCA spectrum
pca.fit(X_digits)

pl.figure(1, figsize=(4, 3))
pl.clf()
pl.axes([.2, .2, .7, .7])
pl.plot(pca.explained_variance_, linewidth=2)
pl.axis('tight')
pl.xlabel('n_components')
pl.ylabel('explained_variance_')

################################################################################
# Prediction
scores = cross_val.cross_val_score(pipe, X_digits, y_digits, n_jobs=-1)
y = digits.target
# Throw away data, to be in the curse of dimension settings
y = y[:200]
X = digits.data[:200]
n_samples = len(y)
X = X.reshape((n_samples, -1))
# add 200 non-informative features
X = np.hstack((X, 2 * np.random.random((n_samples, 200))))

################################################################################
# Create a feature-selection transform and an instance of SVM that we
# combine to form a full-blown estimator

transform = feature_selection.SelectPercentile(feature_selection.f_classif)

clf = Pipeline([('anova', transform), ('svc', svm.SVC())])

################################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = list()
score_stds = list()
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

for percentile in percentiles:
    clf._set_params(anova__percentile=percentile)
    # Compute cross-validation score using all CPUs
    this_scores = cross_val.cross_val_score(clf, X, y, n_jobs=1)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

pl.errorbar(percentiles, score_means, np.array(score_stds))
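# Hedged finishing touches (not part of the original snippet): label the axes
# and display the figure with the same pylab module used above.
pl.xlabel('Percentile of features selected')
pl.ylabel('Cross-validation score')
pl.show()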
class SupportVectorMachines(object):
    '''
    Class to define a complete support vector machine classifier calculation problem. 
    '''
    def __init__(self, classifier=None):
        logging.info('Initialized New Support Vector Machines Classifier')
        self.model = None
        self.classBins = []
        self.classifier = classifier
        self.percentile = 90

        # Initialize the total object storage
        self.perClassObjects = {}
        self.feat_min, self.feat_max = None, None
        self.svm_train_labels, self.svm_train_values = None, None

    def CheckProgress(self):
        # Calculate cross-validation data
        nPermutations = 10
        try:
            misclassifications = self.XValidate(nPermutations)
        except StopCalculating:
            return

        def confusionMatrix():
            # Open confusion matrix
            confusionMatrix, axes = self.ConfusionMatrix(
                self.svm_train_labels, [
                    misclassifications[i] + [val] *
                    (nPermutations - len(misclassifications[i]))
                    for i, val in enumerate(self.svm_train_labels)
                ])
            self.classifier.ShowConfusionMatrix(confusionMatrix, axes)

        def dimensionReduction():
            # Initialize PCA/tSNE plot
            pca_main = dr.PlotMain(self.classifier,
                                   properties=Properties.getInstance(),
                                   loadData=False)
            pca_main.set_data(
                self.classifier.trainingSet.values,
                dict([(index, object) for index, object in enumerate(
                    self.classifier.trainingSet.get_object_keys())]),
                np.int64(self.classifier.trainingSet.label_matrix > 0),
                self.classifier.trainingSet.labels,
                np.array([
                    len(misclassifications[i]) / float(nPermutations)
                    for i in xrange(len(misclassifications))
                ]).round(2))
            pca_main.Show(True)

        # Ask how the user wants to visualize the cross-validation results (either through
        # a confusion matrix or visually in a dimensionality reduction plot)
        visualizationChoiceBox(self.classifier, -1,
                               'Pick cross-validation visualization',
                               confusionMatrix, dimensionReduction)

    def ClearModel(self):
        # Clear all parameters related to the trained classifier
        self.classBins = []
        self.model = None
        self.feat_min, self.feat_max = None, None
        self.svm_train_labels, self.svm_train_values = None, None

    def ComplexityTxt(self):
        return '# of cross-validations: '

    def ConfusionMatrix(self, actual=None, predicted=None):
        # Retrieve the number of classes, their labels and initialize
        # the confusion matrix
        nClasses = len(self.classBins)
        confusionMatrix = np.zeros((nClasses, nClasses), np.int64)
        classLabels = [bin.label for bin in self.classBins]

        # For each of the objects used to train the classifier, check what class
        # it was predicted to have been by the classifier
        if actual is None or predicted is None:
            for actualClassNum, actualClassObjects in \
                enumerate([bin.GetObjectKeys() for bin in self.classBins]):
                for predictedLabel in [(classLabels[i], i)
                                       for i in range(nClasses)]:
                    confusionMatrix[predictedLabel[1], actualClassNum] += \
                        len([obj for obj in actualClassObjects if \
                        obj in self.perClassObjects[predictedLabel[0]]])
        else:
            # Generate the confusion matrix for a list of actual and predicted classes
            for i, actualClass in enumerate(actual):
                # Count the number of correct classifications and store them in the
                # confusion matrix
                actualClass = np.int(actualClass)

                # Count all misclassifications
                for j in predicted[i]:
                    confusionMatrix[np.int(j), actualClass] += 1

        return confusionMatrix, classLabels

    def ConvertToSVMFormat(self, labels, values):
        '''
        Convert the training set data to SVM format
        Format: label feature_1:value feature_2:value feature_3:value ...
        '''
        labels = np.array([np.nonzero(target > 0)
                           for target in labels]).squeeze()
        return labels, values

    def CreatePerObjectClassTable(self, classes):
        '''
    	Saves object keys and classes to a SQL table
    	'''
        p = Properties.getInstance()
        if p.class_table is None:
            raise ValueError('"class_table" in properties file is not set.')

        index_cols = dbconnect.UniqueObjectClause()
        class_cols = dbconnect.UniqueObjectClause() + ', class, class_number'
        class_col_defs = dbconnect.object_key_defs(
        ) + ', class VARCHAR (%d)' % (
            max([len(c.label)
                 for c in self.classBins]) + 1) + ', class_number INT'

        # Drop must be explicitly asked for Classifier.ScoreAll
        db = dbconnect.DBConnect.getInstance()
        db.execute('DROP TABLE IF EXISTS %s' % (p.class_table))
        db.execute('CREATE TABLE %s (%s)' % (p.class_table, class_col_defs))
        db.execute('CREATE INDEX idx_%s ON %s (%s)' %
                   (p.class_table, p.class_table, index_cols))
        for clNum, clName in enumerate(self.perClassObjects.keys()):
            for obj in self.perClassObjects[clName]:
                query = ''.join([
                    'INSERT INTO ', p.class_table, ' (', class_cols,
                    ') VALUES (',
                    str(obj[0]), ', ',
                    str(obj[1]), ', "', clName, '", ',
                    str(clNum + 1), ')'
                ])
                db.execute(query)

        if p.db_type.lower() == 'mysql':
            query = ''.join([
                'ALTER TABLE ', p.class_table, ' ORDER BY ', p.image_id,
                ' ASC, ', p.object_id, ' ASC'
            ])
            db.execute(query)
            db.Commit()

    def FilterObjectsFromClassN(self, classN=None, keys=None):
        '''
    	Filter the input objects to output the keys of those in classN, 
    	using a defined SVM model classifier.
    	'''
        # Retrieve instance of the database connection
        db = dbconnect.DBConnect.getInstance()
        object_data = {}
        if isinstance(keys, str):
            object_data[0] = db.GetCellDataForClassifier(keys)
        elif keys != []:
            if len(keys) == len(dbconnect.image_key_columns()):
                # Retrieve instance of the data model and retrieve objects in the requested image
                dm = DataModel.getInstance()
                obKeys = dm.GetObjectsFromImage(keys[0])
            else:
                obKeys = keys
            for key in obKeys:
                object_data[key] = db.GetCellDataForClassifier(key)

        sorted_keys = sorted(object_data.keys())
        values_array = np.array([object_data[key] for key in sorted_keys])
        scaled_values = self.ScaleData(values_array)
        pred_labels = self.model.predict(scaled_values)

        # Group the object keys per class
        classObjects = {}
        for index in range(1, len(self.classBins) + 1):
            classObjects[float(index)] = []
        for index, label in enumerate(pred_labels):
            classObjects[np.int(label) + 1].append(sorted_keys[index])

        # Return either a summary of all classes and their corresponding objects
        # or just the objects for a specific class
        if classN is None:
            return classObjects
        else:
            return classObjects[classN]

    def IsTrained(self):
        return self.model is not None

    def LinearScale(self, value, low_lim, up_lim, feat_min, feat_max):
        return low_lim + (up_lim - low_lim) * (value - feat_min) / (feat_max -
                                                                    feat_min)

    def LoadModel(self, model_file_name):
        import cPickle
        fh = open(model_file_name, 'r')
        try:
            self.model, self.bin_labels, self.feat_min, self.feat_max = cPickle.load(
                fh)
        except:
            self.model = None
            self.bin_labels = None
            self.feat_min = None
            self.feat_max = None
            logging.error(
                'The loaded model was not a support vector machines model')
            raise TypeError
        finally:
            fh.close()

    def ParameterGridSearch(self, callback=None, nValidation=5):
        '''
        Grid search for the best C and gamma parameters for the RBF Kernel.
        The efficiency of the parameters is evaluated using nValidation-fold
        cross-validation of the training data.
    
        As this process is time consuming and parallelizable, a number of
        threads equal to the number of cores in the computer is used for the
        calculations
        '''
        from scikits.learn.grid_search import GridSearchCV
        from scikits.learn.metrics import precision_score
        from scikits.learn.cross_val import StratifiedKFold
        #
        # XXX: program crashes with >1 worker when running cpa.py
        #      No crash when running from classifier.py. Why?
        #
        n_workers = 1
        #try:
        #from multiprocessing import cpu_count
        #n_workers = cpu_count()
        #except:
        #n_workers = 1

        # Define the parameter ranges for C and gamma and perform a grid search for the optimal setting
        parameters = {
            'C': 2**np.arange(-5, 11, 2, dtype=float),
            'gamma': 2**np.arange(3, -11, -2, dtype=float)
        }
        clf = GridSearchCV(SVC(kernel='rbf'),
                           parameters,
                           n_jobs=n_workers,
                           score_func=precision_score)
        clf.fit(self.svm_train_values,
                self.svm_train_labels,
                cv=StratifiedKFold(self.svm_train_labels, nValidation))

        # Pick the best parameters as the ones with the maximum cross-validation rate
        bestParameters = max(clf.grid_scores_, key=lambda a: a[1])
        bestC = bestParameters[0]['C']
        bestGamma = bestParameters[0]['gamma']
        logging.info('Optimal values: C=%s g=%s rate=%s' %
                     (bestC, bestGamma, bestParameters[1]))
        return bestC, bestGamma

    def PerImageCounts(self, filter_name=None, cb=None):
        # Clear the current perClassObjects storage
        for bin in self.classBins:
            self.perClassObjects[bin.label] = []

        # Retrieve a data model instance
        dm = DataModel.getInstance()

        # Retrieve image keys and initialize variables
        imageKeys = dm.GetAllImageKeys(filter_name)
        imageAmount = float(len(imageKeys))
        perImageData = []

        # Process all images
        for k_index, imKey in enumerate(imageKeys):
            try:
                # Retrieve the keys of the objects in the current image
                obKeys = dm.GetObjectsFromImage(imKey)
            except:
                raise ValueError('No such image: %s' % (imKey, ))

            # Calculate the amount of hits for each of the classes in the current image
            classHits = {}
            objectCount = [imKey[0]]
            if obKeys:
                classObjects = self.FilterObjectsFromClassN(keys=[imKey])
                for clNum, bin in enumerate(self.classBins):
                    # Get the objects from the image which belong to the selected class
                    classHits[bin.label] = classObjects[float(clNum + 1)]

                    # Store the total object count of this class for the current image
                    nrHits = len(classHits[bin.label])
                    objectCount.append(nrHits)

                    # Store the objects for the current class and image grouped
                    # by class if any are found for this class in the selected image
                    if nrHits > 0:
                        self.perClassObjects[bin.label] += classHits[bin.label]
            else:
                # If there are no objects in the image, add zeros for all bins
                for bin in self.classBins:
                    objectCount.append(0)

            # Store the results for the current image and update the callback
            # function if available
            perImageData.append(objectCount)
            if cb:
                cb(min(1, k_index / imageAmount))

        return perImageData

    def SaveModel(self, model_file_name, bin_labels):
        import cPickle
        fh = open(model_file_name, 'wb')  # pickled data should be written in binary mode
        cPickle.dump((self.model, bin_labels, self.feat_min, self.feat_max),
                     fh)
        fh.close()

    def ScaleData(self, values, low_lim=0.0, up_lim=1.0):
        '''
        Linearly scale the data to improve the efficiency of the classifier.
        '''
        row, col = np.shape(values)
        scaled_data = np.zeros((row, col))
        for j in xrange(col):
            scaled_data[:, j] = self.LinearScale(values[:, j], low_lim, up_lim,
                                                 self.feat_min[j],
                                                 self.feat_max[j])
        return scaled_data

    def ShowModel(self):
        if self.model is not None:
            return 'Trained the following support vector machines classifier:\n%s' % self.model.named_steps[
                'svc']
        else:
            return ''

    def Train(self,
              colNames,
              nValidation,
              labels,
              values,
              fout=None,
              callback=None):
        '''
        Train an SVM model on a training set, using C and gamma parameters
        optimized by grid search.
        '''
        # First make sure the supplied problem is in SVM format
        self.TranslateTrainingSet(labels, values)

        # Perform a grid-search to obtain the C and gamma parameters for C-SVM
        # classification
        if nValidation > 1:
            C, gamma = self.ParameterGridSearch(callback, nValidation)
        else:
            C, gamma = self.ParameterGridSearch(callback)

        # Train the model using the obtained C and gamma parameters to obtain the final classifier
        self.model = Pipeline([
            ('anova',
             feature_selection.SelectPercentile(feature_selection.f_classif,
                                                percentile=self.percentile)),
            ('svc', SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1))
        ])
        self.model.fit(self.svm_train_values, self.svm_train_labels)

    def TranslateTrainingSet(self, labels, values):
        '''
        Translate and scale CPAnalyst Classifier training set labels and
        values to the SVM problem format.
        '''
        adata = np.nan_to_num(np.array(values))
        self.feat_min = adata.min(axis=0)
        self.feat_max = adata.max(axis=0)
        self.feat_min[0] = 0.0
        values = self.ScaleData(adata)
        self.svm_train_labels, self.svm_train_values = self.ConvertToSVMFormat(
            labels, values)

    def UpdateBins(self, classBins):
        self.classBins = classBins

        # Reinitialize the objects per class storage
        self.perClassObjects = {}
        for bin in self.classBins:
            self.perClassObjects[bin.label] = []

    def XValidate(self, nPermutations):
        # Make sure all data is available in the training set
        if not self.classifier.UpdateTrainingSet():
            return

        # Initialize process dialog
        def cb(frac):
            cont, skip = dlg.Update(int(frac * 100.),
                                    '%d%% Complete' % (frac * 100.))
            if not cont:  # Cancel was pressed
                dlg.Destroy()
                raise StopCalculating()

        dlg = wx.ProgressDialog(
            'Performing grid search for optimal parameters...', '0% Complete',
            100, self.classifier, wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME
            | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)

        # Define cross validation parameters
        totalGroups = 5
        trainingGroups = 4

        # Convert the training set into SVM format and search for optimal parameters
        # C and gamma using 5-fold cross-validation
        logging.info(
            'Performing grid search for parameters C and gamma on entire training set...'
        )
        self.TranslateTrainingSet(self.classifier.trainingSet.label_matrix,
                                  self.classifier.trainingSet.values)
        C, gamma = self.ParameterGridSearch(callback=cb)
        dlg.Destroy()
        logging.info(
            'Grid search completed. Found optimal C=%g and gamma=%g.' %
            (C, gamma))

        # Create the classifier and initialize misclassification storage
        classifier = Pipeline([
            ('anova',
             feature_selection.SelectPercentile(feature_selection.f_classif,
                                                percentile=self.percentile)),
            ('svc', SVC(kernel='rbf', C=C, gamma=gamma, tol=0.1))
        ])
        nObjects = self.classifier.trainingSet.label_matrix.shape[0]
        subsetSize = int(np.ceil(nObjects / float(totalGroups)))
        indices = np.arange(nObjects)
        misclassifications = [[] for i in range(nObjects)]

        # Create group combinations and arrays of all labels and values
        dt = ','.join('i' * trainingGroups)
        trainingTotalGroups = list(
            np.fromiter(combinations(range(totalGroups), trainingGroups),
                        dtype=dt,
                        count=-1))
        #trainingTotalGroups = list(combinations(range(totalGroups), trainingGroups))
        allLabels = np.array(self.svm_train_labels)
        allValues = np.array(self.svm_train_values)

        # For all permutations of the subsets train the classifier on 4 totalGroups and
        # classify the remaining group for a number of random subsets
        logging.info('Calculating average classification accuracy %d times over a ' \
                     '%0.1f%%/%0.1f%% cross-validation process' % \
                     (nPermutations, trainingGroups/float(totalGroups)*100, \
                     (1-trainingGroups/float(totalGroups))*100))
        dlg = wx.ProgressDialog(
            'Calculating average cross-validation accuracy...', '0% Complete',
            100, self.classifier, wx.PD_ELAPSED_TIME | wx.PD_ESTIMATED_TIME
            | wx.PD_REMAINING_TIME | wx.PD_CAN_ABORT)
        nTrainingTotalGroups = len(trainingTotalGroups)
        nOperations = float(nPermutations * nTrainingTotalGroups)
        for per in range(nPermutations):
            # Split the training set into subsets
            np.random.shuffle(indices)
            lastGroupStart = (totalGroups - 1) * subsetSize
            subsets = np.hsplit(indices[0:lastGroupStart], (totalGroups - 1))
            subsets.append(indices[lastGroupStart:])

            for index, group in enumerate(trainingTotalGroups):
                # Retrieve indices of all objects in the training set
                trainingSet = np.hstack(
                    [subsets[i] for i in range(totalGroups) if i in group])

                # Train a classifier on the subset
                classifier.fit(allValues[trainingSet], allLabels[trainingSet])

                # Predict the test set using the trained classifier
                testSet = np.hstack(
                    [subsets[i] for i in range(totalGroups) if i not in group])
                testLabels = classifier.predict(allValues[testSet])

                # Store all misclassifications
                for i in range(len(testLabels)):
                    if testLabels[i] != allLabels[testSet][i]:
                        misclassifications[testSet[i]].append(testLabels[i])

                # Update progress dialog
                cb((nTrainingTotalGroups * per + index) / nOperations)

        # Calculate average classification accuracy
        dlg.Destroy()
        logging.info('Average Classification Accuracy: %f%%' % \
                     ((1-len([item for sublist in misclassifications for item in sublist]) /\
                     float(nObjects * nPermutations))*100))

        return misclassifications
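
# A minimal, self-contained sketch of the grid-search-then-train pattern
# implemented by ParameterGridSearch and Train above, written against the
# modern sklearn package layout (an assumption; the class above targets the
# older scikits.learn API). Dataset and parameter ranges are illustrative only.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=20, n_informative=5)

# Search C and gamma on a log2 grid, scored by stratified cross-validation
param_grid = {
    'C': 2.0 ** np.arange(-5, 11, 2),
    'gamma': 2.0 ** np.arange(3, -11, -2),
}
search = GridSearchCV(SVC(kernel='rbf'), param_grid,
                      cv=StratifiedKFold(n_splits=5))
search.fit(X, y)
best_C = search.best_params_['C']
best_gamma = search.best_params_['gamma']

# Retrain the final classifier with ANOVA feature selection in front,
# mirroring the Pipeline built in Train above
model = Pipeline([
    ('anova', SelectPercentile(f_classif, percentile=90)),
    ('svc', SVC(kernel='rbf', C=best_C, gamma=best_gamma)),
])
model.fit(X, y)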
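
# A tiny self-contained sketch of the per-column rescaling that ScaleData
# performs above, assuming LinearScale (defined elsewhere in this file) maps
# the range [feat_min, feat_max] linearly onto [low_lim, up_lim]; the exact
# body of LinearScale is not shown here, so this is an illustration only.
import numpy as np

def linear_scale(col, low_lim, up_lim, feat_min, feat_max):
    span = feat_max - feat_min
    if span == 0:
        # A constant column carries no information; pin it to the lower limit
        return np.full_like(col, low_lim, dtype=float)
    return low_lim + (col - feat_min) * (up_lim - low_lim) / span

# Example: scale each column of a small matrix into [0, 1]
data = np.array([[1.0, 10.0], [3.0, 30.0], [5.0, 50.0]])
feat_min, feat_max = data.min(axis=0), data.max(axis=0)
scaled = np.column_stack([
    linear_scale(data[:, j], 0.0, 1.0, feat_min[j], feat_max[j])
    for j in range(data.shape[1])
])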
y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise # add noise

###############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(len(y), 2) # cross-validation generator for model selection
ridge = BayesianRidge()
mem = Memory(cachedir='.', verbose=1)

# Ward agglomeration followed by BayesianRidge
A = grid_to_graph(n_x=size, n_y=size)
ward = WardAgglomeration(n_clusters=10, connectivity=A, memory=mem,
                         n_components=1)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1)
clf.fit(X, y, cv=cv) # set the best parameters
coef_ = clf.best_estimator.steps[-1][1].coef_
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression) # caching function
anova = feature_selection.SelectPercentile(f_regression)
clf = Pipeline([('anova', anova), ('ridge', ridge)])
# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]})
clf.fit(X, y, cv=cv) # set the best parameters
coef_ = clf.best_estimator.steps[-1][1].coef_
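
# A small self-contained sketch of the last step above: after fitting an
# anova + ridge pipeline, the reduced-space coefficients can be mapped back
# onto the original features with SelectPercentile.inverse_transform. Written
# against the modern sklearn package layout (an assumption; the snippet above
# uses the older scikits.learn names and attributes).
import numpy as np
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.linear_model import BayesianRidge
from sklearn.pipeline import Pipeline

rng = np.random.RandomState(0)
X = rng.randn(50, 100)
y = X[:, :5].sum(axis=1) + 0.1 * rng.randn(50)

pipe = Pipeline([('anova', SelectPercentile(f_regression, percentile=10)),
                 ('ridge', BayesianRidge())])
pipe.fit(X, y)

# Coefficients live in the reduced space (10% of the features)...
coef_reduced = pipe.named_steps['ridge'].coef_
# ...and inverse_transform scatters them back to the full 100-feature space,
# with zeros for the features the anova filter discarded.
coef_full = pipe.named_steps['anova'].inverse_transform(
    coef_reduced.reshape(1, -1)).ravel()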
"""
==================
Pipeline Anova SVM
==================

Simple usage of Pipeline: univariate (ANOVA) feature selection followed by
a C-SVM trained on the selected features.
"""
print __doc__

from scikits.learn import svm
from scikits.learn.datasets import samples_generator
from scikits.learn.feature_selection import SelectKBest, f_regression
from scikits.learn.pipeline import Pipeline

# import some data to play with
X, y = samples_generator.make_classification(
    n_features=20, n_informative=3, n_redundant=0,
    n_classes=4, n_clusters_per_class=2)

# ANOVA SVM-C
# 1) anova filter, take 3 best ranked features
anova_filter = SelectKBest(f_regression, k=3)
# 2) svm
clf = svm.SVC(kernel='linear')

anova_svm = Pipeline([('anova', anova_filter), ('svm', clf)])
anova_svm.fit(X, y)
anova_svm.predict(X)
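
# A short follow-up sketch (not part of the original example, and assuming
# the modern sklearn layout rather than the scikits.learn imports above):
# after fitting, the anova step can report exactly which input features
# survived the k=3 filter, and the pipeline can be scored as a whole.
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

X, y = make_classification(n_features=20, n_informative=3, n_redundant=0,
                           n_classes=4, n_clusters_per_class=2)
anova_svm = Pipeline([('anova', SelectKBest(f_classif, k=3)),
                      ('svm', SVC(kernel='linear'))])
anova_svm.fit(X, y)

selected = anova_svm.named_steps['anova'].get_support(indices=True)
print('kept feature indices: %s' % (selected,))
print('training accuracy: %.3f' % anova_svm.score(X, y))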

Example #28
"""
==================
Pipeline Anova SVM
==================

Simple usage of Pipeline: univariate (ANOVA) feature selection followed by
a C-SVM trained on the selected features.
"""
print __doc__

from scikits.learn import svm
from scikits.learn.datasets import samples_generator
from scikits.learn.feature_selection import SelectKBest, f_regression
from scikits.learn.pipeline import Pipeline

# import some data to play with
X, y = samples_generator.test_dataset_classif(k=5)

# ANOVA SVM-C
# 1) anova filter, take 5 best ranked features
anova_filter = SelectKBest(f_regression, k=5)
# 2) svm
clf = svm.SVC(kernel='linear')

anova_svm = Pipeline([('anova', anova_filter), ('svm', clf)])
anova_svm.fit(X, y)
anova_svm.predict(X)

Example #29
y = digits.target
# Throw away data, to be in the curse of dimension settings
y = y[:200]
X = digits.data[:200]
n_samples = len(y)
X = X.reshape((n_samples, -1))
# add 200 non-informative features
X = np.hstack((X, 2*np.random.random((n_samples, 200))))

################################################################################
# Create a feature-selection transform and an instance of SVM that we
# combine to form a full-blown estimator

transform = feature_selection.SelectPercentile(feature_selection.f_classif)

clf = Pipeline([('anova', transform), ('svc', svm.SVC())])

################################################################################
# Plot the cross-validation score as a function of percentile of features
score_means = list()
score_stds  = list()
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

for percentile in percentiles:
    clf._set_params(anova__percentile=percentile)
    # Compute cross-validation score using all CPUs
    this_scores = cross_val.cross_val_score(clf, X, y, n_jobs=1)
    score_means.append(this_scores.mean())
    score_stds.append(this_scores.std())

pl.errorbar(percentiles, score_means, np.array(score_stds))
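
# Follow-up: pick the best-scoring percentile from the results gathered above
# and fix it on the pipeline before a final fit. This is a small illustrative
# continuation of the snippet above, not part of the original example; it
# reuses the snippet's own clf, X, y and _set_params call.
best_percentile = percentiles[np.argmax(score_means)]
print 'Best percentile: %d (mean score %.3f)' % (best_percentile,
                                                 max(score_means))
clf._set_params(anova__percentile=best_percentile)
clf.fit(X, y)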
Example #30
import numpy as np
import matplotlib.pyplot as pl
from time import time  # needed for the timing calls below
from scikits.learn.decomposition import RandomizedPCA
from scikits.learn.svm import LinearSVC
from scikits.learn.pipeline import Pipeline
from scikits.learn.grid_search import GridSearchCV
from scikits.learn.metrics import classification_report
from preprocess import InfinitivesExtractor, load_data

# Data attributes
targets = [0, 1, 2]
target_names = ["covered", "no alternance", "uncovered"]
target_colors = "rgb"

# Classification settings
pipeline = Pipeline([('extr', InfinitivesExtractor()),
                     ('svc', LinearSVC(multi_class=True))])
parameters = {
    'extr__count': (True, False),
    'extr__n': (3, 4, 5, 6),
    'svc__C': (1e-1, 1e-2, 1e9)
}
grid_search = GridSearchCV(pipeline, parameters)

print "Loading data..."
X, y = load_data()
print "Searching for the best model..."
t0 = time()
grid_search.fit(X, y)
print "Done in %0.3f" % (time() - t0)
print "Best score: %0.3f" % grid_search.best_score
clf = grid_search.best_estimator
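
# Follow-up sketch (an illustrative continuation, not part of the original
# snippet): summarize the winning model with the classification_report that
# was imported above. This assumes the grid search has refit its best
# estimator on the full data; evaluating on the training data here only
# because no held-out split appears in this excerpt.
y_pred = clf.predict(X)
print "Classification report on the training data:"
print classification_report(y, y_pred, target_names=target_names)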