Example #1
0
    def selectFeatures(self, _data, *options, **args) :
        """greedy backward feature elimination

        at each step, tentatively remove each remaining feature, score the
        classifier by stratified CV on the reduced data, and permanently
        eliminate the feature whose removal gives the best score; repeats
        until self.targetNumFeatures features remain.

        INPUT:
        _data - a dataset instance; it is not modified (deep copies are used)

        Returns the indices (into _data) of the features that were kept.
        Side effects: sets self.eliminated (names of eliminated features)
        and self.measures (best CV score at each elimination step).
        """

        self.eliminated = []
        self.measures = []
        cvArgs = {}
        import re
        # a measure such as 'roc50area' is mapped to the 'rocNarea' result
        # attribute, with the N passed to stratifiedCV as the rocN keyword
        rocExp = re.compile(r"roc(?P<rocN>[0-9]+)area")
        match = rocExp.match(self.measure)
        if match is not None :
            measureStr = 'rocNarea'
            cvArgs['rocN'] = match.groupdict()['rocN']
        else :
            measureStr = self.measure

        data = _data.__class__(_data, deepcopy = True)
        for i in range(self.targetNumFeatures, _data.numFeatures) :
            # start below any attainable score so bestFeatureName is always
            # assigned this iteration; the original started at 0, leaving
            # bestFeatureName unbound/stale when every candidate scored 0
            maxScore = -1
            bestFeatureName = None
            # loop over the CURRENT features
            for feature in range(data.numFeatures) :
                featureName = data.featureID[feature]
                data.eliminateFeatures([feature])
                res = self.classifier.stratifiedCV(data, **cvArgs)
                score = getattr(res, measureStr)
                if score > maxScore :
                    maxScore = score
                    bestFeatureName = featureName
                # restore the working copy (minus features already eliminated)
                data = _data.__class__(_data, deepcopy = True)
                data.eliminateFeatures(data.featureNames2IDs(self.eliminated))
            data = _data.__class__(_data, deepcopy = True)
            self.eliminated.append(bestFeatureName)
            data.eliminateFeatures(data.featureNames2IDs(self.eliminated))
            self.measures.append(maxScore)

        return misc.setminus(range(_data.numFeatures),
                             _data.featureNames2IDs(self.eliminated))
Example #2
0
    def selectFeatures(self, _data, *options, **args):
        """greedy backward feature elimination

        at each step, tentatively remove each remaining feature, score the
        classifier by stratified CV on the reduced data, and permanently
        eliminate the feature whose removal gives the best score; repeats
        until self.targetNumFeatures features remain.

        INPUT:
        _data - a dataset instance; it is not modified (deep copies are used)

        Returns the indices (into _data) of the features that were kept.
        Side effects: sets self.eliminated (names of eliminated features)
        and self.measures (best CV score at each elimination step).
        """

        self.eliminated = []
        self.measures = []
        cvArgs = {}
        import re
        # a measure such as 'roc50area' is mapped to the 'rocNarea' result
        # attribute, with the N passed to stratifiedCV as the rocN keyword
        rocExp = re.compile(r"roc(?P<rocN>[0-9]+)area")
        match = rocExp.match(self.measure)
        if match is not None:
            measureStr = 'rocNarea'
            cvArgs['rocN'] = match.groupdict()['rocN']
        else:
            measureStr = self.measure

        data = _data.__class__(_data, deepcopy=True)
        for i in range(self.targetNumFeatures, _data.numFeatures):
            # start below any attainable score so bestFeatureName is always
            # assigned this iteration; the original started at 0, leaving
            # bestFeatureName unbound/stale when every candidate scored 0
            maxScore = -1
            bestFeatureName = None
            # loop over the CURRENT features
            for feature in range(data.numFeatures):
                featureName = data.featureID[feature]
                data.eliminateFeatures([feature])
                res = self.classifier.stratifiedCV(data, **cvArgs)
                score = getattr(res, measureStr)
                if score > maxScore:
                    maxScore = score
                    bestFeatureName = featureName
                # restore the working copy (minus features already eliminated)
                data = _data.__class__(_data, deepcopy=True)
                data.eliminateFeatures(data.featureNames2IDs(self.eliminated))
            data = _data.__class__(_data, deepcopy=True)
            self.eliminated.append(bestFeatureName)
            data.eliminateFeatures(data.featureNames2IDs(self.eliminated))
            self.measures.append(maxScore)

        return misc.setminus(range(_data.numFeatures),
                             _data.featureNames2IDs(self.eliminated))
Example #3
0
def makeFolds(data, numFolds, datasetName, directory = '.') :

    '''split a dataset into several folds and save the training and testing
    data of each fold as a separate dataset

    data - a dataset instance
    numFolds - number of folds into which to split the data
    datasetName - string to use for the file names
    directory - the directory into which to deposit the files
    '''

    # list() so the permutation is shuffleable under python 3 as well
    # (identical to range() in python 2)
    perm = list(range(len(data)))
    random.shuffle(perm)
    # floor division: same as / on ints in python 2, but avoids a float
    # fold size (which would break slicing) under python 3
    foldSize = len(data) // numFolds

    for fold in range(numFolds) :
        if fold < numFolds - 1 :
            testingPatterns = perm[foldSize * fold : foldSize * (fold + 1)]
        else :
            # the last fold absorbs any remainder
            testingPatterns = perm[foldSize * fold : len(data)]
        trainingPatterns = misc.setminus(range(len(data)), testingPatterns)

        trainingData = data.__class__(data, patterns = trainingPatterns)
        testingData = data.__class__(data, patterns = testingPatterns)

        testingDataName = os.path.join(directory, datasetName + 'Testing' + str(fold) + '.data')
        testingData.save(testingDataName)
        trainingDataName = os.path.join(directory, datasetName + 'Training' + str(fold) + '.data')
        trainingData.save(trainingDataName)
Example #4
0
def makeFolds(data, numFolds, datasetName, directory='.'):
    '''split a dataset into several folds and save the training and testing
    data of each fold as a separate dataset

    data - a dataset instance
    numFolds - number of folds into which to split the data
    datasetName - string to use for the file names
    directory - the directory into which to deposit the files
    '''

    # list() so the permutation is shuffleable under python 3 as well
    # (identical to range() in python 2)
    perm = list(range(len(data)))
    random.shuffle(perm)
    # floor division: same as / on ints in python 2, but avoids a float
    # fold size (which would break slicing) under python 3
    foldSize = len(data) // numFolds

    for fold in range(numFolds):
        if fold < numFolds - 1:
            testingPatterns = perm[foldSize * fold:foldSize * (fold + 1)]
        else:
            # the last fold absorbs any remainder
            testingPatterns = perm[foldSize * fold:len(data)]
        trainingPatterns = misc.setminus(range(len(data)), testingPatterns)

        trainingData = data.__class__(data, patterns=trainingPatterns)
        testingData = data.__class__(data, patterns=testingPatterns)

        testingDataName = os.path.join(
            directory, datasetName + 'Testing' + str(fold) + '.data')
        testingData.save(testingDataName)
        trainingDataName = os.path.join(
            directory, datasetName + 'Training' + str(fold) + '.data')
        trainingData.save(trainingDataName)
    def keepFeatures(self, features) :
        """eliminate all but the given list of features, i.e. keep only
        the listed features
        INPUT:
        features - a list of features to KEEP; these are either numbers
        between 0 and numFeatures-1 (indices of features, not their IDs) or
        featureIDs
        """

        # guard the empty list: features[0] would raise an IndexError
        if len(features) > 0 and type(features[0]) == type('') :
            features = self.featureNames2IDs(features)
        self.eliminateFeatures(misc.setminus(range(self.numFeatures), features))
    def keepFeatures(self, features):
        """eliminate all but the given list of features, i.e. keep only
        the listed features
        INPUT:
        features - a list of features to KEEP; these are either numbers
        between 0 and numFeatures-1 (indices of features, not their IDs) or
        featureIDs
        """

        # guard the empty list: features[0] would raise an IndexError
        if len(features) > 0 and type(features[0]) == type(''):
            features = self.featureNames2IDs(features)
        self.eliminateFeatures(misc.setminus(range(self.numFeatures),
                                             features))
Example #7
0
    def selectFeatures(self, data, targetClass=None, otherClass = None, *options, **args) :
        """score the features of a dataset and return the indices of the
        features that survive the filter

        the elimination criterion depends on self.mode:
        'byNum' - keep the self.numFeatures highest scoring features
        'byThreshold' - eliminate features scoring below self.threshold
        'bySignificance' - eliminate features scoring below the per-feature
        thresholds computed by self.significanceThreshold (also stored in
        self.thresholds)

        Raises ValueError for an unknown mode.
        """

        s = self.featureScore.score(data, targetClass, otherClass, **args)

        if self.mode == "byNum" :
            featuresToEliminate = numpy.argsort(s)\
                                  [:data.numFeatures - self.numFeatures]
        elif self.mode == "byThreshold" :
            featuresToEliminate = numpy.nonzero(numpy.less(s, self.threshold))[0]
        elif self.mode == "bySignificance" :
            t = self.significanceThreshold(data)
            self.thresholds = t
            featuresToEliminate = numpy.nonzero(numpy.less(s, t))[0]
        else :
            # call-form raise: identical in python 2, and valid in python 3
            # (the old 'raise E, msg' statement is a python 3 syntax error)
            raise ValueError('unknown elimination mode in filter')

        return misc.setminus(range(data.numFeatures), featuresToEliminate)
Example #8
0
    def eliminateFeatures(self, featureList):
        """eliminate a list of features from a dataset
        Input:
        featureList - a list of features to eliminate; these are numbers
        between 0 and numFeatures-1 (indices of features, not their IDs)"""

        # keep the complement of the eliminated features, in index order
        featuresToTake = misc.setminus(range(self.numFeatures), featureList)
        featuresToTake.sort()
        self.featureID = [self.featureID[i] for i in featuresToTake]

        # drop the corresponding columns of the data matrix
        self.X = numpy.take(self.X, featuresToTake, 1)
Example #9
0
    def eliminateFeatures(self, featureList) :
        """eliminate a list of features from a dataset
        Input:
        featureList - a list of features to eliminate; these are either
        numbers between 0 and numFeatures-1 (indices of features, not
        their IDs) or feature names"""

        if len(featureList) == 0 : return
        if type(featureList[0]) == type('') :
            # BUG FIX: the original referenced the undefined name
            # 'features' here, raising a NameError whenever a list of
            # feature names was passed
            featureList = self.featureNames2IDs(featureList)
        # keep the complement of the eliminated features, in index order
        featuresToTake = misc.setminus(range(self.numFeatures), featureList)
        featuresToTake.sort()
        self.featureID = [self.featureID[i] for i in featuresToTake]
        self.featureKey = [self.featureKey[i] for i in featuresToTake]
        # rebuild the key -> position lookup for the surviving features
        self.featureKeyDict = {}
        for i in range(len(self.featureKey)) :
            self.featureKeyDict[self.featureKey[i]] = i

        # drop the corresponding columns of the data matrix
        self.X = numpy.take(self.X, featuresToTake, 1)
Example #10
0
def cv(classifier, data, numFolds=5, **args):
    """perform k-fold cross validation

    :Parameters:
      - `classifier` - a classifier template
      - `data` - a dataset
      - `numFolds` - number of cross validation folds (default = 5)

    :Returns:
      a Results object.

    :Keywords:
      - `numFolds` - number of cross validation folds (default = 5)
      - `seed` - random number generator seed
      - `foldsToPerform` - number of folds to actually perform (in case you're doing
        n fold CV, and want to save time, and only do some of the folds)
    """

    if 'numFolds' in args:
        numFolds = args['numFolds']
    if 'seed' in args:
        random.seed(args['seed'])
    foldsToPerform = numFolds
    if 'foldsToPerform' in args:
        foldsToPerform = args['foldsToPerform']
    if foldsToPerform > numFolds:
        # call-form raise: identical in python 2, and valid in python 3
        # (the old 'raise E, msg' statement is a python 3 syntax error)
        raise ValueError('foldsToPerform > numFolds')

    # list() so the permutation is shuffleable under python 3 as well
    perm = list(range(len(data)))
    random.shuffle(perm)
    # floor division: same as / on ints in python 2, but avoids a float
    # fold size (which would break slicing) under python 3
    foldSize = len(data) // numFolds
    trainingPatterns = []
    testingPatterns = []

    for fold in range(foldsToPerform):
        if fold < numFolds - 1:
            testingPatterns.append(perm[foldSize * fold:foldSize * (fold + 1)])
        else:
            # the last fold absorbs any remainder
            testingPatterns.append(perm[foldSize * fold:len(data)])
        trainingPatterns.append(
            misc.setminus(range(len(data)), testingPatterns[-1]))

    return cvFromFolds(classifier, data, trainingPatterns, testingPatterns,
                       **args)
Example #11
0
def cv(classifier, data, numFolds = 5, **args) :
    """perform k-fold cross validation

    :Parameters:
      - `classifier` - a classifier template
      - `data` - a dataset
      - `numFolds` - number of cross validation folds (default = 5)

    :Returns:
      a Results object.

    :Keywords:
      - `numFolds` - number of cross validation folds (default = 5)
      - `seed` - random number generator seed
      - `foldsToPerform` - number of folds to actually perform (in case you're doing
        n fold CV, and want to save time, and only do some of the folds)
    """

    if 'numFolds' in args :
        numFolds = args['numFolds']
    if 'seed' in args :
        random.seed(args['seed'])
    foldsToPerform = numFolds
    if 'foldsToPerform' in args :
        foldsToPerform = args['foldsToPerform']
    if foldsToPerform > numFolds :
        # call-form raise: identical in python 2, and valid in python 3
        # (the old 'raise E, msg' statement is a python 3 syntax error)
        raise ValueError('foldsToPerform > numFolds')

    # list() so the permutation is shuffleable under python 3 as well
    perm = list(range(len(data)))
    random.shuffle(perm)
    # floor division: same as / on ints in python 2, but avoids a float
    # fold size (which would break slicing) under python 3
    foldSize = len(data) // numFolds
    trainingPatterns = []
    testingPatterns = []

    for fold in range(foldsToPerform) :
        if fold < numFolds - 1 :
            testingPatterns.append(perm[foldSize * fold : foldSize * (fold + 1)])
        else :
            # the last fold absorbs any remainder
            testingPatterns.append(perm[foldSize * fold : len(data)])
        trainingPatterns.append(misc.setminus(range(len(data)),
                                              testingPatterns[-1]))

    return cvFromFolds(classifier, data, trainingPatterns, testingPatterns, **args)
Example #12
0
def loo(classifier, data, **args) :
    """perform Leave One Out cross validation: each example in turn is
    held out for testing while the classifier trains on the rest

    :Returns:
      a results object

    USAGE: loo(classifier, data)
    """

    results = classifier.resultsObject()
    # statistics are computed once at the end rather than per fold
    args['stats'] = False

    numExamples = len(data)
    for heldOut in range(numExamples) :
        trainingPatterns = misc.setminus(range(numExamples), [heldOut])
        foldResult = classifier.trainTest(data, trainingPatterns,
                                          [heldOut], **args)
        results.extend(foldResult)

    results.computeStats()

    return results
Example #13
0
def loo(classifier, data, **args):
    """perform Leave One Out cross validation: each example in turn is
    held out for testing while the classifier trains on the rest

    :Returns:
      a results object

    USAGE: loo(classifier, data)
    """

    results = classifier.resultsObject()
    # defer statistics to a single computeStats call at the end
    args['stats'] = False

    allPatterns = range(len(data))
    for heldOut in allPatterns:
        trainingPatterns = misc.setminus(allPatterns, [heldOut])
        results.extend(
            classifier.trainTest(data, trainingPatterns, [heldOut], **args))

    results.computeStats()

    return results
Example #14
0
    def selectFeatures(self,
                       data,
                       targetClass=None,
                       otherClass=None,
                       *options,
                       **args):
        """score the features of a dataset and return the indices of the
        features that survive the filter

        the elimination criterion depends on self.mode:
        'byNum' - keep the self.numFeatures highest scoring features
        'byThreshold' - eliminate features scoring below self.threshold
        'bySignificance' - eliminate features scoring below the per-feature
        thresholds computed by self.significanceThreshold (also stored in
        self.thresholds)

        Raises ValueError for an unknown mode.
        """

        s = self.featureScore.score(data, targetClass, otherClass, **args)

        if self.mode == "byNum":
            featuresToEliminate = numpy.argsort(s)\
                                  [:data.numFeatures - self.numFeatures]
        elif self.mode == "byThreshold":
            featuresToEliminate = numpy.nonzero(numpy.less(s,
                                                           self.threshold))[0]
        elif self.mode == "bySignificance":
            t = self.significanceThreshold(data)
            self.thresholds = t
            featuresToEliminate = numpy.nonzero(numpy.less(s, t))[0]
        else:
            # call-form raise: identical in python 2, and valid in python 3
            # (the old 'raise E, msg' statement is a python 3 syntax error)
            raise ValueError('unknown elimination mode in filter')

        return misc.setminus(range(data.numFeatures), featuresToEliminate)
Example #15
0
def split(data, fraction, **args):
    """
    split a dataset into training and test sets.
    randomly splits a dataset into two datasets whose sizes are determined
    by the 'fraction' parameter (the first dataset will contain that fraction
    of the examples).

    for example:
    train, test = split(data, 0.7)
    will split the data -- 70% for training and 30% for test

    :Parameters:
      - `data` - a dataset object
      - `fraction` - the fraction of the examples to put in the first split

    :Keywords:
      - `stratified` - whether to perform stratified splitting, i.e. whether to 
        keep the class ratio in the two datasets [default: True]
      - `seed` - random number generator seed
      - `indicesOnly` - if this flag is set, the indices of the two splits are
        returned instead of the datasets [default: False]
    """

    # a private Random instance so we don't disturb the global RNG state
    if 'seed' in args:
        seed = args['seed']
        rand = random.Random(seed)
    else:
        rand = random.Random()

    indicesOnly = False
    if 'indicesOnly' in args:
        indicesOnly = args['indicesOnly']

    # accept either a Labels object or a dataset carrying labels
    if data.__class__.__name__ == 'Labels':
        labels = data
    else:
        labels = data.labels

    sampleSize = int(len(data) * fraction)

    stratified = True
    if 'stratified' in args:
        stratified = args['stratified']

    if stratified:
        # sample each class in proportion; the last class absorbs any
        # rounding slack so the total is exactly sampleSize
        patterns = []
        for i in range(labels.numClasses):
            if i < labels.numClasses - 1:
                numToSample = int(fraction * labels.classSize[i])
            else:
                numToSample = sampleSize - len(patterns)
            I = labels.classes[i][:]
            rand.shuffle(I)
            patterns.extend(I[:numToSample])
    else:
        # list() so the indices are shuffleable under python 3 as well
        # (identical to range() in python 2)
        I = list(range(len(data)))
        rand.shuffle(I)
        patterns = I[:sampleSize]
    patterns.sort()

    if not indicesOnly:
        return (data.__class__(data, patterns=patterns),
                data.__class__(data,
                               patterns=misc.setminus(range(len(data)),
                                                      patterns)))
    else:
        return patterns, misc.setminus(range(len(data)), patterns)
Example #16
0
def split(data, fraction, **args) :
    """
    split a dataset into training and test sets.
    randomly splits a dataset into two datasets whose sizes are determined
    by the 'fraction' parameter (the first dataset will contain that fraction
    of the examples).

    for example:
    train, test = split(data, 0.7)
    will split the data -- 70% for training and 30% for test

    :Parameters:
      - `data` - a dataset object
      - `fraction` - the fraction of the examples to put in the first split

    :Keywords:
      - `stratified` - whether to perform stratified splitting, i.e. whether to 
        keep the class ratio in the two datasets [default: True]
      - `seed` - random number generator seed
      - `indicesOnly` - if this flag is set, the indices of the two splits are
        returned instead of the datasets [default: False]
    """

    # a private Random instance so we don't disturb the global RNG state
    if 'seed' in args :
        seed = args['seed']
        rand = random.Random(seed)
    else :
        rand = random.Random()

    indicesOnly = False
    if 'indicesOnly' in args :
        indicesOnly = args['indicesOnly']

    # accept either a Labels object or a dataset carrying labels
    if data.__class__.__name__ == 'Labels' :
        labels = data
    else :
        labels = data.labels

    sampleSize = int(len(data) * fraction)

    stratified = True
    if 'stratified' in args :
        stratified = args['stratified']

    if stratified :
        # sample each class in proportion; the last class absorbs any
        # rounding slack so the total is exactly sampleSize.
        # NOTE: the original body mixed tab and space indentation here,
        # which is fragile in python 2 and an error in python 3 --
        # normalized to spaces.
        patterns = []
        for i in range(labels.numClasses) :
            if i < labels.numClasses - 1 :
                numToSample = int(fraction * labels.classSize[i])
            else :
                numToSample = sampleSize - len(patterns)
            I = labels.classes[i][:]
            rand.shuffle(I)
            patterns.extend(I[:numToSample])
    else :
        # list() so the indices are shuffleable under python 3 as well
        I = list(range(len(data)))
        rand.shuffle(I)
        patterns = I[:sampleSize]
    patterns.sort()

    if not indicesOnly :
        return (data.__class__(data, patterns = patterns), 
                data.__class__(data, patterns = misc.setminus(range(len(data)), patterns) ) )
    else :
        return patterns, misc.setminus(range(len(data)), patterns)