def selectFeatures(self, _data, *options, **args):
    """Greedy backward feature elimination driven by cross-validation.

    Repeatedly removes the feature whose removal yields the highest
    cross-validation score, until only ``self.targetNumFeatures``
    features remain.

    :Parameters:
      - `_data` - a dataset instance; not modified (working copies are
        made via ``_data.__class__(_data, deepcopy=True)``)

    :Returns: the indices (relative to `_data`) of the kept features.

    Side effects: fills ``self.eliminated`` with the names of removed
    features in removal order, and ``self.measures`` with the CV score
    recorded at each elimination step.
    """
    import re

    self.eliminated = []
    self.measures = []
    cvArgs = {}
    # measures of the form 'roc<N>area' (e.g. 'roc50area') carry N in
    # their name; extract it and pass it through to the CV routine
    rocExp = re.compile(r"roc(?P<rocN>[0-9]+)area")
    match = rocExp.match(self.measure)
    if match is not None:
        measureStr = 'rocNarea'
        cvArgs['rocN'] = match.groupdict()['rocN']
    else:
        measureStr = self.measure

    data = _data.__class__(_data, deepcopy=True)
    for i in range(self.targetNumFeatures, _data.numFeatures):
        # BUG FIX: maxScore used to start at 0, so if every candidate
        # scored 0 'bestFeatureName' was never assigned and the append
        # below raised a NameError.  Starting below any legal score
        # guarantees the first candidate is always recorded.
        maxScore = -1
        bestFeatureName = None
        # loop over the CURRENT features, scoring the removal of each
        for feature in range(data.numFeatures):
            featureName = data.featureID[feature]
            data.eliminateFeatures([feature])
            res = self.classifier.stratifiedCV(data, **cvArgs)
            score = getattr(res, measureStr)
            if score > maxScore:
                maxScore = score
                bestFeatureName = featureName
            # restore the working copy for the next candidate
            data = _data.__class__(_data, deepcopy=True)
            data.eliminateFeatures(data.featureNames2IDs(self.eliminated))
        # commit the best elimination found in this pass
        data = _data.__class__(_data, deepcopy=True)
        self.eliminated.append(bestFeatureName)
        data.eliminateFeatures(data.featureNames2IDs(self.eliminated))
        self.measures.append(maxScore)

    return misc.setminus(range(_data.numFeatures),
                         _data.featureNames2IDs(self.eliminated))
def selectFeatures(self, _data, *options, **args):
    """Backward elimination of features guided by stratified CV.

    At each step the feature whose removal gives the best
    cross-validation score is discarded, until the dataset is down to
    ``self.targetNumFeatures`` features.  The removed feature names are
    recorded in ``self.eliminated`` and the per-step scores in
    ``self.measures``; the indices of the kept features are returned.
    """
    import re

    self.eliminated = []
    self.measures = []
    cvArgs = {}
    # measures named 'roc<N>area' encode their N inside the name
    match = re.compile(r"roc(?P<rocN>[0-9]+)area").match(self.measure)
    if match is None:
        measureStr = self.measure
    else:
        measureStr = 'rocNarea'
        cvArgs['rocN'] = match.groupdict()['rocN']

    def freshCopy():
        # working copy so the caller's dataset is never touched
        return _data.__class__(_data, deepcopy=True)

    data = freshCopy()
    numToEliminate = _data.numFeatures - self.targetNumFeatures
    for step in range(numToEliminate):
        maxScore = 0
        # try removing each feature still present in the working copy
        for candidate in range(data.numFeatures):
            candidateName = data.featureID[candidate]
            data.eliminateFeatures([candidate])
            results = self.classifier.stratifiedCV(data, **cvArgs)
            candidateScore = getattr(results, measureStr)
            if candidateScore > maxScore:
                maxScore, bestFeatureName = candidateScore, candidateName
            # rebuild the working copy for the next candidate
            data = freshCopy()
            data.eliminateFeatures(data.featureNames2IDs(self.eliminated))
        # permanently drop the winning candidate
        data = freshCopy()
        self.eliminated.append(bestFeatureName)
        data.eliminateFeatures(data.featureNames2IDs(self.eliminated))
        self.measures.append(maxScore)

    return misc.setminus(range(_data.numFeatures),
                         _data.featureNames2IDs(self.eliminated))
def makeFolds(data, numFolds, datasetName, directory='.'):
    """split a dataset into several folds and save the training and
    testing data of each fold as a separate dataset

    :Parameters:
      - `data` - a dataset instance
      - `numFolds` - number of folds into which to split the data
      - `datasetName` - string to use for the file names
      - `directory` - the directory into which to deposit the files

    Files are written as
    ``<directory>/<datasetName>Testing<fold>.data`` and
    ``<directory>/<datasetName>Training<fold>.data``.
    """
    # BUG FIX: materialize the index list and use floor division so the
    # code also runs under Python 3, where range objects cannot be
    # shuffled and '/' would make foldSize a float (breaking slicing).
    # Behavior under Python 2 is unchanged.
    perm = list(range(len(data)))
    random.shuffle(perm)
    foldSize = len(data) // numFolds
    for fold in range(numFolds):
        if fold < numFolds - 1:
            testingPatterns = perm[foldSize * fold : foldSize * (fold + 1)]
        else:
            # the last fold absorbs the remainder patterns
            testingPatterns = perm[foldSize * fold :]
        trainingPatterns = misc.setminus(range(len(data)), testingPatterns)
        trainingData = data.__class__(data, patterns=trainingPatterns)
        testingData = data.__class__(data, patterns=testingPatterns)
        testingDataName = os.path.join(
            directory, datasetName + 'Testing' + str(fold) + '.data')
        testingData.save(testingDataName)
        trainingDataName = os.path.join(
            directory, datasetName + 'Training' + str(fold) + '.data')
        trainingData.save(trainingDataName)
def makeFolds(data, numFolds, datasetName, directory='.'):
    '''split a dataset into several folds and save each fold's training
    and testing data as separate dataset files

    data - a dataset instance
    numFolds - number of folds into which to split the data
    datasetName - string to use for the file names
    directory - the directory into which to deposit the files
    '''
    numPatterns = len(data)
    indices = range(numPatterns)
    random.shuffle(indices)
    foldSize = numPatterns / numFolds
    for fold in range(numFolds):
        start = foldSize * fold
        if fold == numFolds - 1:
            # final fold takes whatever is left over
            end = numPatterns
        else:
            end = foldSize * (fold + 1)
        testingPatterns = indices[start:end]
        trainingPatterns = misc.setminus(range(numPatterns), testingPatterns)
        # save the testing subset first, then the training subset
        for role, patternList in (('Testing', testingPatterns),
                                  ('Training', trainingPatterns)):
            subset = data.__class__(data, patterns=patternList)
            fileName = os.path.join(
                directory, datasetName + role + str(fold) + '.data')
            subset.save(fileName)
def keepFeatures(self, features):
    """eliminate all but the given list of features

    INPUT:
    features - a list of features to KEEP; these are either numbers
    between 0 and numFeatures-1 (indices of features, not their IDs)
    or featureIDs
    """
    # BUG FIX: the docstring used to say the listed features are
    # eliminated, but they are the ones that are kept.
    # Guard against an empty list so features[0] cannot raise an
    # IndexError; an empty list simply eliminates every feature.
    if features and isinstance(features[0], str):
        features = self.featureNames2IDs(features)
    self.eliminateFeatures(misc.setminus(range(self.numFeatures), features))
def keepFeatures(self, features):
    """Retain only the listed features, eliminating every other one.

    INPUT:
    features - the features to keep, given either as indices between 0
    and numFeatures-1 or as featureID strings
    """
    # string entries are feature IDs; translate them to indices first
    if type(features[0]) == type(''):
        features = self.featureNames2IDs(features)
    toEliminate = misc.setminus(range(self.numFeatures), features)
    self.eliminateFeatures(toEliminate)
def selectFeatures(self, data, targetClass=None, otherClass=None,
                   *options, **args):
    """Score each feature and return the indices of those to keep.

    :Parameters:
      - `data` - a dataset whose features are to be filtered
      - `targetClass`, `otherClass` - optional class arguments passed
        through to ``self.featureScore.score``

    :Returns: the indices of the features that survive filtering.

    The elimination criterion depends on ``self.mode``:
      - 'byNum' - keep only the ``self.numFeatures`` highest scorers
      - 'byThreshold' - drop features scoring below ``self.threshold``
      - 'bySignificance' - drop features scoring below per-feature
        significance thresholds computed from the data (also stored in
        ``self.thresholds``)

    :Raises: ValueError for an unrecognized ``self.mode``.
    """
    s = self.featureScore.score(data, targetClass, otherClass, **args)
    if self.mode == "byNum":
        # argsort is ascending, so the lowest scorers come first
        featuresToEliminate = numpy.argsort(s)[:data.numFeatures - self.numFeatures]
    elif self.mode == "byThreshold":
        featuresToEliminate = numpy.nonzero(numpy.less(s, self.threshold))[0]
    elif self.mode == "bySignificance":
        t = self.significanceThreshold(data)
        self.thresholds = t
        featuresToEliminate = numpy.nonzero(numpy.less(s, t))[0]
    else:
        # FIX: modernized from the Python-2-only 'raise E, msg' form;
        # the call form is equivalent and valid in both Python 2 and 3
        raise ValueError('unknown elimination mode in filter')
    return misc.setminus(range(data.numFeatures), featuresToEliminate)
def eliminateFeatures(self, featureList):
    """eliminate a list of features from a dataset

    Input:
    featureList - a list of features to eliminate; these are numbers
    between 0 and numFeatures-1 (indices of features, not their IDs)
    """
    # Nothing to eliminate: return early instead of rebuilding X with a
    # full-width numpy.take (same end state, no pointless copy).
    # (Dead commented-out code from an older variant was removed.)
    if len(featureList) == 0:
        return
    featuresToTake = misc.setminus(range(self.numFeatures), featureList)
    featuresToTake.sort()
    # keep the ID list aligned with the retained columns
    self.featureID = [self.featureID[i] for i in featuresToTake]
    # drop the eliminated columns (axis 1 indexes features)
    self.X = numpy.take(self.X, featuresToTake, 1)
def eliminateFeatures(self, featureList):
    """eliminate a list of features from a dataset

    Input:
    featureList - a list of features to eliminate; these are either
    numbers between 0 and numFeatures-1 (indices of features, not
    their IDs) or featureID strings
    """
    if len(featureList) == 0:
        return
    if type(featureList[0]) == type(''):
        # BUG FIX: this used to call self.featureNames2IDs(features),
        # referencing an undefined name and raising a NameError
        # whenever feature IDs (strings) were passed in.
        featureList = self.featureNames2IDs(featureList)
    featuresToTake = misc.setminus(range(self.numFeatures), featureList)
    featuresToTake.sort()
    # keep the per-feature bookkeeping aligned with the kept columns
    self.featureID = [self.featureID[i] for i in featuresToTake]
    self.featureKey = [self.featureKey[i] for i in featuresToTake]
    # rebuild the key -> position index for the surviving features
    self.featureKeyDict = {}
    for i in range(len(self.featureKey)):
        self.featureKeyDict[self.featureKey[i]] = i
    # drop the eliminated columns (axis 1 indexes features)
    self.X = numpy.take(self.X, featuresToTake, 1)
def cv(classifier, data, numFolds=5, **args):
    """perform k-fold cross validation

    :Parameters:
      - `classifier` - a classifier template
      - `data` - a dataset
      - `numFolds` - number of cross validation folds (default = 5)

    :Returns:
      a Results object.

    :Keywords:
      - `numFolds` - number of cross validation folds (default = 5)
      - `seed` - random number generator seed
      - `foldsToPerform` - number of folds to actually perform (in case
        you're doing n fold CV, and want to save time, and only do some
        of the folds)

    :Raises: ValueError if foldsToPerform > numFolds.
    """
    if 'numFolds' in args:
        numFolds = args['numFolds']
    if 'seed' in args:
        random.seed(args['seed'])
    foldsToPerform = args.get('foldsToPerform', numFolds)
    if foldsToPerform > numFolds:
        # FIX: modernized from the Python-2-only 'raise E, msg' form
        raise ValueError('foldsToPerform > numFolds')
    # BUG FIX: materialize the index list and use floor division so the
    # code also runs under Python 3 (range objects cannot be shuffled,
    # and '/' would make foldSize a float, breaking the slices below).
    perm = list(range(len(data)))
    random.shuffle(perm)
    foldSize = len(data) // numFolds
    trainingPatterns = []
    testingPatterns = []
    for fold in range(foldsToPerform):
        if fold < numFolds - 1:
            testingPatterns.append(perm[foldSize * fold : foldSize * (fold + 1)])
        else:
            # the last fold absorbs the remainder patterns
            testingPatterns.append(perm[foldSize * fold :])
        trainingPatterns.append(
            misc.setminus(range(len(data)), testingPatterns[-1]))
    return cvFromFolds(classifier, data, trainingPatterns, testingPatterns,
                       **args)
def cv(classifier, data, numFolds=5, **args):
    """perform k-fold cross validation

    :Parameters:
      - `classifier` - a classifier template
      - `data` - a dataset
      - `numFolds` - number of cross validation folds (default = 5)

    :Returns:
      a Results object.

    :Keywords:
      - `numFolds` - number of cross validation folds (default = 5)
      - `seed` - random number generator seed
      - `foldsToPerform` - number of folds to actually perform (useful
        to save time by running only part of an n fold CV)
    """
    numFolds = args.get('numFolds', numFolds)
    if 'seed' in args:
        random.seed(args['seed'])
    foldsToPerform = args.get('foldsToPerform', numFolds)
    if foldsToPerform > numFolds:
        raise ValueError('foldsToPerform > numFolds')
    numPatterns = len(data)
    shuffled = range(numPatterns)
    random.shuffle(shuffled)
    foldSize = numPatterns / numFolds
    trainingPatterns = []
    testingPatterns = []
    for fold in range(foldsToPerform):
        start = foldSize * fold
        if fold == numFolds - 1:
            # final fold takes whatever is left over
            end = numPatterns
        else:
            end = foldSize * (fold + 1)
        testFold = shuffled[start:end]
        testingPatterns.append(testFold)
        trainingPatterns.append(misc.setminus(range(numPatterns), testFold))
    return cvFromFolds(classifier, data, trainingPatterns, testingPatterns,
                       **args)
def loo(classifier, data, **args):
    """perform Leave One Out

    :Returns: a results object

    USAGE: loo(classifier, data)
    """
    results = classifier.resultsObject()
    # statistics are computed once at the end rather than per fold
    args['stats'] = False
    numPatterns = len(data)
    for heldOut in range(numPatterns):
        training = misc.setminus(range(numPatterns), [heldOut])
        results.extend(
            classifier.trainTest(data, training, [heldOut], **args))
    results.computeStats()
    return results
def loo(classifier, data, **args):
    """Leave One Out: each pattern in turn serves as the single test
    example while all remaining patterns form the training set.

    :Returns: a results object

    USAGE: loo(classifier, data)
    """
    # defer statistics until all folds have been accumulated
    args['stats'] = False
    looResults = classifier.resultsObject()
    for testPattern in range(len(data)):
        trainingSet = misc.setminus(range(len(data)), [testPattern])
        looResults.extend(
            classifier.trainTest(data, trainingSet, [testPattern], **args))
    looResults.computeStats()
    return looResults
def selectFeatures(self, data, targetClass=None, otherClass=None,
                   *options, **args):
    """Filter features by score; return the indices of the kept features.

    Which features are eliminated depends on self.mode: 'byNum' keeps
    only the top self.numFeatures scorers, 'byThreshold' drops scores
    below self.threshold, and 'bySignificance' drops scores below
    per-feature thresholds derived from the data.  Any other mode
    raises a ValueError.
    """
    scores = self.featureScore.score(data, targetClass, otherClass, **args)
    mode = self.mode
    if mode == "byNum":
        # ascending argsort puts the weakest features first
        numToDrop = data.numFeatures - self.numFeatures
        eliminate = numpy.argsort(scores)[:numToDrop]
    elif mode == "byThreshold":
        eliminate = numpy.nonzero(numpy.less(scores, self.threshold))[0]
    elif mode == "bySignificance":
        self.thresholds = self.significanceThreshold(data)
        eliminate = numpy.nonzero(numpy.less(scores, self.thresholds))[0]
    else:
        raise ValueError('unknown elimination mode in filter')
    return misc.setminus(range(data.numFeatures), eliminate)
def split(data, fraction, **args):
    """split a dataset into training and test sets.

    randomly splits a dataset into two datasets whose sizes are
    determined by the 'fraction' parameter (the first dataset will
    contain that fraction of the examples).  for example:
    train, test = split(data, 0.7)
    will split the data -- 70% for training and 30% for test

    :Parameters:
      - `data` - a dataset object
      - `fraction` - the fraction of the examples to put in the first split

    :Keywords:
      - `stratified` - whether to perform stratified splitting, i.e.
        whether to keep the class ratio in the two datasets
        [default: True]
      - `seed` - random number generator seed
      - `indicesOnly` - if this flag is set, the indices of the two
        splits are returned instead of the datasets [default: False]
    """
    if 'seed' in args:
        rand = random.Random(args['seed'])
    else:
        rand = random.Random()
    indicesOnly = args.get('indicesOnly', False)
    # 'data' may already be a Labels object rather than a dataset
    if data.__class__.__name__ == 'Labels':
        labels = data
    else:
        labels = data.labels
    sampleSize = int(len(data) * fraction)
    stratified = args.get('stratified', True)
    if stratified:
        # sample from each class separately, proportionally to its size
        patterns = []
        for i in range(labels.numClasses):
            if i < labels.numClasses - 1:
                numToSample = int(fraction * labels.classSize[i])
            else:
                # last class makes up whatever is missing so the first
                # split has exactly sampleSize patterns
                numToSample = sampleSize - len(patterns)
            I = labels.classes[i][:]
            rand.shuffle(I)
            patterns.extend(I[:numToSample])
    else:
        # BUG FIX: materialize the list so shuffling also works under
        # Python 3, where range() is not a mutable sequence; behavior
        # under Python 2 is unchanged.
        I = list(range(len(data)))
        rand.shuffle(I)
        patterns = I[:sampleSize]
    patterns.sort()
    rest = misc.setminus(range(len(data)), patterns)
    if indicesOnly:
        return patterns, rest
    return (data.__class__(data, patterns=patterns),
            data.__class__(data, patterns=rest))
def split(data, fraction, **args):
    """split a dataset into training and test sets.

    randomly splits a dataset into two datasets; the first one receives
    the given fraction of the examples, e.g.
    train, test = split(data, 0.7)
    puts 70% of the data in 'train' and 30% in 'test'.

    :Parameters:
      - `data` - a dataset object
      - `fraction` - the fraction of the examples to put in the first split

    :Keywords:
      - `stratified` - whether to keep the class ratio in the two
        datasets [default: True]
      - `seed` - random number generator seed
      - `indicesOnly` - return the index lists of the two splits
        instead of dataset objects [default: False]
    """
    if 'seed' in args:
        generator = random.Random(args['seed'])
    else:
        generator = random.Random()
    indicesOnly = False
    if 'indicesOnly' in args:
        indicesOnly = args['indicesOnly']
    # 'data' may itself be a Labels object
    if data.__class__.__name__ == 'Labels':
        labels = data
    else:
        labels = data.labels
    sampleSize = int(len(data) * fraction)
    stratified = True
    if 'stratified' in args:
        stratified = args['stratified']
    if stratified:
        # draw from each class separately, proportionally to its size
        patterns = []
        for classIdx in range(labels.numClasses):
            if classIdx < labels.numClasses - 1:
                classSample = int(fraction * labels.classSize[classIdx])
            else:
                # last class makes up whatever is still missing
                classSample = sampleSize - len(patterns)
            members = labels.classes[classIdx][:]
            generator.shuffle(members)
            patterns.extend(members[:classSample])
    else:
        shuffled = range(len(data))
        generator.shuffle(shuffled)
        patterns = shuffled[:sampleSize]
    patterns.sort()
    if indicesOnly:
        return patterns, misc.setminus(range(len(data)), patterns)
    return (data.__class__(data, patterns=patterns),
            data.__class__(data,
                           patterns=misc.setminus(range(len(data)),
                                                  patterns)))