def __init__(self, data) : if type(data) == type('') : print 'file name:', data data = datafunc.PyVectorDataSet(data, idColumn = 0, headerRow = True, hint = 'csv') self.data = data self.idDict = misc.list2dict(data.labels.patternID, range(len(data))) print numpy.shape(data.X) self.mean = numpy.mean(data.X, 1) self.std = std(data.X, 1) eps = 1e-5 I = numpy.nonzero(numpy.less(self.std, eps))[0] print 'num zeros:',len(I) numpy.put(self.std, I, 1) self.numCorrelations = 10000 correlations = numpy.zeros(self.numCorrelations, numpy.float) for i in range(self.numCorrelations) : i1 = random.randrange(0, len(data)) i2 = random.randrange(0, len(data)) correlations[i] = self._corrcoef(i1, i2) self.meanCorrelation = numpy.mean(correlations) self.numCorrelations = 1000
def sortKernel2(kernelInFile, kernelOutFile, ids, format = 'gist', **args) :
    """sort a kernel matrix according to the given list of ids

    :Parameters:
      - `kernelInFile` - the kernel input file name
      - `kernelOutFile` - the output file name
      - `format` - whether to output the kernel in gist format
    :Keywords:
      - `delim` - the field delimiter (default = tab)
    """
    from PyML.containers import KernelData
    kdata = KernelData(kernelInFile)
    K = kdata.getKernelMatrix()
    # map each requested id to its row/column position:
    idDict = misc.list2dict(ids, range(len(ids)))
    delim = args.get('delim', '\t')
    kernelFile = open(kernelOutFile, 'w')
    try :
        if format == 'gist' :
            # gist format starts with a header row listing the ids:
            kernelFile.write(kernelOutFile + delim + delim.join(ids) + '\n')
        for id1 in ids :
            kernelFile.write(id1 + delim)
            tokens = [str(K[idDict[id1]][idDict[id2]]) for id2 in ids]
            kernelFile.write(delim.join(tokens) + '\n')
    finally :
        # BUG FIX: the output file was never closed
        kernelFile.close()
def __init__(self, data): if type(data) == type(''): print 'file name:', data data = datafunc.PyVectorDataSet(data, idColumn=0, headerRow=True, hint='csv') self.data = data self.idDict = misc.list2dict(data.labels.patternID, range(len(data))) print numpy.shape(data.X) self.mean = numpy.mean(data.X, 1) self.std = std(data.X, 1) eps = 1e-5 I = numpy.nonzero(numpy.less(self.std, eps))[0] print 'num zeros:', len(I) numpy.put(self.std, I, 1) self.numCorrelations = 10000 correlations = numpy.zeros(self.numCorrelations, numpy.float) for i in range(self.numCorrelations): i1 = random.randrange(0, len(data)) i2 = random.randrange(0, len(data)) correlations[i] = self._corrcoef(i1, i2) self.meanCorrelation = numpy.mean(correlations) self.numCorrelations = 1000
def constructFromFile(self, fileName): patternIDdict = misc.list2dict(self._data.labels.patternID, range(len(self._data))) labels = Labels(fileName) patterns = [] pairs = [] for i in range(len(labels)): p1, p2 = labels.patternID[i].split('_') # add only pairs for which we have kernel data: if p1 in patternIDdict and p2 in patternIDdict: pairs.append((patternIDdict[p1], patternIDdict[p2])) patterns.append(i) else: print p1, ' or ', p2, 'not found' labels = labels.__class__(labels, patterns=patterns) self.pairs = pairs first = [pair[0] for pair in pairs] second = [pair[1] for pair in pairs] firstVector = arrayWrap.intVector([pair[0] for pair in pairs]) secondVector = arrayWrap.intVector([pair[1] for pair in pairs]) self.callConstructor(firstVector, secondVector) WrapperDataSet.attachLabels(self, labels)
def copyConstruct(self, other, **args) : if not hasattr(other, 'decisionFunc') : raise AttributeError, 'not a valid results object' if 'patterns' in args : p = args['patterns'] idDict = misc.list2dict(other.patternID, range(len(other.patternID))) patterns = [idDict[pattern] for pattern in p if pattern in idDict] else : patterns = range(len(other.Y)) self.patternID = [other.patternID[p] for p in patterns] self.L = [other.L[p] for p in patterns] self.Y = [other.Y[p] for p in patterns] self.decisionFunc = [other.decisionFunc[p] for p in patterns] self.givenY = [other.givenY[p] for p in patterns] self.givenL = [other.givenL[p] for p in patterns] self.rocN = 50 self.classLabels = copy.deepcopy(other.classLabels) self.numClasses = len(self.classLabels) self.info = other.info try : self.log = other.log except : pass self.computeStats()
def constructFromFile(self, fileName, **args) : if 'data' not in args : raise ValueError, 'missing data object' self._data = args['data'] patternIDdict = misc.list2dict(self._data.labels.patternID, range(len(self._data))) labels = Labels(fileName) patterns = [] pairs = [] for i in range(len(labels)) : p1,p2 = labels.patternID[i].split('_') # add only pairs for which we have kernel data: if p1 in patternIDdict and p2 in patternIDdict : pairs.append((patternIDdict[p1],patternIDdict[p2])) patterns.append(i) else : print p1, ' or ', p2, 'not found' labels = labels.__class__(labels, patterns = patterns) self.pairs = pairs first = [pair[0] for pair in pairs] second = [pair[1] for pair in pairs] firstVector = arrayWrap.intVector([pair[0] for pair in pairs]) secondVector = arrayWrap.intVector([pair[1] for pair in pairs]) self.callConstructor(firstVector, secondVector) WrapperDataSet.attachLabels(self, labels)
def sortKernel(kernelInFile, kernelOutFile, format = 'gist', **args) :
    """sort a kernel matrix according to its pattern ID

    :Parameters:
      - `kernelInFile` - the kernel input file name
      - `kernelOutFile` - the output file name
      - `format` - whether to output the kernel in gist format
    :Keywords:
      - `delim` - the field delimiter (default = tab)
    """
    from PyML.containers import KernelData
    kdata = KernelData(kernelInFile)
    # map pattern ID --> row index before sorting the ids:
    idDict = misc.list2dict(kdata.labels.patternID, range(len(kdata)))
    ids = kdata.labels.patternID[:]
    ids.sort()
    delim = args.get('delim', '\t')
    kernelFile = open(kernelOutFile, 'w')
    try :
        if format == 'gist' :
            # gist format starts with a header row listing the ids:
            kernelFile.write(kernelOutFile + delim + delim.join(ids) + '\n')
        for id1 in ids :
            kernelFile.write(id1 + delim)
            tokens = [str(kdata.kernel.eval(kdata, idDict[id1], idDict[id2]))
                      for id2 in ids]
            kernelFile.write(delim.join(tokens) + '\n')
    finally :
        # BUG FIX: the output file was never closed
        kernelFile.close()
def constructFromFile(self, fileName) : delim = ',' if self.data is not None : patternIDdict = misc.list2dict(self.data.labels.patternID, range(len(self.data))) else : patternIDdict = {} L = [] patternID = [] pairs = [] file = open(fileName) for line in file : tokens = line[:-1].split(delim) #patternID.append(tokens[0]) p1,p2 = tokens[0].split('_') if p1 > p2 : p1,p2 = p2,p1 # add only pairs for which we have kernel data: if p1 in patternIDdict and p2 in patternIDdict or self.data is None : pairs.append((p1,p2)) L.append(tokens[1]) patternID.append('_'.join([p1,p2])) else : print p1, ' or ', p2, 'not found' self.pairs = pairs self.labels = Labels(L, patternID = patternID)
def constructFromFile(self, fileName): patternIDdict = misc.list2dict(self._data.labels.patternID, range(len(self._data))) labels = Labels(fileName) patterns = [] pairs = [] for i in range(len(labels)): p1, p2 = labels.patternID[i].split("_") # add only pairs for which we have kernel data: if p1 in patternIDdict and p2 in patternIDdict: pairs.append((patternIDdict[p1], patternIDdict[p2])) patterns.append(i) else: print p1, " or ", p2, "not found" labels = labels.__class__(labels, patterns=patterns) self.pairs = pairs first = [pair[0] for pair in pairs] second = [pair[1] for pair in pairs] firstVector = arrayWrap.intVector([pair[0] for pair in pairs]) secondVector = arrayWrap.intVector([pair[1] for pair in pairs]) self.callConstructor(firstVector, secondVector) WrapperDataSet.attachLabels(self, labels)
def constructFromFile(self, fileName): delim = ',' if self.data is not None: patternIDdict = misc.list2dict(self.data.labels.patternID, range(len(self.data))) else: patternIDdict = {} L = [] patternID = [] pairs = [] file = open(fileName) for line in file: tokens = line[:-1].split(delim) #patternID.append(tokens[0]) p1, p2 = tokens[0].split('_') if p1 > p2: p1, p2 = p2, p1 # add only pairs for which we have kernel data: if p1 in patternIDdict and p2 in patternIDdict or self.data is None: pairs.append((p1, p2)) L.append(tokens[1]) patternID.append('_'.join([p1, p2])) else: print p1, ' or ', p2, 'not found' self.pairs = pairs self.labels = Labels(L, patternID=patternID)
def copyConstruct(self, other, **args):
    """Copy-construct this dataset from `other`.

    :Keywords: (mutually exclusive selectors; default is all patterns)
      - `patterns` - indices, or pattern IDs (strings), of patterns to copy
      - `classes` - keep only patterns whose class label (L) is listed
      - `classID` - keep only patterns whose numeric class (Y) is listed
    """
    forgetClassLabels = False
    if "patterns" in args:
        patterns = args['patterns']
        # if the patterns are ids (strings) convert them to indices:
        if type(patterns[0]) == type(''):
            idDict = misc.list2dict(patterns)
            patternsToCopy = [
                i for i in range(len(other))
                if other.labels.patternID[i] in idDict
            ]
        else:
            patternsToCopy = patterns
    elif "classes" in args:
        patternsToCopy = [
            i for i in range(len(other))
            if other.labels.L[i] in args["classes"]
        ]
        # the copied subset may no longer contain all classes:
        forgetClassLabels = True
    elif "classID" in args:
        patternsToCopy = [
            i for i in range(len(other))
            if other.labels.Y[i] in args["classID"]
        ]
        forgetClassLabels = True
    else:
        patternsToCopy = range(len(other))
    self.setTrainingFunc(other.trainingFunc)
    self.setTestingFunc(other.testingFunc)
    # class dependent copying of data:
    self.copy(other, patternsToCopy)
    self.attachKernel(other)
    self.attachLabels(
        Labels(other.labels,
               patterns=patternsToCopy,
               forgetClassLabels=forgetClassLabels))
    # copy the registered attribute:
    if hasattr(other, '_registeredAttributes'):
        self._registeredAttributes = other._registeredAttributes[:]
        self._actions = copy.deepcopy(other._actions)
        for attr in self._registeredAttributes:
            a = getattr(other, attr)
            if type(a) == type([]):
                # list attributes are subset with the same pattern selection;
                # a length mismatch means the attribute is corrupt:
                if len(a) != len(other):
                    raise ValueError, 'attribute has bad length'
                #BaseDataSet.__setattr__(self, attr,
                #                        [a[i] for i in patternsToCopy])
                setattr(self, attr, [a[i] for i in patternsToCopy])
            elif hasattr(a, 'type') and a.type == 'dataset' and len(
                    a) == len(self):
                # a registered dataset attribute is copied with the same
                # pattern subset:
                acopy = a.__class__(a, patterns=patternsToCopy)
                setattr(self, attr, acopy)
            else:
                # anything else is shared by reference
                setattr(self, attr, a)
def copyConstruct(self, other, **args) :
    """Copy-construct this dataset from `other`.

    :Keywords: (the first three are mutually exclusive selectors;
      default is all patterns)
      - `patterns` - indices, or pattern IDs (strings), of patterns to copy
      - `classes` - keep only patterns whose class label (L) is listed
      - `classID` - keep only patterns whose numeric class (Y) is listed
      - `deepcopy` - whether the underlying data is deep-copied
        (default = True)
    """
    forgetClassLabels = False
    if "patterns" in args:
        patterns = args['patterns']
        # if the patterns are ids (strings) convert them to indices:
        if type(patterns[0]) == type('') :
            idDict = misc.list2dict(patterns)
            patternsToCopy = [i for i in range(len(other))
                              if other.labels.patternID[i] in idDict]
        else :
            patternsToCopy = patterns
    elif "classes" in args :
        patternsToCopy = [i for i in range(len(other))
                          if other.labels.L[i] in args["classes"]]
        # the copied subset may no longer contain all classes:
        forgetClassLabels = True
    elif "classID" in args :
        patternsToCopy = [i for i in range(len(other))
                          if other.labels.Y[i] in args["classID"]]
        forgetClassLabels = True
    else :
        patternsToCopy = range(len(other))
    self.setTrainingFunc(other.trainingFunc)
    self.setTestingFunc(other.testingFunc)
    deepcopy = True
    if 'deepcopy' in args :
        deepcopy = args['deepcopy']
    # class dependent copying of data:
    self.copy(other, patternsToCopy, deepcopy)
    self.attachKernel(other)
    self.attachLabels(Labels(other.labels,
                             patterns = patternsToCopy,
                             forgetClassLabels = forgetClassLabels))
    # copy the registered attribute:
    if hasattr(other, '_registeredAttributes') :
        self._registeredAttributes = other._registeredAttributes[:]
        self._actions = copy.deepcopy(other._actions)
        for attr in self._registeredAttributes :
            a = getattr(other, attr)
            if type(a) == type([]) :
                # list attributes are subset with the same pattern selection;
                # a length mismatch means the attribute is corrupt:
                if len(a) != len(other) :
                    raise ValueError, 'attribute has bad length'
                #BaseDataSet.__setattr__(self, attr,
                #                        [a[i] for i in patternsToCopy])
                setattr(self, attr, [a[i] for i in patternsToCopy])
            elif hasattr(a, 'type') and a.type == 'dataset' and len(a) == len(self) :
                # a registered dataset attribute is copied with the same
                # pattern subset:
                acopy = a.__class__(a, patterns = patternsToCopy)
                setattr(self, attr, acopy)
            else :
                # anything else is shared by reference
                setattr(self, attr, a)
def addFeature(self, id, values) : hashID = hash(id) if hashID in self.featureKeyDict : raise ValueError, 'feature already exists, or hash problem' for i in range(len(self)) : if values[i] != 0 : self.X[i][hashID] = values[i] # update the featureKey, featureID attributes: pos = numpy.searchsorted(self.featureKey, hashID) self.featureKey.insert(pos, hashID) self.featureID.insert(pos, id) self.featureKeyDict = misc.list2dict(self.featureKey, range(len(self.featureKey)))
def addFeature(self, id, values):
    """Add a feature to the dataset.

    :Parameters:
      - `id` - the ID of the new feature
      - `values` - a sequence of feature values, one per pattern;
        only nonzero values are stored
    """
    hashID = hash(id)
    if hashID in self.featureKeyDict:
        raise ValueError, 'feature already exists, or hash problem'
    # store only nonzero values (sparse representation):
    for i in range(len(self)):
        if values[i] != 0:
            self.X[i][hashID] = values[i]
    # update the featureKey, featureID attributes:
    # insert at the position that keeps featureKey sorted
    pos = numpy.searchsorted(self.featureKey, hashID)
    self.featureKey.insert(pos, hashID)
    self.featureID.insert(pos, id)
    # rebuild the feature-hash --> position mapping:
    self.featureKeyDict = misc.list2dict(self.featureKey,
                                         range(len(self.featureKey)))
def expandKernel(inKernelFile, referenceKernelFile, outKernelFile, **args) : """ Given a kernel matrix that might have missing entries, fill those as 0 on the basis of the patterns in a reference kernel (it is checked that the reference kernel is sorted). :Parameters: - `inKernelFile` - input kernel file name - `referenceKernelFile` - file name for the reference kernel - `outKernelFile` - file name to output expanded kernel """ if 'format' in args : format = args['format'] else : format = 'gist' delim = '\t' from datafunc import KernelData import misc import numpy inKernel = KernelData(inKernelFile) refKernel = KernelData(referenceKernelFile) print 'loaded data' ids = refKernel.labels.patternID[:] ids.sort() if ids != refKernel.labels.patternID : raise ValueError, 'reference kernel not sorted' idDict = misc.list2dict(inKernel.labels.patternID) outKernel = open(outKernelFile, 'w') if format == 'gist' : outKernel.write(outKernelFile + delim) outKernel.write(delim.join(ids) + '\n') for i in range(len(refKernel)) : outKernel.write(id1 + delim) for j in range(len(refKernel)) : values = numpy.zeros(len(refKernel), numpy.float_) if ids[i] in idDict and ids[j] in idDict : values[j] = inKernel.kernel.eval(inKernel, idDict[ids[i]],idDict[ids[j]]) tokens = [str(value) for value in values] outKernel.write(delim.join(tokens) + '\n')
def constructFromFile(self, file_name, **args) : if 'data' not in args : raise ValueError, 'missing data object' self._data = args['data'] id_dict = misc.list2dict(self._data.labels.patternID, range(len(self._data))) file_handle = open(file_name) L = [] sets = [] for line in file_handle : tokens = line.split() sets.append([id_dict[token] for token in tokens[:-1] ]) L.append(tokens[-1]) self.n = len(sets) self.callConstructor(len(sets)) for s in sets : self.add(tuple(s)) labels = Labels(L) WrapperDataSet.attachLabels(self, labels)
def updateFeatureDict(self, arg = None) :
    """Bring featureDict back in sync with featureID after a change.

    :Parameters:
      - `arg` - describes what changed: another dataset whose features were
        appended, a list of eliminated feature indices, or the ID (int or
        string) of a single added feature; None forces a full rebuild
    """
    if arg.__class__ == self.__class__ :
        # features were extended with those in another dataset
        self.featureID.extend(arg.featureID)
    elif type(arg) == list :
        # features were eliminated; drop the listed indices
        removed = misc.list2dict(arg)
        self.featureID = [fid for idx, fid in enumerate(self.featureID)
                          if idx not in removed]
    elif type(arg) == type(1) or type(arg) == type('') :
        # a single feature was appended -- an incremental update suffices
        self.featureID.append(arg)
        self.featureDict[arg] = self.numFeatures - 1
        return
    # full rebuild of the feature ID --> position mapping:
    self.featureDict = dict((fid, idx)
                            for idx, fid in enumerate(self.featureID))
def updateFeatureDict(self, arg = None) :
    """Re-synchronize featureDict and featureKeyDict with featureID.

    :Parameters:
      - `arg` - describes what changed: another dataset whose features were
        appended, a list of eliminated feature indices, or the ID (int or
        string) of a single added feature; None forces a plain rebuild
    """
    if arg.__class__ == self.__class__ :
        # features were extended with those in another dataset;
        # keep featureID ordered by feature hash value:
        other = arg
        self.featureID.extend(other.featureID)
        self.featureID.sort(cmp = lambda x,y : cmp(hash(x), hash(y)))
    elif type(arg) == list :
        #features were eliminated:
        eliminated = misc.list2dict(arg)
        self.featureID = [self.featureID[i]
                          for i in range(len(self.featureID))
                          if i not in eliminated]
    elif type(arg) == type(1) or type(arg) == type('') :
        # a feature was added:
        id = arg
        self.featureID.append(id)
        self.featureID.sort(cmp = lambda x,y : cmp(hash(x), hash(y)))
    # rebuild both mappings: feature ID --> position and hash --> position
    self.featureDict = {}
    self.featureKeyDict = {}
    for i in range(len(self.featureID)) :
        self.featureDict[self.featureID[i]] = i
        self.featureKeyDict[hash(self.featureID[i])] = i
def score2(self, data, **args): featuresPerForest = int(math.ceil( float(self.maxSize) / len(data))) - 10 numForests = int(math.ceil( float(data.numFeatures) / featuresPerForest)) perm = range(data.numFeatures) random.shuffle(perm) scores = numpy.zeros(data.numFeatures, numpy.Float) pvals = numpy.zeros(data.numFeatures, numpy.Float) featureIDdict = misc.list2dict(data.featureID, range(data.numFeatures)) print 'numForests', numForests, featuresPerForest, data.numFeatures for i in range(numForests): print 'forest number', i + 1 if i < numForests - 1: features = perm[featuresPerForest * i:featuresPerForest * (i + 1)] else: features = perm[featuresPerForest * i:] subdata = data.__class__(data, 'deepcopy') subdata.keepFeatures(features) subscores = self.score(subdata, **args) for j in range(subdata.numFeatures): scores[featureIDdict[subdata.featureID[j]]] = subscores[j] pvals[featureIDdict[subdata.featureID[j]]] = self.pvals[j] # re-rank all the best features together: bestFeatures = numpy.argsort(scores)[:featuresPerForest] print featuresPerForest print 'length of best Features', len(bestFeatures) subdata = data.__class__(data, 'deepcopy') subdata.keepFeatures(bestFeatures) subscores = self.score(subdata, **args) for j in range(subdata.numFeatures): scores[featureIDdict[subdata.featureID[j]]] = subscores[j] pvals[featureIDdict[subdata.featureID[j]]] = self.pvals[j] self.pvals = pvals return scores
def updateFeatureDict(self, arg=None):
    """Bring featureDict back in sync with featureID after a change.

    :Parameters:
      - `arg` - describes what changed: another dataset whose features were
        appended, a list of eliminated feature indices, or the ID (int or
        string) of a single added feature; None forces a full rebuild
    """
    if arg.__class__ == self.__class__:
        # features were extended with those in another dataset
        other = arg
        self.featureID.extend(other.featureID)
    elif type(arg) == list:
        #features were eliminated:
        eliminated = misc.list2dict(arg)
        self.featureID = [
            self.featureID[i] for i in range(len(self.featureID))
            if i not in eliminated
        ]
    elif type(arg) == type(1) or type(arg) == type(''):
        # a feature was added -- an incremental update is enough,
        # so skip the full rebuild below:
        id = arg
        self.featureID.append(id)
        self.featureDict[id] = self.numFeatures - 1
        return
    # full rebuild of the feature ID --> position mapping:
    self.featureDict = {}
    for i in range(len(self.featureID)):
        self.featureDict[self.featureID[i]] = i
def score2(self, data, **args) : featuresPerForest = int(math.ceil(float(self.maxSize) / len(data))) - 10 numForests = int(math.ceil(float(data.numFeatures) / featuresPerForest)) perm = range(data.numFeatures) random.shuffle(perm) scores = numpy.zeros(data.numFeatures, numpy.Float) pvals = numpy.zeros(data.numFeatures, numpy.Float) featureIDdict = misc.list2dict(data.featureID, range(data.numFeatures)) print 'numForests', numForests, featuresPerForest, data.numFeatures for i in range(numForests) : print 'forest number', i + 1 if i < numForests - 1 : features = perm[featuresPerForest * i : featuresPerForest * (i + 1)] else : features = perm[featuresPerForest * i : ] subdata = data.__class__(data, 'deepcopy') subdata.keepFeatures(features) subscores = self.score(subdata, **args) for j in range(subdata.numFeatures) : scores[featureIDdict[subdata.featureID[j]]] = subscores[j] pvals[featureIDdict[subdata.featureID[j]]] = self.pvals[j] # re-rank all the best features together: bestFeatures = numpy.argsort(scores)[:featuresPerForest] print featuresPerForest print 'length of best Features', len(bestFeatures) subdata = data.__class__(data, 'deepcopy') subdata.keepFeatures(bestFeatures) subscores = self.score(subdata, **args) for j in range(subdata.numFeatures) : scores[featureIDdict[subdata.featureID[j]]] = subscores[j] pvals[featureIDdict[subdata.featureID[j]]] = self.pvals[j] self.pvals = pvals return scores
def commonKernel(kernelFile1, kernelFile2, kernelOutFileName1, kernelOutFileName2) : delim = ' ' from datafunc import KernelData import misc kdata1 = KernelData(kernelFile1) kdata2 = KernelData(kernelFile2) print 'loaded data' ids = misc.intersect(kdata1.labels.patternID, kdata2.labels.patternID) ids.sort() idDict1 = misc.list2dict(ids) if len(ids) != len(kdata1) : kernelOutFile1 = open(kernelOutFileName1, 'w') idDict = {} for i in range(len(kdata1)) : if kdata1.labels.patternID[i] in idDict1 : idDict[kdata1.labels.patternID[i]] = i for id1 in ids : print id1 kernelOutFile1.write(id1 + delim) tokens = [str(kdata1.kernel.eval(kdata1, idDict[id1], idDict[id2])) for id2 in ids] kernelOutFile1.write(delim.join(tokens) + '\n') if len(ids) != len(kdata2) : kernelOutFile2 = open(kernelOutFileName2, 'w') idDict = {} for i in range(len(kdata2)) : if kdata2.labels.patternID[i] in idDict1 : idDict[kdata2.labels.patternID[i]] = i for id1 in ids : print id1 kernelOutFile2.write(id1 + delim) tokens = [str(kdata2.kernel.eval(kdata2, idDict[id1], idDict[id2])) for id2 in ids] kernelOutFile2.write(delim.join(tokens) + '\n')
def updateFeatureDict(self, arg=None):
    """Re-synchronize featureDict and featureKeyDict with featureID.

    :Parameters:
      - `arg` - describes what changed: another dataset whose features were
        appended, a list of eliminated feature indices, or the ID (int or
        string) of a single added feature; None forces a plain rebuild
    """
    if arg.__class__ == self.__class__:
        # features were extended with those in another dataset;
        # keep featureID ordered by feature hash value:
        other = arg
        self.featureID.extend(other.featureID)
        self.featureID.sort(cmp=lambda x, y: cmp(hash(x), hash(y)))
    elif type(arg) == list:
        #features were eliminated:
        eliminated = misc.list2dict(arg)
        self.featureID = [
            self.featureID[i] for i in range(len(self.featureID))
            if i not in eliminated
        ]
    elif type(arg) == type(1) or type(arg) == type(''):
        # a feature was added:
        id = arg
        self.featureID.append(id)
        self.featureID.sort(cmp=lambda x, y: cmp(hash(x), hash(y)))
    # rebuild both mappings: feature ID --> position and hash --> position
    self.featureDict = {}
    self.featureKeyDict = {}
    for i in range(len(self.featureID)):
        self.featureDict[self.featureID[i]] = i
        self.featureKeyDict[hash(self.featureID[i])] = i
def stratifiedCV(classifier, data, numFolds = 5, **args) :
    """perform k-fold stratified cross-validation; in each fold the number of
    patterns from each class is proportional to the relative fraction of the
    class in the dataset

    :Parameters:
      - `classifier` - a classifier template
      - `data` - a dataset
      - `numFolds` - number of cross validation folds (default = 5)

    :Returns:
      a Results object.

    :Keywords:
      - `numFolds` - number of cross-validation folds -- overrides the
        numFolds parameter
      - `seed` - random number generator seed
      - `trainingAllFolds` - a list of patterns that are to be used as
        training examples in all CV folds.
      - `intermediateFile` - a file name to save intermediate results under
        if this argument is not given, not intermediate results are saved
      - `foldsToPerform` - number of folds to actually perform (in case
        you're doing n fold CV, and want to save time, and only do some of
        the folds)
    """
    if 'numFolds' in args :
        numFolds = args['numFolds']
    if 'seed' in args :
        random.seed(args['seed'])
    if 'trainingAllFolds' in args :
        trainingAllFolds = args['trainingAllFolds']
    else :
        trainingAllFolds = []
    foldsToPerform = numFolds
    if 'foldsToPerform' in args :
        foldsToPerform = args['foldsToPerform']
    if foldsToPerform > numFolds :
        raise ValueError, 'foldsToPerform > numFolds'
    # fast membership test for the always-training patterns:
    trainingAllFoldsDict = misc.list2dict(trainingAllFolds)
    labels = data.labels
    # p[k] collects the indices of class-k patterns that take part in the
    # fold rotation (always-training patterns are excluded):
    p = [[] for i in range(labels.numClasses)]
    # nominal per-class fold size (integer division; the last fold absorbs
    # the remainder below):
    classFoldSize = [int(labels.classSize[k] / numFolds)
                     for k in range(labels.numClasses)]
    for i in range(len(data)):
        if i not in trainingAllFoldsDict :
            p[labels.Y[i]].append(i)
    # shuffle within each class so the folds are random:
    for k in range(labels.numClasses):
        random.shuffle(p[k])
    trainingPatterns = [[] for i in range(foldsToPerform)]
    testingPatterns = [[] for i in range(foldsToPerform)]
    for fold in range(foldsToPerform) :
        for k in range(labels.numClasses) :
            # the slice [classFoldStart:classFoldEnd] of class k is this
            # fold's test set; everything else is training:
            classFoldStart = classFoldSize[k] * fold
            if fold < numFolds-1:
                classFoldEnd = classFoldSize[k] * (fold + 1)
            else:
                # the last fold takes the remainder left by the division:
                classFoldEnd = labels.classSize[k]
            testingPatterns[fold].extend(p[k][classFoldStart:classFoldEnd])
            if fold > 0:
                trainingPatterns[fold].extend(
                    p[k][0:classFoldStart] +
                    p[k][classFoldEnd:labels.classSize[k]])
            else:
                # fold 0 has an empty prefix, so only the suffix trains:
                trainingPatterns[fold].extend(
                    p[k][classFoldEnd:labels.classSize[k]])
    # the always-training patterns are added to every fold:
    if len(trainingPatterns) > 0 :
        for fold in range(len(trainingPatterns)) :
            trainingPatterns[fold].extend(trainingAllFolds)
    return cvFromFolds(classifier, data, trainingPatterns, testingPatterns,
                       **args)
def save(self, fileName, **args): """save a dataset to a file (does not use pickle!) :Parameters: - `fileName` - a file name or a file handle :Keywords: - `format` - 'csv' or 'sparse'; by default format is chosen by the type of the dataset -- sparse containers save in sparse format and non-sparse containers in csv format. - `delimiter` - which delimiter to use when saving in csv format - `patterns` - save only those patterns whose indices are given - `ids` - save only those patterns whose pattern ID are given - `sortByID` - whether to sort the lines according to the pattern ID (default = False) - `sortByLabel` - whether to sort the lines according to the class label (default = False) """ print 'saving to ', fileName if type(fileName) == type(''): fileHandle = open(fileName, 'w') else: fileHandle = fileName L = self.labels.L if self.__class__.__name__.lower().find('sparse') >= 0: format = 'sparse' else: format = 'csv' print 'detected file format as:', format if 'format' in args: format = args['format'] if 'delimiter' in args: delim = args['delimiter'] else: delim = ',' if 'patterns' in args: patterns = args['patterns'] else: patterns = range(len(self)) if 'ids' in args: idDict = misc.list2dict(args['ids']) patterns = [ i for i in range(len(self)) if self.labels.patternID[i] in idDict ] if 'sortByID' in args and args['sortByID']: ids = self.labels.patternID[:] ids.sort() idMap = misc.list2dict(self.labels.patternID, range(len(self))) idDict = misc.list2dict(patterns) patterns = [idMap[id] for id in ids if idMap[id] in idDict] if 'sortByLabel' in args and args['sortByLabel']: y = self.labels.Y[:] patterns = numpy.argsort(self.labels.Y) if format == 'csv': if L is None: labels = '' else: labels = 'labels' + delim fileHandle.write('#' + 'patternID' + delim + labels + delim.join(self.featureID) + '\n') for i in patterns: x = self.getPattern(i) if format == 'sparse': if self.labels.patternID is not None: fileHandle.write(str(self.labels.patternID[i]) + ',') if L is not 
None: if type(L[i]) == type([]): fileHandle.write(';'.join(L[i]) + ' ') else: fileHandle.write(str(L[i]) + ' ') if type(x) == type({}): tokens = [ self.featureID[self.featureKeyDict[key]] + ':' + str(x[key]) for key in x ] else: tokens = [ self.featureID[i] + ':' + str(x[i]) for i in range(self.numFeatures) if x[i] != 0 ] fileHandle.write(' '.join(tokens) + '\n') else: if self.labels.patternID is not None: fileHandle.write(str(self.labels.patternID[i]) + delim) if L is not None: if type(L[i]) == type([]): fileHandle.write(';'.join(L[i]) + delim) else: fileHandle.write(L[i] + delim) if type(x) == type({}): tokens = [ str(x.get(self.featureKey[i], 0)) for i in range(self.numFeatures) ] else: tokens = [str(val) for val in x] fileHandle.write(delim.join(tokens) + '\n') fileHandle.close()
def stratifiedCV(classifier, data, numFolds=5, **args):
    """perform k-fold stratified cross-validation; in each fold the number of
    patterns from each class is proportional to the relative fraction of the
    class in the dataset

    :Parameters:
      - `classifier` - a classifier template
      - `data` - a dataset
      - `numFolds` - number of cross validation folds (default = 5)

    :Returns:
      a Results object.

    :Keywords:
      - `numFolds` - number of cross-validation folds -- overrides the
        numFolds parameter
      - `seed` - random number generator seed
      - `trainingAllFolds` - a list of patterns that are to be used as
        training examples in all CV folds.
      - `intermediateFile` - a file name to save intermediate results under
        if this argument is not given, not intermediate results are saved
      - `foldsToPerform` - number of folds to actually perform (in case
        you're doing n fold CV, and want to save time, and only do some of
        the folds)
    """
    if 'numFolds' in args:
        numFolds = args['numFolds']
    if 'seed' in args:
        random.seed(args['seed'])
    if 'trainingAllFolds' in args:
        trainingAllFolds = args['trainingAllFolds']
    else:
        trainingAllFolds = []
    foldsToPerform = numFolds
    if 'foldsToPerform' in args:
        foldsToPerform = args['foldsToPerform']
    if foldsToPerform > numFolds:
        raise ValueError, 'foldsToPerform > numFolds'
    # fast membership test for the always-training patterns:
    trainingAllFoldsDict = misc.list2dict(trainingAllFolds)
    labels = data.labels
    # p[k] collects the indices of class-k patterns that take part in the
    # fold rotation (always-training patterns are excluded):
    p = [[] for i in range(labels.numClasses)]
    # nominal per-class fold size (integer division; the last fold absorbs
    # the remainder below):
    classFoldSize = [
        int(labels.classSize[k] / numFolds)
        for k in range(labels.numClasses)
    ]
    for i in range(len(data)):
        if i not in trainingAllFoldsDict:
            p[labels.Y[i]].append(i)
    # shuffle within each class so the folds are random:
    for k in range(labels.numClasses):
        random.shuffle(p[k])
    trainingPatterns = [[] for i in range(foldsToPerform)]
    testingPatterns = [[] for i in range(foldsToPerform)]
    for fold in range(foldsToPerform):
        for k in range(labels.numClasses):
            # the slice [classFoldStart:classFoldEnd] of class k is this
            # fold's test set; everything else is training:
            classFoldStart = classFoldSize[k] * fold
            if fold < numFolds - 1:
                classFoldEnd = classFoldSize[k] * (fold + 1)
            else:
                # the last fold takes the remainder left by the division:
                classFoldEnd = labels.classSize[k]
            testingPatterns[fold].extend(p[k][classFoldStart:classFoldEnd])
            if fold > 0:
                trainingPatterns[fold].extend(
                    p[k][0:classFoldStart] +
                    p[k][classFoldEnd:labels.classSize[k]])
            else:
                # fold 0 has an empty prefix, so only the suffix trains:
                trainingPatterns[fold].extend(
                    p[k][classFoldEnd:labels.classSize[k]])
    # the always-training patterns are added to every fold:
    if len(trainingPatterns) > 0:
        for fold in range(len(trainingPatterns)):
            trainingPatterns[fold].extend(trainingAllFolds)
    return cvFromFolds(classifier, data, trainingPatterns, testingPatterns,
                       **args)
def save(self, fileName, **args) :
    """save a dataset to a file (does not use pickle!)

    :Parameters:
      - `fileName` - a file name or a file handle
    :Keywords:
      - `format` - 'csv' or 'sparse'; by default format is chosen by the
        type of the dataset -- sparse containers save in sparse format and
        non-sparse containers in csv format.
      - `delimiter` - which delimiter to use when saving in csv format
      - `patterns` - save only those patterns whose indices are given
      - `ids` - save only those patterns whose pattern ID are given
      - `sortByID` - whether to sort the lines according to the pattern ID
        (default = False)
      - `sortByLabel` - whether to sort the lines according to the class
        label (default = False)
    """
    print 'saving to ', fileName
    if type(fileName) == type('') :
        fileHandle = open(fileName, 'w')
    else :
        # NOTE(review): when a handle is passed in, the close() at the end
        # still closes it behind the caller's back -- confirm intended.
        fileHandle = fileName
    L = self.labels.L
    # sparse containers default to the sparse format:
    if self.__class__.__name__.lower().find('sparse') >= 0 :
        format = 'sparse'
    else :
        format = 'csv'
    print 'detected file format as:', format
    if 'format' in args :
        format = args['format']
    if 'delimiter' in args :
        delim = args['delimiter']
    else :
        delim = ','
    if 'patterns' in args :
        patterns = args['patterns']
    else :
        patterns = range(len(self))
    if 'ids' in args :
        idDict = misc.list2dict(args['ids'])
        patterns = [i for i in range(len(self))
                    if self.labels.patternID[i] in idDict]
    if 'sortByID' in args and args['sortByID'] :
        ids = self.labels.patternID[:]
        ids.sort()
        idMap = misc.list2dict(self.labels.patternID, range(len(self)))
        # keep only the indices that were previously selected:
        idDict = misc.list2dict(patterns)
        patterns = [idMap[id] for id in ids if idMap[id] in idDict]
    if 'sortByLabel' in args and args['sortByLabel'] :
        y = self.labels.Y[:]
        patterns = numpy.argsort(self.labels.Y)
    if format == 'csv' :
        if L is None :
            labels = ''
        else :
            labels = 'labels' + delim
        # header row with the feature ids:
        fileHandle.write('#' + 'patternID' + delim + labels +
                         delim.join(self.featureID) + '\n')
    for i in patterns :
        x = self.getPattern(i)
        if format == 'sparse' :
            if self.labels.patternID is not None :
                fileHandle.write(str(self.labels.patternID[i]) + ',')
            if L is not None :
                # multi-label patterns are ';'-joined:
                if type(L[i]) == type([]) :
                    fileHandle.write(';'.join(L[i]) + ' ')
                else :
                    fileHandle.write(str(L[i]) + ' ')
            if type(x) == type({}) :
                tokens = [self.featureID[self.featureKeyDict[key]]+':'+
                          str(x[key]) for key in x]
            else :
                tokens = [self.featureID[i] + ':' + str(x[i])
                          for i in range(self.numFeatures) if x[i] != 0]
            fileHandle.write(' '.join(tokens) + '\n')
        else :
            if self.labels.patternID is not None :
                fileHandle.write(str(self.labels.patternID[i]) + delim)
            if L is not None :
                if type(L[i]) == type([]) :
                    fileHandle.write(';'.join(L[i]) + delim)
                else :
                    # NOTE(review): the sparse branch wraps L[i] in str();
                    # this branch does not -- non-string labels would raise.
                    fileHandle.write(L[i] + delim)
            if type(x) == type({}) :
                tokens = [str(x.get(self.featureKey[i],0))
                          for i in range(self.numFeatures)]
            else :
                tokens = [str(val) for val in x]
            fileHandle.write(delim.join(tokens) + '\n')
    fileHandle.close()