def scatter(r1, r2, statistic="roc", x1Label="", x2Label="", fileName=None, **args): """ a scatter plot for comparing the performance of two classifiers :Parameters: - `r1, r2` - both are either a list of Result classes, or a list of success rates / ROC scores - `statistic` - which measure of classifier success to plot values : 'roc', 'successRate', 'balancedSuccessRate' in order to specify parts of the roc curve you can use something like: 'roc50' or 'roc0.1' :Keywords: - `title` - the title of the plot """ if len(r1) != len(r2): print "unequal lengths for r1 and r2" if type(r1) != type({}): raise ValueError, "Cannot handle unequal length when it is not a dict" keys1 = r1.keys() keys2 = r2.keys() common = misc.intersect(keys1, keys2) r1new = {} r2new = {} for key in common: r1new[key] = r1[key] r2new[key] = r2[key] r1 = r1new r2 = r2new if type(r1) == type({}) and type(r2) == type({}): I = r1.keys() else: I = range(len(r1)) if r1[I[0]].__class__.__name__ == "Results" or r1[I[0]].__class__.__name__ == "Container": p1 = misc.extractAttribute(r1, statistic) p2 = misc.extractAttribute(r2, statistic) else: p1 = r1 p2 = r2 if type(p1) == type({}): p1 = p1.values() p2 = p2.values() from matplotlib import pylab x = numpy.arange(0, 1, 0.01) pylab.plot(p1, p2, "bo", x, x, "-k") pylab.xlabel(x1Label, fontsize=18) pylab.ylabel(x2Label, fontsize=18) if "title" in args: pylab.title(args["title"], fontsize=18) pylab.show() if fileName is not None: pylab.savefig(fileName) pylab.close()
def verifyData(self, data):
    """
    Verify that for a VectorDataSet the test examples refer to the same
    features that were used in training.
    """
    if data.__class__.__name__ != 'VectorDataSet':
        return
    if len(misc.intersect(self.featureID, data.featureID)) != len(self.featureID):
        raise ValueError, 'missing features in test data'
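# Usage sketch (hypothetical names, not part of this module): verifyData is
# meant to be called on a trained classifier before testing, e.g.
#
#   classifier.train(trainData)
#   classifier.verifyData(testData)   # raises ValueError if features are missing
#   results = classifier.test(testData)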
def commonKernel(kernelFile1, kernelFile2, kernelOutFileName1, kernelOutFileName2) :
    """
    restrict two kernel matrices to the pattern IDs they have in common,
    writing the restricted matrices to the given output files.
    A matrix is rewritten only if it contains patterns outside the common set.
    """
    delim = ' '
    from datafunc import KernelData
    import misc
    kdata1 = KernelData(kernelFile1)
    kdata2 = KernelData(kernelFile2)
    print 'loaded data'
    # the pattern IDs shared by the two datasets, in sorted order:
    ids = misc.intersect(kdata1.labels.patternID, kdata2.labels.patternID)
    ids.sort()
    idDict1 = misc.list2dict(ids)
    if len(ids) != len(kdata1) :
        kernelOutFile1 = open(kernelOutFileName1, 'w')
        # map each common pattern ID to its row index in kdata1:
        idDict = {}
        for i in range(len(kdata1)) :
            if kdata1.labels.patternID[i] in idDict1 :
                idDict[kdata1.labels.patternID[i]] = i
        for id1 in ids :
            print id1
            kernelOutFile1.write(id1 + delim)
            tokens = [str(kdata1.kernel.eval(kdata1, idDict[id1], idDict[id2]))
                      for id2 in ids]
            kernelOutFile1.write(delim.join(tokens) + '\n')
        kernelOutFile1.close()
    if len(ids) != len(kdata2) :
        kernelOutFile2 = open(kernelOutFileName2, 'w')
        # map each common pattern ID to its row index in kdata2:
        idDict = {}
        for i in range(len(kdata2)) :
            if kdata2.labels.patternID[i] in idDict1 :
                idDict[kdata2.labels.patternID[i]] = i
        for id1 in ids :
            print id1
            kernelOutFile2.write(id1 + delim)
            tokens = [str(kdata2.kernel.eval(kdata2, idDict[id1], idDict[id2]))
                      for id2 in ids]
            kernelOutFile2.write(delim.join(tokens) + '\n')
        kernelOutFile2.close()
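# A minimal usage sketch for commonKernel; the file names below are
# hypothetical placeholders:
def _commonKernelExample() :
    """restrict two precomputed kernel matrices to their shared pattern IDs,
    so that downstream code can compare or combine them row-for-row."""
    commonKernel('kernel1.txt', 'kernel2.txt',
                 'kernel1.common.txt', 'kernel2.common.txt')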
def significance(r1, r2, statistic = 'roc') :
    """
    report the statistical significance of the difference in error rates
    of a series of classification results of two classifiers, using the
    Wilcoxon signed rank test.

    Returns: pvalue, (median1, median2)
    where:
    pvalue - the p-value of the two-sided Wilcoxon signed rank test;
    to get the p-value of a one-sided test, divide the p-value by two.
    (median1, median2) - the medians of the statistics of the inputs r1 and r2.

    :Parameters:
      - `r1, r2` - both are either a list of Results classes, or a list of
        success rates
      - `statistic` - which measure of classifier success to compare
        values : 'roc', 'successRate', 'balancedSuccessRate'
        in order to specify parts of the ROC curve you can use something
        like: 'roc50' or 'roc0.1'
    """
    if type(r1) != type(r2) :
        raise ValueError, 'r1 and r2 do not have the same type'
    # if the two objects are dictionaries, then we can handle the case that
    # the lengths are not equal by restricting to the common keys:
    if len(r1) != len(r2) :
        print 'unequal lengths for r1 and r2'
        if type(r1) != type({}) :
            raise ValueError, 'cannot handle unequal lengths when the inputs are not dicts'
        keys1 = r1.keys()
        keys2 = r2.keys()
        common = misc.intersect(keys1, keys2)
        r1new = {}
        r2new = {}
        for key in common :
            r1new[key] = r1[key]
            r2new[key] = r2[key]
        r1 = r1new
        r2 = r2new
    if type(r1) == type({}) :
        if r1.keys() != r2.keys() :
            raise ValueError, 'r1 and r2 do not have the same keys'
        I = r1.keys()
    else :
        I = range(len(r1))
    if (r1[I[0]].__class__.__name__ == 'Results' or
        r1[I[0]].__class__.__name__ == 'Container') :
        p1 = misc.extractAttribute(r1, statistic)
        p2 = misc.extractAttribute(r2, statistic)
    else :
        p1 = r1
        p2 = r2
    if type(p1) == type({}) :
        p1 = p1.values()
        p2 = p2.values()
    import salstat_stats
    test = salstat_stats.TwoSampleTests(p1, p2)
    test.SignedRanks(p1, p2)
    p = test.prob
    median1 = numpy.median(numpy.array(p1))
    median2 = numpy.median(numpy.array(p2))
    return p, (median1, median2)
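# A minimal usage sketch for significance; the scores below are made up:
def _significanceExample() :
    """compare per-dataset ROC scores of two classifiers keyed by dataset
    name, and report the Wilcoxon signed rank test p-value."""
    r1 = {'yeast' : 0.91, 'fly' : 0.84, 'worm' : 0.88, 'human' : 0.93}
    r2 = {'yeast' : 0.87, 'fly' : 0.82, 'worm' : 0.85, 'human' : 0.90}
    pvalue, (median1, median2) = significance(r1, r2)
    print 'p-value: %g (medians: %g vs. %g)' % (pvalue, median1, median2)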
def scatter(r1, r2, statistic = 'roc', x1Label = '', x2Label = '', fileName = None, **args) :
    """
    a scatter plot for comparing the performance of two classifiers

    :Parameters:
      - `r1, r2` - both are either a list of Results classes, or a list of
        success rates / ROC scores
      - `statistic` - which measure of classifier success to plot
        values : 'roc', 'successRate', 'balancedSuccessRate'
        in order to specify parts of the ROC curve you can use something
        like: 'roc50' or 'roc0.1'

    :Keywords:
      - `title` - the title of the plot
    """
    # if the inputs are dictionaries, handle unequal lengths by restricting
    # to the common keys:
    if len(r1) != len(r2) :
        print 'unequal lengths for r1 and r2'
        if type(r1) != type({}) :
            raise ValueError, 'cannot handle unequal lengths when the inputs are not dicts'
        keys1 = r1.keys()
        keys2 = r2.keys()
        common = misc.intersect(keys1, keys2)
        r1new = {}
        r2new = {}
        for key in common :
            r1new[key] = r1[key]
            r2new[key] = r2[key]
        r1 = r1new
        r2 = r2new
    if type(r1) == type({}) and type(r2) == type({}) :
        I = r1.keys()
    else :
        I = range(len(r1))
    if (r1[I[0]].__class__.__name__ == 'Results' or
        r1[I[0]].__class__.__name__ == 'Container') :
        p1 = misc.extractAttribute(r1, statistic)
        p2 = misc.extractAttribute(r2, statistic)
    else :
        p1 = r1
        p2 = r2
    if type(p1) == type({}) :
        p1 = p1.values()
        p2 = p2.values()
    from matplotlib import pylab
    # plot the paired statistics along with the diagonal y = x for reference:
    x = numpy.arange(0, 1, 0.01)
    pylab.plot(p1, p2, 'bo', x, x, '-k')
    pylab.xlabel(x1Label, fontsize = 18)
    pylab.ylabel(x2Label, fontsize = 18)
    if 'title' in args :
        pylab.title(args['title'], fontsize = 18)
    # save before show: under non-interactive backends the figure may be
    # blank once the show window has been closed
    if fileName is not None :
        pylab.savefig(fileName)
    pylab.show()
    pylab.close()
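# A minimal usage sketch for scatter; the ROC scores below are made up:
def _scatterExample() :
    """plot the ROC scores of two classifiers against each other; points
    above the diagonal are datasets on which the second classifier wins."""
    roc1 = [0.81, 0.74, 0.92, 0.66, 0.88]
    roc2 = [0.85, 0.70, 0.95, 0.72, 0.90]
    scatter(roc1, roc2,
            x1Label = 'classifier 1 (ROC)', x2Label = 'classifier 2 (ROC)',
            title = 'classifier comparison', fileName = 'comparison.png')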