def testSaveParams(self):
    """
    Save the parameters of a configured LibSVM to disk, load them into a
    fresh instance and check that both the original and the reloaded
    learner report the configured values.
    """
    try:
        import sklearn
    except ImportError:
        # sklearn is required by the learner; nothing to test without it
        return

    svm = LibSVM()
    svm.setC(10.5)
    svm.setEpsilon(12.1)
    svm.setErrorCost(1.8)
    svm.setSvmType("Epsilon_SVR")
    svm.setTermination(0.12)
    svm.setKernel("gaussian", 0.43)

    outputDir = PathDefaults.getOutputDir()
    fileName = outputDir + "test/testSvmParams"
    svm.saveParams(fileName)

    svm2 = LibSVM()
    svm2.loadParams(fileName)

    # The original test only inspected ``svm``, so the save/load round
    # trip was never verified. Check the reloaded learner as well.
    for learner in (svm, svm2):
        self.assertEqual(learner.getC(), 10.5)
        self.assertEqual(learner.getEpsilon(), 12.1)
        self.assertEqual(learner.getErrorCost(), 1.8)
        self.assertEqual(learner.getSvmType(), "Epsilon_SVR")
        self.assertEqual(learner.getTermination(), 0.12)
        self.assertEqual(learner.getKernel(), "gaussian")
        self.assertEqual(learner.getKernelParams(), 0.43)
def testSetErrorCost(self):
    """
    Train a linear SVM on generated binary examples with two different
    error costs and check that the value returned by
    Evaluator.binaryErrorP is larger for cost 0.1 than for cost 0.9.
    """
    try:
        import sklearn
    except ImportError:
        return

    numExamples = 1000
    numFeatures = 100
    generator = ExamplesGenerator()
    X, y = generator.generateBinaryExamples(numExamples, numFeatures)

    svm = LibSVM()
    svm.setKernel("linear", 0)
    svm.setC(0.1)

    # Fit once per error cost and record the resulting error
    errors = []
    for errorCost in (0.1, 0.9):
        svm.setErrorCost(errorCost)
        svm.learnModel(X, y)
        errors.append(Evaluator.binaryErrorP(y, svm.classify(X)))

    lowCostError, highCostError = errors
    self.assertTrue(lowCostError > highCostError)
def testSetSvmType(self):
    """
    Switch the learner between "Epsilon_SVR" and "C_SVC". For regression,
    search a grid of C and epsilon values and check that the best RMSE on
    the training data beats an all-zeros prediction; for classification,
    check that the binary error lies in [0, 1].
    """
    try:
        import sklearn
    except ImportError:
        # sklearn is required by the learner; nothing to test without it
        return

    numExamples = 100
    numFeatures = 10
    X = numpy.random.randn(numExamples, numFeatures)
    X = Standardiser().standardiseArray(X)
    c = numpy.random.randn(numFeatures)

    # Real-valued targets for regression, and their signs as +/-1 labels
    y = numpy.dot(X, numpy.array([c]).T).ravel() + 1
    y2 = numpy.array(y > 0, numpy.int32) * 2 - 1

    svm = LibSVM()
    svm.setSvmType("Epsilon_SVR")
    self.assertEqual(svm.getType(), "Epsilon_SVR")

    # Try to get a good error.
    # numpy.float was removed in NumPy 1.24; the builtin float is the
    # type it used to alias.
    Cs = 2**numpy.arange(-6, 4, dtype=float)
    epsilons = 2**numpy.arange(-6, 4, dtype=float)

    bestError = 10
    for C in Cs:
        for epsilon in epsilons:
            svm.setEpsilon(epsilon)
            svm.setC(C)
            svm.learnModel(X, y)
            yp = svm.predict(X)

            error = Evaluator.rootMeanSqError(y, yp)
            if error < bestError:
                bestError = error

    baselineError = Evaluator.rootMeanSqError(y, numpy.zeros(y.shape[0]))
    self.assertTrue(bestError < baselineError)

    svm.setSvmType("C_SVC")
    svm.learnModel(X, y2)
    yp2 = svm.predict(X)
    self.assertTrue(0 <= Evaluator.binaryError(y2, yp2) <= 1)
def testSetSvmType(self):
    """
    Exercise both SVM types: fit "Epsilon_SVR" over a grid of C and
    epsilon values and require the best training RMSE to beat a constant
    zero predictor, then fit "C_SVC" and require a binary error in [0, 1].
    """
    try:
        import sklearn
    except ImportError:
        # sklearn is required by the learner; nothing to test without it
        return

    numExamples = 100
    numFeatures = 10
    X = numpy.random.randn(numExamples, numFeatures)
    X = Standardiser().standardiseArray(X)
    c = numpy.random.randn(numFeatures)

    # Real-valued targets for regression, and their signs as +/-1 labels
    y = numpy.dot(X, numpy.array([c]).T).ravel() + 1
    y2 = numpy.array(y > 0, numpy.int32) * 2 - 1

    svm = LibSVM()
    svm.setSvmType("Epsilon_SVR")
    self.assertEqual(svm.getType(), "Epsilon_SVR")

    # Try to get a good error.
    # The numpy.float alias no longer exists in current NumPy releases,
    # so the builtin float is used as the dtype.
    Cs = 2**numpy.arange(-6, 4, dtype=float)
    epsilons = 2**numpy.arange(-6, 4, dtype=float)

    bestError = 10
    for C in Cs:
        for epsilon in epsilons:
            svm.setEpsilon(epsilon)
            svm.setC(C)
            svm.learnModel(X, y)
            yp = svm.predict(X)

            # Evaluate the error once per grid point
            currentError = Evaluator.rootMeanSqError(y, yp)
            if currentError < bestError:
                bestError = currentError

    zeroPredictionError = Evaluator.rootMeanSqError(y, numpy.zeros(y.shape[0]))
    self.assertTrue(bestError < zeroPredictionError)

    svm.setSvmType("C_SVC")
    svm.learnModel(X, y2)
    yp2 = svm.predict(X)
    self.assertTrue(0 <= Evaluator.binaryError(y2, yp2) <= 1)
def testSetEpsilon(self):
    """
    Fit the regression SVM with decreasing values of epsilon and check
    that the shape of the support vector index array grows each time.
    """
    try:
        import sklearn
    except ImportError:
        return

    svm = LibSVM()
    svm.setC(10.0)
    svm.setEpsilon(0.1)
    svm.setSvmType("Epsilon_SVR")

    numExamples = 100
    numFeatures = 10

    X = numpy.random.randn(numExamples, numFeatures)
    c = numpy.random.randn(numFeatures)
    noise = numpy.random.randn(100)
    y = numpy.dot(X, numpy.array([c]).T).ravel() + noise

    # One fit per epsilon, from the largest value to the smallest
    supportShapes = []
    for epsilon in (1.0, 0.5, 0.01):
        svm.setEpsilon(epsilon)
        svm.learnModel(X, y)
        supportShapes.append(svm.getModel().support_.shape)

    numSV, numSV2, numSV3 = supportShapes

    #There should be fewer SVs as epsilon increases
    self.assertTrue(numSV < numSV2)
    self.assertTrue(numSV2 < numSV3)
cvGrid = learner.parallelSplitGrid(trainX, trainY, trainX, trainY, paramDict) meanCvGrid[methodInd, :] += cvGrid bestLearner = learner.getBestLearner(cvGrid, paramDict, trainX, trainY) predY = bestLearner.predict(testX) meanErrors[methodInd] += bestLearner.getMetricMethod()(testY, predY) #Compute norms tempMeanNorms = numpy.zeros(numParams) tempMeanSVs = numpy.zeros(numParams) for s, C in enumerate(Cs): for trainInds, testInds in idx: validX = trainX[trainInds, :] validY = trainY[trainInds] learner.setC(C) learner.learnModel(validX, validY) tempMeanNorms[s] += learner.weightNorm() tempMeanSVs[s] += learner.model.support_.shape[0] learner.learnModel(trainX, trainY) testMeanNorms[s] = learner.weightNorm() testMeanSVs[s] += learner.model.support_.shape[0] tempMeanNorms /= float(folds) meanNorms += tempMeanNorms tempMeanSVs /= float(folds) meanSVs+= tempMeanSVs numRealisations = float(numRealisations)
def testGetC(self):
    """
    Check that getC returns the value previously passed to setC.
    """
    svm = LibSVM()
    svm.setC(10.0)

    # assertEqual reports both values on failure, unlike assertTrue(a == b)
    self.assertEqual(svm.getC(), 10.0)
class SvmEgoSimulator(AbstractDiffusionSimulator):
    """
    Combines SVM classification with the ego simulation. The class offers
    methods for model selection, evaluating and training the SVM, and for
    running the simulation itself, which is delegated to EgoSimulator.
    """
    def __init__(self, examplesFileName):
        """
        Read the examples from the given file (a Matlab file according to
        the original documentation), create the SVM and a preprocessor, and
        overwrite the stored examples with their standardised version.
        """
        self.examplesList = ExamplesList.readFromFile(examplesFileName)
        self.examplesList.setDefaultExamplesName("X")
        self.examplesList.setLabelsName("y")

        labels = self.examplesList.getSampledDataField("y").ravel()
        (freqs, items) = Util.histogram(labels)
        logging.info("Distribution of labels: " + str((freqs, items)))
        baseErrorRate = float(min(freqs))/self.examplesList.getNumExamples()
        logging.info("The base error rate is " + str(baseErrorRate))

        self.classifier = LibSVM()
        self.errorMethod = Evaluator.balancedError

        self.preprocessor = Standardiser()
        rawX = self.examplesList.getDataField(self.examplesList.getDefaultExamplesName())
        X = self.preprocessor.standardiseArray(rawX)
        self.examplesList.overwriteDataField(self.examplesList.getDefaultExamplesName(), X)

    def getPreprocessor(self):
        """
        Returns the preprocessor
        """
        return self.preprocessor

    def sampleExamples(self, sampleSize):
        """
        Draw a random sub-sample of the examples. This exists so that the
        examples used in model selection can be sampled up front and then
        excluded when running evaluateClassifier.
        """
        self.examplesList.randomSubData(sampleSize)

    def modelSelection(self, Cs, kernel, kernelParams, errorCosts, folds, sampleSize):
        """
        Perform model selection using an SVM on a random sample of
        sampleSize examples. Returns the selected C, kernel parameter and
        error cost together with the corresponding error.
        """
        Parameter.checkInt(sampleSize, 0, self.examplesList.getNumExamples())
        Parameter.checkInt(folds, 0, sampleSize)
        Parameter.checkString(kernel, ["linear", "polynomial", "gaussian"])
        Parameter.checkList(Cs, Parameter.checkFloat, [0.0, float("inf")])
        Parameter.checkList(errorCosts, Parameter.checkFloat, [0.0, float("inf")])

        #Perform model selection
        self.examplesList.randomSubData(sampleSize)
        labels = self.examplesList.getSampledDataField("y").ravel()
        (freqs, items) = Util.histogram(labels)
        logging.info("Using " + str(sampleSize) + " examples for model selection")
        logging.info("Distribution of labels: " + str((freqs, items)))

        searchSpace = (
            ("List of Cs ", Cs),
            ("List of kernels ", kernel),
            ("List of kernelParams ", kernelParams),
            ("List of errorCosts ", errorCosts),
        )
        for prefix, value in searchSpace:
            logging.info(prefix + str(value))

        selection = self.classifier.cvModelSelection(
            self.examplesList, Cs, kernelParams, kernel, folds, errorCosts,
            self.errorMethod)
        CVal, kernelParamVal, errorCost, error = selection

        summary = "Model selection returned C = " + str(CVal)
        summary += " kernelParam = " + str(kernelParamVal)
        summary += " errorCost = " + str(errorCost)
        summary += " with error " + str(error)
        logging.info(summary)

        return CVal, kernelParamVal, errorCost, error

    def evaluateClassifier(self, CVal, kernel, kernelParamVal, errorCost, folds, sampleSize, invert=True):
        """
        Evaluate the SVM with the given parameters using cross validation.
        Model selection is often done before this step; in that case
        invert=True draws the evaluation sample from the examples that are
        not in the current permutation indices, i.e. excluding those used
        for model selection. Returns the pair (means, vars) produced by
        the classifier's evaluateCv method.
        """
        infinity = float('inf')
        Parameter.checkFloat(CVal, 0.0, infinity)
        Parameter.checkFloat(errorCost, 0.0, infinity)
        Parameter.checkString(kernel, ["linear", "polynomial", "gaussian"])

        # The linear kernel has no parameter to validate
        if kernel == "gaussian":
            Parameter.checkFloat(kernelParamVal, 0.0, infinity)
        elif kernel == "polynomial":
            Parameter.checkInt(kernelParamVal, 2, infinity)

        Parameter.checkInt(folds, 0, sampleSize)
        Parameter.checkInt(sampleSize, 0, self.examplesList.getNumExamples())

        if not invert:
            testIndices = Util.sampleWithoutReplacement(sampleSize, self.examplesList.getNumExamples())
        else:
            allIndices = numpy.array(list(range(0, self.examplesList.getNumExamples())))
            unusedIndices = numpy.setdiff1d(allIndices, self.examplesList.getPermutationIndices())
            testIndices = numpy.random.permutation(unusedIndices)[0:sampleSize]

        logging.info("Using " + str(testIndices.shape[0]) + " examples for SVM evaluation")

        self.examplesList.setPermutationIndices(testIndices)
        self.classifier.setParams(C=CVal, kernel=kernel, kernelParam=kernelParamVal)
        self.classifier.setErrorCost(errorCost)

        means, variances = self.classifier.evaluateCv(self.examplesList, folds)

        logging.info("--- Classification evaluation ---")
        logging.info("Error on " + str(testIndices.shape[0]) + " examples is " + str(means[0]) + "(" + str(variances[0]) + ")")

        # The remaining statistics sit at positions 1 to 5 of the results
        metricPrefixes = [
            "Sensitivity (recall = TP/(TP+FN)): ",
            "Specificity (TN/TN+FP): ",
            "Error on positives: ",
            "Error on negatives: ",
            "Balanced error: ",
        ]
        for position, prefix in enumerate(metricPrefixes, 1):
            logging.info(prefix + str(means[position]) + "(" + str(variances[position]) + ")")

        return (means, variances)

    def trainClassifier(self, CVal, kernel, kernelParamVal, errorCost, sampleSize):
        """
        Train the SVM with the given parameters on a random sample of
        sampleSize examples and return the trained classifier.
        """
        infinity = float('inf')
        Parameter.checkFloat(CVal, 0.0, infinity)
        Parameter.checkString(kernel, ["linear", "gaussian", "polynomial"])
        Parameter.checkFloat(kernelParamVal, 0.0, infinity)
        Parameter.checkFloat(errorCost, 0.0, infinity)
        Parameter.checkInt(sampleSize, 0, self.examplesList.getNumExamples())

        message = "Training SVM with C=" + str(CVal)
        message += ", " + kernel + " kernel" + ", param=" + str(kernelParamVal)
        message += ", sampleSize=" + str(sampleSize) + ", errorCost=" + str(errorCost)
        logging.info(message)

        self.examplesList.randomSubData(sampleSize)
        self.classifier.setC(C=CVal)
        self.classifier.setKernel(kernel=kernel, kernelParam=kernelParamVal)
        self.classifier.setErrorCost(errorCost)

        X = self.examplesList.getSampledDataField(self.examplesList.getDefaultExamplesName())
        y = self.examplesList.getSampledDataField(self.examplesList.getLabelsName()).ravel()
        self.classifier.learnModel(X, y)

        return self.classifier

    def getWeights(self):
        """
        Returns the weights reported by the classifier
        """
        return self.classifier.getWeights()

    def runSimulation(self, maxIterations):
        """
        Advance the graph maxIterations times with an EgoSimulator, logging
        the total information and the distribution of alter ages and
        genders at each step. Returns the array of total information (the
        initial value followed by one value per iteration) and the
        transmissions reported by the simulator.
        """
        Parameter.checkInt(maxIterations, 1, float('inf'))

        #Notice that the data is preprocessed in the same way as the survey data
        egoSimulator = EgoSimulator(self.graph, self.classifier, self.preprocessor)

        totalInfo = numpy.zeros(maxIterations+1)
        totalInfo[0] = EgoUtils.getTotalInformation(self.graph)
        logging.info("Total number of people with information: " + str(totalInfo[0]))

        logging.info("--- Simulation Started ---")

        for iteration in range(maxIterations):
            logging.info("--- Iteration " + str(iteration) + " ---")

            self.graph = egoSimulator.advanceGraph()
            totalInfo[iteration+1] = EgoUtils.getTotalInformation(self.graph)
            logging.info("Total number of people with information: " + str(totalInfo[iteration+1]))

            #Compute distribution of ages etc. in alters
            alterIndices = egoSimulator.getAlters(iteration)
            numAlters = len(alterIndices)
            alterAges = numpy.zeros(numAlters)
            alterGenders = numpy.zeros(numAlters)

            for position, alterIndex in enumerate(alterIndices):
                vertex = self.graph.getVertex(alterIndex)
                alterAges[position] = vertex[self.egoQuestionIds.index(("Q5X", 0))]
                alterGenders[position] = vertex[self.egoQuestionIds.index(("Q4", 0))]

            (freqs, items) = Util.histogram(alterAges)
            logging.info("Distribution of ages " + str(freqs) + " " + str(items))
            (freqs, items) = Util.histogram(alterGenders)
            logging.info("Distribution of genders " + str(freqs) + " " + str(items))

        logging.info("--- Simulation Finished ---")

        return totalInfo, egoSimulator.getTransmissions()

    def getVertexFeatureDistribution(self, fIndex, vIndices=None):
        """
        Delegates to the graph's getVertexFeatureDistribution method
        """
        return self.graph.getVertexFeatureDistribution(fIndex, vIndices)

    def getPreProcessor(self):
        """
        Returns the preprocessor (same result as getPreprocessor)
        """
        return self.preprocessor

    def getClassifier(self):
        """
        Returns the classifier
        """
        return self.classifier

    # Class-level defaults; instances assign their own values where needed
    preprocessor = None
    examplesList = None
    classifier = None
    graph = None
    edgeWeight = 1