def testParallelPen(self):
    """Check that the penalty is infinite exactly for those gamma values
    that exceed the size of the unpruned tree.

    When gamma (the requested tree size) is larger than the tree that can
    actually be grown, no pruning solution exists and parallelPen is
    expected to report an infinite penalty for that parameter setting.
    """
    numExamples = 100
    X, y = data.make_regression(numExamples)
    learner = DecisionTreeLearner(pruneType="CART", maxDepth=10, minSplit=2)

    # Gamma grid: 2^1-1 ... 2^9.5-1, rounded to integers.
    # NOTE: numpy.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the documented replacement and is equivalent here.
    paramDict = {}
    paramDict["setGamma"] = numpy.array(numpy.round(2**numpy.arange(1, 10, 0.5)-1), dtype=int)

    folds = 3
    alpha = 1.0
    Cvs = numpy.array([(folds-1)*alpha])
    idx = Sampling.crossValidation(folds, X.shape[0])
    resultsList = learner.parallelPen(X, y, idx, paramDict, Cvs)
    learner, trainErrors, currentPenalties = resultsList[0]

    # Use a gamma larger than any grid value so the tree is never pruned.
    learner.setGamma(2**10)
    treeSize = 0

    #Let's work out the size of the unpruned tree
    for trainInds, testInds in idx:
        trainX = X[trainInds, :]
        trainY = y[trainInds]
        learner.learnModel(trainX, trainY)
        treeSize += learner.tree.size

    # Average unpruned tree size across the CV folds.
    treeSize /= float(folds)

    # Penalties must be infinite where gamma > tree size, and at least one
    # finite penalty must exist where gamma < tree size.
    self.assertTrue(numpy.isinf(currentPenalties[paramDict["setGamma"]>treeSize]).all())
    self.assertTrue(not numpy.isinf(currentPenalties[paramDict["setGamma"]<treeSize]).all())
def testLearningRate(self):
    """Verify learningRate() against a direct recomputation of the rates.

    The learning rate beta for each gamma is the (negated) slope of
    log(penalty * folds) regressed on log((folds-1)/folds * n) over the
    folds grid, fitted only where the penalty is finite and positive.
    The grid returned by learner.learningRate must match this exactly.
    """
    numExamples = 100
    trainX, trainY = data.make_regression(numExamples)
    trainX = Standardiser().normaliseArray(trainX)
    trainY = Standardiser().normaliseArray(trainY)
    learner = DecisionTreeLearner(pruneType="CART", maxDepth=20, minSplit=1)

    foldsSet = numpy.arange(2, 7, 2)
    # NOTE: numpy.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the documented replacement and is equivalent here.
    gammas = numpy.array(numpy.round(2**numpy.arange(1, 8, 1)-1), dtype=int)
    paramDict = {}
    paramDict["setGamma"] = gammas

    betaGrid = learner.learningRate(trainX, trainY, foldsSet, paramDict)

    #Compute beta more directly
    numParams = gammas.shape[0]
    sampleSize = trainX.shape[0]
    sampleMethod = Sampling.crossValidation
    Cvs = numpy.array([1])
    penalties = numpy.zeros((foldsSet.shape[0], numParams))
    betas = numpy.zeros(gammas.shape[0])

    # One penalty row per folds setting.
    for k in range(foldsSet.shape[0]):
        folds = foldsSet[k]
        logging.debug("Folds " + str(folds))
        idx = sampleMethod(folds, trainX.shape[0])

        #Now try penalisation
        resultsList = learner.parallelPen(trainX, trainY, idx, paramDict, Cvs)
        bestLearner, trainErrors, currentPenalties = resultsList[0]
        penalties[k, :] = currentPenalties

    # Fit log-log regression per gamma over the usable (finite, positive)
    # penalty entries; a slope needs at least two points.
    for i in range(gammas.shape[0]):
        inds = numpy.logical_and(numpy.isfinite(penalties[:, i]), penalties[:, i]>0)
        tempPenalties = penalties[:, i][inds]
        # numpy.float was removed along with numpy.int; builtin float is
        # the documented replacement (same float64 dtype).
        tempfoldsSet = numpy.array(foldsSet, float)[inds]

        if tempPenalties.shape[0] > 1:
            x = numpy.log((tempfoldsSet-1)/tempfoldsSet*sampleSize)
            y = numpy.log(tempPenalties)+numpy.log(tempfoldsSet)

            clf = linear_model.LinearRegression()
            clf.fit(numpy.array([x]).T, y)
            betas[i] = clf.coef_[0]

    betas = -betas
    nptst.assert_array_equal(betaGrid, betas)
# NOTE(review): this is a fragment of a larger evaluation routine -- the
# enclosing function definition (supplying numParams, numRealisations,
# loadMethod, sampleMethod, learner, meanPenalties, meanErrors, k, etc.)
# and the remainder of the innermost CV loop lie outside this view.
treeLeaveSizes = numpy.zeros(numParams)

# One iteration per random realisation of the dataset.
for j in range(numRealisations):
    print("")
    logging.debug("j=" + str(j))

    trainX, trainY, testX, testY = loadMethod(dataDir, datasetName, j)
    logging.debug("Loaded dataset with " + str(trainX.shape) + " train and " + str(testX.shape) + " test examples")

    # Subsample the training set down to sampleSize examples.
    trainInds = numpy.random.permutation(trainX.shape[0])[0:sampleSize]
    trainX = trainX[trainInds,:]
    trainY = trainY[trainInds]

    idx = sampleMethod(folds, trainX.shape[0])

    #Now try penalisation
    resultsList = learner.parallelPen(trainX, trainY, idx, paramDict, Cvs)
    bestLearner, trainErrors, currentPenalties = resultsList[0]
    # Accumulate per-realisation results; presumably averaged by the
    # caller after the loop -- TODO confirm against the enclosing code.
    meanPenalties[k] += currentPenalties
    meanTrainError += trainErrors
    predY = bestLearner.predict(testX)
    meanErrors[k] += bestLearner.getMetricMethod()(testY, predY)

    #Compute ideal penalties and error on training data
    meanIdealPenalities[k] += learner.parallelPenaltyGrid(trainX, trainY, testX, testY, paramDict)

    # For each gamma setting, re-evaluate error over the CV folds.
    for i in range(len(paramDict["setGamma"])):
        allError = 0
        learner.setGamma(paramDict["setGamma"][i])

        # Loop body continues beyond this view.
        for trainInds, testInds in idx:
            validX = trainX[trainInds, :]
            validY = trainY[trainInds]