def learnModel(self, X, y, folds=3):
    """
    Train using the given examples and labels, but first conduct a grid
    search in conjunction with cross validation to find the best parameters.
    We also filter the wavelet features using a range of candidate sizes.
    """
    # Hard coding this is bad
    Cs = 2**numpy.arange(-2, 7, dtype=numpy.float64)
    #Cs = numpy.array([0.1, 2.0])

    if self.waveletInds is None:
        self.waveletInds = numpy.arange(X.shape[1])

    nonWaveletInds = numpy.setdiff1d(numpy.arange(X.shape[1]), self.waveletInds)

    Xw = X[:, self.waveletInds]
    Xo = X[:, nonWaveletInds]

    # Rank the wavelet features by total energy, largest first
    featureInds = numpy.flipud(numpy.argsort(numpy.sum(Xw**2, 0)))

    meanAUCs = numpy.zeros((Cs.shape[0], self.candidatesN.shape[0]))
    stdAUCs = numpy.zeros((Cs.shape[0], self.candidatesN.shape[0]))

    # Standardise the data
    Xw = Standardiser().standardiseArray(Xw)
    Xo = Standardiser().standardiseArray(Xo)

    # Grid search over the SVM penalty C and the number of retained wavelet features
    for i in range(Cs.shape[0]):
        for j in range(self.candidatesN.shape[0]):
            self.linearSVM.setC(Cs[i])
            newX = numpy.c_[Xw[:, featureInds[0:self.candidatesN[j]]], Xo]
            meanAUCs[i, j], stdAUCs[i, j] = self.linearSVM.evaluateStratifiedCv(newX, y, folds, metricMethod=Evaluator.auc)

    (bestI, bestJ) = numpy.unravel_index(numpy.argmax(meanAUCs), meanAUCs.shape)
    self.linearSVM.setC(Cs[bestI])
    self.featureInds = numpy.r_[self.waveletInds[featureInds[0:self.candidatesN[bestJ]]], nonWaveletInds]
    logging.debug("Best learner found: " + str(self.linearSVM) + " N:" + str(self.candidatesN[bestJ]))

    # Refit on the full data using the best parameters and selected features
    self.standardiser = Standardiser()
    newX = self.standardiser.standardiseArray(X[:, self.featureInds])
    self.linearSVM.learnModel(newX, y)
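# A minimal standalone sketch of the grid search above using scikit-learn
# primitives, for readers unfamiliar with the apgl wrappers. LinearSVC,
# StratifiedKFold and roc_auc_score stand in for self.linearSVM and
# Evaluator.auc; gridSearchAuc and its arguments are hypothetical names,
# not part of this codebase.
import numpy
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

def gridSearchAuc(Xw, Xo, y, Cs, candidatesN, featureInds, folds=3):
    # Mean CV AUC for every (C, number of wavelet features) pair
    meanAUCs = numpy.zeros((Cs.shape[0], candidatesN.shape[0]))
    for i, C in enumerate(Cs):
        for j, n in enumerate(candidatesN):
            newX = numpy.c_[Xw[:, featureInds[0:n]], Xo]
            aucs = []
            for trainInds, testInds in StratifiedKFold(folds).split(newX, y):
                svm = LinearSVC(C=C).fit(newX[trainInds], y[trainInds])
                aucs.append(roc_auc_score(y[testInds], svm.decision_function(newX[testInds])))
            meanAUCs[i, j] = numpy.mean(aucs)
    return meanAUCs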
def saveResults(self, leafRankGenerators, standardise=True):
    """
    Compute the results and save them for a particular hormone. Does so for
    all leaf ranks.
    """
    j = 0
    nonNaInds = self.YList[j][1]
    hormoneInd = self.hormoneInds[j]
    k = 2

    if isinstance(self.X, numpy.ndarray):
        X = self.X[nonNaInds, :]
    else:
        X = self.X[j][nonNaInds, :]
    X = numpy.c_[X, self.ages[nonNaInds]]

    if standardise:
        X = Standardiser().standardiseArray(X)
    Y = hormoneInd[k]
    waveletInds = numpy.arange(X.shape[1]-1)

    logging.debug("Shape of examples: " + str(X.shape))
    logging.debug("Distribution of labels: " + str(numpy.bincount(Y)))

    #pca = decomp.PCA(n_components=40)
    #X = pca.fit_transform(X)
    #print(X.shape)

    # Go through all the leaf ranks
    for i in range(len(leafRankGenerators)):
        # Compute TreeRankForest here
        fileName = self.resultsDir + "TreeRankForest-" + self.hormoneNames[j] + "_" + str(k) + "-" + leafRankGenerators[i][1] + "-" + self.featuresName + ".dat"
        try:
            logging.debug("Computing file " + fileName)
            # NB: currently always uses the first generator, not leafRankGenerators[i]
            #treeRankForest = TreeRankForest(self.funcLeafRankGenerators[0][0](waveletInds))
            treeRankForest = TreeRankForest(self.leafRankGenerators[0][0])
            treeRankForest.setMaxDepth(10)
            treeRankForest.setNumTrees(5)
            # Setting this low definitely helps
            #treeRankForest.setFeatureSize(1.0)
            treeRankForest.setFeatureSize(0.05)
            # The following 2 lines definitely improve stability and the AUC
            treeRankForest.setSampleSize(1.0)
            # Setting this to true results in slightly worse results
            treeRankForest.setSampleReplace(True)

            mean, var = treeRankForest.evaluateStratifiedCv(X, Y, self.folds, metricMethod=Evaluator.auc)
            print(mean)

            #treeRank = TreeRank(self.leafRankGenerators[0][0])
            #treeRank.setMaxDepth(self.maxDepth)
            #(bestParams, allMetrics, bestMetaDicts) = treeRank.evaluateCvOuter(X, Y, self.folds)
            #print(str(allMetrics))

            #Util.savePickle(cvResults, fileName)
        except:
            logging.debug("Caught an error in the code ... skipping")
            raise
        else:
            logging.debug("File exists: " + fileName)

    return
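# evaluateStratifiedCv above is assumed to return the mean and variance of
# the AUC over stratified cross-validation folds. A standalone sketch of
# that computation with scikit-learn; `model` is any classifier with fit
# and predict_proba, standing in for the TreeRankForest.
import numpy
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

def stratifiedCvAuc(model, X, Y, folds):
    aucs = []
    for trainInds, testInds in StratifiedKFold(folds).split(X, Y):
        model.fit(X[trainInds], Y[trainInds])
        probs = model.predict_proba(X[testInds])[:, 1]
        aucs.append(roc_auc_score(Y[testInds], probs))
    return numpy.mean(aucs), numpy.var(aucs)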
def __init__(self, learningAlg, windowSize, preprocessor=Standardiser()):
    self.windowSize = windowSize
    self.learningAlg = learningAlg
    self.preprocessor = preprocessor
    self.printStep = 50
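# Note that the default preprocessor=Standardiser() is evaluated once at
# definition time, so every instance constructed without an explicit
# preprocessor shares the same Standardiser object. A self-contained
# illustration of this Python behaviour (Preprocessor and Learner are
# made-up names for the demonstration):
class Preprocessor:
    pass

class Learner:
    def __init__(self, preprocessor=Preprocessor()):
        self.preprocessor = preprocessor

a = Learner()
b = Learner()
print(a.preprocessor is b.preprocessor)  # True: both share the default object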
""" Compare the clustering methods in scikits.learn to see which ones are fastest and most accurate """ import time import numpy import sklearn.cluster as cluster from apgl.data.Standardiser import Standardiser import scipy.cluster.vq as vq numExamples = 10000 numFeatures = 500 X = numpy.random.rand(numExamples, numFeatures) X = Standardiser().standardiseArray(X) k = 10 numRuns = 10 maxIter = 100 tol = 10**-4 intialCentroids = X[0:k, :] #Quite fast print("Running scikits learn k means") clusterer = cluster.KMeans(k=k, n_init=numRuns, tol=tol, init=intialCentroids, max_iter=maxIter) start = time.clock()
import numpy
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the 3d projection on older matplotlib
import sklearn.cluster
from apgl.data.Standardiser import Standardiser

numExamples = 100
numFeatures = 3
std = 0.1

# Three Gaussian clusters of sizes 20, 50 and 30 around fixed centres
V = numpy.random.rand(numExamples, numFeatures)
V[0:20, :] = numpy.random.randn(20, numFeatures) * std
V[0:20, 0:3] += numpy.array([1, 0.2, -1])
V[20:70, :] = numpy.random.randn(50, numFeatures) * std
V[20:70, 0:3] += numpy.array([-0.5, 1, -1])
V[70:, :] = numpy.random.randn(30, numFeatures) * std
V[70:, 0:3] += numpy.array([-0.3, 0.4, -0.1])

# Centre the examples, then normalise via the Standardiser (applied to the transpose)
U = V - numpy.mean(V, 0)
U = Standardiser().normaliseArray(U.T).T

fig = plt.figure(0)
ax = fig.add_subplot(111, projection='3d')
ax.scatter(U[0:20, 0], U[0:20, 1], U[0:20, 2], c="red")
ax.scatter(U[20:70, 0], U[20:70, 1], U[20:70, 2], c="blue")
ax.scatter(U[70:, 0], U[70:, 1], U[70:, 2], c="green")

UU = U.dot(U.T)
#s, X = numpy.linalg.eig(UU)
# SVD of the centred data: X holds left singular vectors, a the singular values
X, a, Y = numpy.linalg.svd(U)

# Now compute true cluster error
k = 3
kmeans = sklearn.cluster.KMeans(n_clusters=k)
kmeans.fit(U)
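# The "compute true cluster error" comment suggests comparing the k-means
# labels against the known generating clusters (examples 0:20, 20:70 and
# 70:100). One way to do that is the adjusted Rand index from scikit-learn;
# the metric choice is an assumption, not the original code.
from sklearn.metrics import adjusted_rand_score

trueLabels = numpy.zeros(numExamples, numpy.int64)
trueLabels[20:70] = 1
trueLabels[70:] = 2
print("Adjusted Rand index: " + str(adjusted_rand_score(trueLabels, kmeans.labels_)))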