Example #1
    def learnModel(self, X, y, folds=3):
        """
        Train using the given examples and labels, however first conduct grid
        search in conjunction with cross validation to find the best parameters.
        We also conduct filtering with a variety of values. 
        """
        #Hard-coding this grid of C values is not ideal
        Cs = 2.0**numpy.arange(-2, 7, dtype=numpy.float64)
        #Cs = numpy.array([0.1, 2.0])

        if self.waveletInds is None:
            self.waveletInds = numpy.arange(X.shape[1])

        nonWaveletInds = numpy.setdiff1d(numpy.arange(X.shape[1]),  self.waveletInds)

        Xw = X[:, self.waveletInds]
        Xo = X[:, nonWaveletInds]

        featureInds = numpy.flipud(numpy.argsort(numpy.sum(Xw**2, 0)))
        meanAUCs = numpy.zeros((Cs.shape[0], self.candidatesN.shape[0]))
        stdAUCs = numpy.zeros((Cs.shape[0], self.candidatesN.shape[0]))

        #Standardise the data
        Xw = Standardiser().standardiseArray(Xw)
        Xo = Standardiser().standardiseArray(Xo)

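        #Grid search: score every (C, number of wavelet features) pair by
        #stratified cross-validated AUC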
        for i in range(Cs.shape[0]):
            for j in range(self.candidatesN.shape[0]):
                self.linearSVM.setC(Cs[i])
                newX = numpy.c_[Xw[:, featureInds[0:self.candidatesN[j]]], Xo]
                meanAUCs[i, j], stdAUCs[i, j] = self.linearSVM.evaluateStratifiedCv(newX, y, folds, metricMethod=Evaluator.auc)

        (bestI, bestJ) = numpy.unravel_index(numpy.argmax(meanAUCs), meanAUCs.shape)
        self.linearSVM.setC(Cs[bestI])
        self.featureInds = numpy.r_[self.waveletInds[featureInds[0:self.candidatesN[bestJ]]], nonWaveletInds]
        logging.debug("Best learner found: " + str(self.linearSVM) + " N:" + str(self.candidatesN[bestJ]))

        self.standardiser = Standardiser()
        newX = self.standardiser.standardiseArray(X[:, self.featureInds])
        self.linearSVM.learnModel(newX, y)
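
A minimal usage sketch, assuming a hypothetical WaveletPredictor host class that owns linearSVM, candidatesN and waveletInds; every name besides learnModel is an assumption, not from the original source:

import numpy

X = numpy.random.rand(100, 60)                       #100 examples, 60 features
y = (numpy.random.rand(100) < 0.5).astype(numpy.int64)

predictor = WaveletPredictor()                       #Hypothetical host class
predictor.candidatesN = numpy.array([10, 20, 40])    #Candidate feature counts to try
predictor.waveletInds = None                         #Treat every column as a wavelet feature
predictor.learnModel(X, y, folds=3)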
Example #2
    def saveResults(self, leafRankGenerators, standardise=True):
        """
        Compute the results and save them for a particular hormone. Does so for all
        leafranks
        """
        j = 0
        nonNaInds = self.YList[j][1]
        hormoneInd = self.hormoneInds[j]

        k = 2
        if isinstance(self.X, numpy.ndarray):
            X = self.X[nonNaInds, :]
        else:
            X = self.X[j][nonNaInds, :]
        X = numpy.c_[X, self.ages[nonNaInds]]
        if standardise:
            X = Standardiser().standardiseArray(X)
        Y = hormoneInd[k]

        waveletInds = numpy.arange(X.shape[1]-1)

        logging.debug("Shape of examples: " + str(X.shape))
        logging.debug("Distribution of labels: " + str(numpy.bincount(Y)))

        #pca = decomp.PCA(n_components=40)
        #X = pca.fit_transform(X)
        #print(X.shape)

        #Go through all the leafRanks
        for i in range(len(leafRankGenerators)):
            #Compute TreeRankForest here
            fileName = self.resultsDir + "TreeRankForest-" + self.hormoneNames[j] + "_" + str(k) + "-" +  leafRankGenerators[i][1]  + "-" + self.featuresName +  ".dat"
            try:
                logging.debug("Computing file " + fileName)
                #treeRankForest = TreeRankForest(self.funcLeafRankGenerators[0][0](waveletInds))
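                #Note: the loop index i only affects the file name here; the first generator is always used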
                treeRankForest = TreeRankForest(self.leafRankGenerators[0][0])
                treeRankForest.setMaxDepth(10)
                treeRankForest.setNumTrees(5)
                #Setting this low definitely helps 
                #treeRankForest.setFeatureSize(1.0)
                treeRankForest.setFeatureSize(0.05)
                #The following 2 lines definitely improve stability and the AUC 
                treeRankForest.setSampleSize(1.0)
                #Setting this to true results in slightly worse results 
                treeRankForest.setSampleReplace(True)
                mean, var = treeRankForest.evaluateStratifiedCv(X, Y, self.folds, metricMethod=Evaluator.auc)
                print(mean)

                #treeRank = TreeRank(self.leafRankGenerators[0][0])
                #treeRank.setMaxDepth(self.maxDepth)
                #(bestParams, allMetrics, bestMetaDicts) = treeRank.evaluateCvOuter(X, Y, self.folds)
                #print(str(allMetrics))


                #Util.savePickle(cvResults, fileName)
            except Exception:
                logging.debug("Caught an error in the code ... re-raising")
                raise
            else:
                logging.debug("Finished computing file " + fileName)
        return
Example #3
    def __init__(self, learningAlg, windowSize, preprocessor=None):
        self.windowSize = windowSize
        self.learningAlg = learningAlg
        #Avoid sharing one default Standardiser across every instance
        self.preprocessor = preprocessor if preprocessor is not None else Standardiser()
        self.printStep = 50
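
A short construction sketch; the WindowedLearner class name and the use of scikit-learn's LinearSVC as the learning algorithm are assumptions, not from the original source:

from sklearn.svm import LinearSVC
from apgl.data.Standardiser import Standardiser

#Hypothetical class name; omitting preprocessor yields a fresh Standardiser
learner = WindowedLearner(LinearSVC(), windowSize=50)

#Or supply an explicit preprocessor instead of the default
learner2 = WindowedLearner(LinearSVC(), windowSize=50, preprocessor=Standardiser())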
Example #4
"""
Compare the clustering methods in scikits.learn to see which ones are fastest
and most accurate 
"""
import time
import numpy
import sklearn.cluster as cluster
from apgl.data.Standardiser import Standardiser
import scipy.cluster.vq as vq

numExamples = 10000
numFeatures = 500

X = numpy.random.rand(numExamples, numFeatures)
X = Standardiser().standardiseArray(X)

k = 10
numRuns = 10
maxIter = 100
tol = 10**-4

initialCentroids = X[0:k, :]

#Quite fast
print("Running scikits learn k means")
clusterer = cluster.KMeans(k=k,
                           n_init=numRuns,
                           tol=tol,
                           init=intialCentroids,
                           max_iter=maxIter)
start = time.clock()
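
The snippet is cut off before the comparison the docstring promises; a minimal sketch of timing scipy's kmeans2 on the same data and starting centroids, as a hypothetical continuation:

#Compare against scipy's implementation with the same starting centroids
start = time.perf_counter()
centroids, labels = vq.kmeans2(X, initialCentroids, iter=maxIter, minit="matrix")
print("scipy kmeans2 took " + str(time.perf_counter() - start) + " seconds")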
Example #5
import numpy
import sklearn.cluster
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D    #Registers the 3d projection on older matplotlib versions
from apgl.data.Standardiser import Standardiser

numExamples = 100
numFeatures = 3
std = 0.1

V = numpy.random.rand(numExamples, numFeatures)
V[0:20, :] = numpy.random.randn(20, numFeatures) * std
V[0:20, 0:3] += numpy.array([1, 0.2, -1])

V[20:70, :] = numpy.random.randn(50, numFeatures) * std
V[20:70, 0:3] += numpy.array([-0.5, 1, -1])

V[70:, :] = numpy.random.randn(30, numFeatures) * std
V[70:, 0:3] += numpy.array([-0.3, 0.4, -0.1])

U = V - numpy.mean(V, 0)
U = Standardiser().normaliseArray(U.T).T

fig = plt.figure(0)
ax = fig.add_subplot(111, projection='3d')
ax.scatter(U[0:20, 0], U[0:20, 1], U[0:20, 2], c="red")
ax.scatter(U[20:70, 0], U[20:70, 1], U[20:70, 2], c="blue")
ax.scatter(U[70:, 0], U[70:, 1], U[70:, 2], c="green")

UU = U.dot(U.T)
#s, X = numpy.linalg.eig(UU)
X, a, Y = numpy.linalg.svd(U)

#Now compute true cluster error
k = 3
kmeans = sklearn.cluster.KMeans(n_clusters=k)
kmeans.fit(U)
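
The example stops right after fitting, so the "true cluster error" the final comment promises is never computed; a minimal sketch, assuming the three blocks used to generate V are the ground-truth clusters:

import itertools

#Ground-truth labels from the block structure used to build V
trueLabels = numpy.zeros(numExamples, numpy.int64)
trueLabels[20:70] = 1
trueLabels[70:] = 2

#Cluster labels are arbitrary, so score the best permutation of them
bestAccuracy = 0.0
for perm in itertools.permutations(range(k)):
    mapped = numpy.array([perm[label] for label in kmeans.labels_])
    bestAccuracy = max(bestAccuracy, numpy.mean(mapped == trueLabels))
print("True cluster error: " + str(1.0 - bestAccuracy))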