def testScaleArray(self):
    """
    Check that scaleArray maps every column onto the range [-1, 1], and that
    a second call on the same Standardiser scales matching rows identically.
    """
    rows, cols = 10, 3
    tol = 10**-6

    data = numpy.random.rand(rows, cols)
    scaler = Standardiser()
    scaled = scaler.scaleArray(data)

    # After scaling, each column minimum should be -1 and each maximum +1
    unit = numpy.ones(data.shape[1])
    columnMins = numpy.amin(scaled, 0)
    columnMaxs = numpy.amax(scaled, 0)
    self.assertTrue(numpy.linalg.norm(columnMins + unit) <= tol)
    self.assertTrue(numpy.linalg.norm(columnMaxs - unit) <= tol)

    # Now test standardisation on another matrix: scaling a matrix that
    # extends the first one must leave the shared rows unchanged
    scaler = Standardiser()
    base = numpy.array([[2, 1], [-1, -2], [0.6, 0.3]])
    baseScaled = scaler.scaleArray(base)

    extended = numpy.array([[2, 1], [-1, -2], [0.6, 0.3], [4, 2]])
    extendedScaled = scaler.scaleArray(extended)
    self.assertTrue(numpy.linalg.norm(extendedScaled[0:3, :] - baseScaled) < tol)
def matrixSimilarity(self, V1, V2):
    """
    Compute a vertex similarity matrix C, such that the ijth entry is the
    matching score between V1_i and V2_j, where larger is a better match.

    Both vertex arrays are normalised together, pairwise distances are
    computed with Util.distanceMatrix, and the distances are rescaled
    linearly so that the smallest distance scores 1 and the largest scores 0.
    If all distances are equal, every score is 1.

    :param V1: array whose rows are the first set of vertices
    :param V2: array whose rows are the second set of vertices
    :returns: array of shape (V1.shape[0], V2.shape[0]) of similarity scores
    """
    numFirst = V1.shape[0]

    # Normalise both sets jointly, then split them apart again
    stacked = Standardiser().normaliseArray(numpy.r_[V1, V2])
    V1 = stacked[0:numFirst, :]
    V2 = stacked[numFirst:, :]

    # Let's compute C from the distance between vertices
    D = Util.distanceMatrix(V1, V2)
    largest = numpy.max(D)
    smallest = numpy.min(D)
    spread = largest - smallest

    if spread == 0:
        # No variation in distance: every pair matches equally well
        return numpy.ones((V1.shape[0], V2.shape[0]))

    return (largest - D) / spread
def cluster(self, graph):
    """
    Take a graph and cluster using the method in "On spectral clustering:
    analysis and algorithm" by Ng et al., 2001.

    The eigenvectors of the symmetric normalised Laplacian belonging to the
    self.k smallest eigenvalues are normalised, whitened and then clustered
    with k-means.

    :param graph: the graph to cluster
    :type graph: :class:`apgl.graph.AbstractMatrixGraph`

    :returns: An array of size graph.getNumVertices() of cluster membership
    """
    laplacian = graph.normalisedLaplacianSym()
    eigenvalues, eigenvectors = numpy.linalg.eig(laplacian)
    ascending = numpy.argsort(eigenvalues)
    selected = eigenvectors[:, ascending[0:self.k]]

    # First normalise rows, then columns
    embedding = Standardiser().normaliseArray(selected.T).T
    embedding = vq.whiten(embedding)

    # Using kmeans2 here seems to result in a high variance
    # in the quality of clustering. Therefore stick to kmeans
    centroids, _ = vq.kmeans(embedding, self.k, iter=self.numIterKmeans)
    membership, _ = vq.vq(embedding, centroids)

    return membership
def loadData():
    """
    Load the metabolomics data files from PathDefaults.getDataDir() +
    "metabolomic/" using R's utils package (via importr).

    :returns: a tuple (X, X2, Xs, (Xopls1, Xopls2, Xopls3), YList, ages, df)
        where X is the transposed spectra block from "data.RMN.total.6.txt",
        X2 the block from "data.sportsmen.log.AP.1.txt", Xs is X passed
        through Standardiser.standardiseArray, the Xopls arrays come from
        the IGF1, cort and testo ".log.OSC.1.txt" files, YList is built by
        MetabolomicsUtils.createLabelList, ages are the standardised values
        of the "Age" column (missing values replaced by the mean) and df is
        the last data frame read (the testo file).
    """
    utilsLib = importr('utils')
    dataDir = PathDefaults.getDataDir() + "metabolomic/"

    def readTable(name):
        # Every file is comma separated with a header and row names in column 1
        return utilsLib.read_table(dataDir + name, header=True, row_names=1, sep=",")

    def extractColumns(frame, startIndex, endIndex):
        # Select the columns indexed by range(startIndex, endIndex) and transpose
        columns = frame.rx(robjects.IntVector(range(startIndex, endIndex)))
        return numpy.array(columns).T

    df = readTable("data.RMN.total.6.txt")
    X = extractColumns(df, 1, 951)

    #Load age and normalise (missing values are assigned the mean)
    ages = numpy.array(df.rx(robjects.StrVector(["Age"]))).ravel()
    missing = numpy.isnan(ages)
    ages[missing] = numpy.mean(ages[numpy.logical_not(missing)])
    ages = Standardiser().standardiseArray(ages)

    Xs = Standardiser().standardiseArray(X)

    df = readTable("data.sportsmen.log.AP.1.txt")
    X2 = extractColumns(df, 1, 419)

    #Load the OPLS corrected files
    df = readTable("IGF1.log.OSC.1.txt")
    Xopls1 = extractColumns(df, 22, 441)

    df = readTable("cort.log.OSC.1.txt")
    Xopls2 = extractColumns(df, 20, 439)

    df = readTable("testo.log.OSC.1.txt")
    Xopls3 = extractColumns(df, 22, 441)

    #Let's load all the label data here (taken from the last file read)
    labelNames = MetabolomicsUtils.getLabelNames()
    YList = MetabolomicsUtils.createLabelList(df, labelNames)

    return X, X2, Xs, (Xopls1, Xopls2, Xopls3), YList, ages, df
class DecisionTreeF(AbstractFunctionalPredictor):
    """
    A decision tree which chooses how many wavelet features to keep by
    cross validated AUC before training on the selected features.
    """
    def __init__(self):
        super(DecisionTreeF, self).__init__()
        self.decisionTree = DecisionTree()

    def learnModel(self, X, y, folds=3):
        """
        Train using the given examples and labels, however first conduct grid
        search in conjunction with cross validation to find the best number of
        wavelet features. Wavelet features are ranked by their sum of squares
        and, for each value in self.candidatesN, the top-ranked ones are
        evaluated together with all non-wavelet features.

        :param X: the examples, one per row
        :param y: the labels
        :param folds: the number of cross validation folds
        """
        # "is None" rather than "== None": once waveletInds is an array the
        # equality is element-wise and cannot be used as an if condition
        if self.waveletInds is None:
            self.waveletInds = numpy.arange(X.shape[1])

        nonWaveletInds = numpy.setdiff1d(numpy.arange(X.shape[1]), self.waveletInds)

        Xw = X[:, self.waveletInds]
        Xo = X[:, nonWaveletInds]

        # Rank wavelet features by decreasing energy of the raw columns
        featureInds = numpy.flipud(numpy.argsort(numpy.sum(Xw**2, 0)))
        meanAUCs = numpy.zeros(self.candidatesN.shape[0])
        stdAUCs = numpy.zeros(self.candidatesN.shape[0])

        #Standardise the data
        Xw = Standardiser().standardiseArray(Xw)
        Xo = Standardiser().standardiseArray(Xo)

        for i in range(self.candidatesN.shape[0]):
            newX = numpy.c_[Xw[:, featureInds[0:self.candidatesN[i]]], Xo]
            meanAUCs[i], stdAUCs[i] = self.decisionTree.evaluateStratifiedCv(newX, y, folds, metricMethod=Evaluator.auc)

        bestI = numpy.argmax(meanAUCs)
        self.featureInds = numpy.r_[self.waveletInds[featureInds[0:self.candidatesN[bestI]]], nonWaveletInds]
        logging.debug("Best learner found: " + str(self.decisionTree) + " N:" + str(self.candidatesN[bestI]))

        # Keep the standardiser so predict() can apply it to new examples
        self.standardiser = Standardiser()
        newX = self.standardiser.standardiseArray(X[:, self.featureInds])
        self.decisionTree.learnModel(newX, y)

    def predict(self, X):
        """
        Predict labels for X using the features chosen in learnModel.
        """
        newX = self.standardiser.standardiseArray(X[:, self.featureInds])
        return self.decisionTree.predict(newX)

    @staticmethod
    def generate(waveletInds=None):
        """
        Generate a classifier which does a grid search.

        :param waveletInds: indices passed to setWaveletInds on each new predictor
        :returns: a zero-argument function creating a configured DecisionTreeF
        """
        def generatorFunc():
            decisionTree = DecisionTreeF()
            decisionTree.setWaveletInds(waveletInds)
            return decisionTree
        return generatorFunc

    def setWeight(self, weight):
        """
        Forward the weight to the underlying decision tree.
        """
        self.decisionTree.setWeight(weight)
def testUnstandardiseArray(self):
    """
    Check that unstandardiseArray undoes standardiseArray on a random matrix.
    """
    shape = (10, 3)
    tol = 10**-6
    standardiser = Standardiser()

    # Test an everyday matrix: a round trip should recover the original
    original = numpy.random.rand(*shape)
    standardised = standardiser.standardiseArray(original)
    recovered = standardiser.unstandardiseArray(standardised)

    roundTripError = numpy.linalg.norm(recovered - original)
    self.assertTrue(roundTripError < tol)
def testLearnModel(self):
    """
    Fit KernelShiftRegression with a linear kernel to linear targets, with
    and without a constant shift, for single and multi-label outputs.
    """
    numExamples = 50
    numFeatures = 200
    tol = 0.05

    X = numpy.random.randn(numExamples, numFeatures)
    X = Standardiser().standardiseArray(X)
    kernel = LinearKernel()

    def fitAndPredict(regulariser, targets):
        # Train a fresh predictor and return its offset and training predictions
        model = KernelShiftRegression(kernel, regulariser)
        alpha, offset = model.learnModel(X, targets)
        return offset, model.predict(X)

    weights = numpy.random.rand(numFeatures)
    y = numpy.dot(X, weights)

    b, predY = fitAndPredict(0.0001, y)
    self.assertTrue(Evaluator.rootMeanSqError(y, predY) < tol)

    #Try increasing y
    lmbda = 0.2
    y = y + 5
    b, predY = fitAndPredict(lmbda, y)
    self.assertTrue(numpy.abs(b - 5) < 0.1)
    self.assertTrue(Evaluator.rootMeanSqError(y, predY) < 0.1)

    #Try making prediction for multilabel Y
    C = numpy.random.rand(numFeatures, numFeatures)
    Y = numpy.dot(X, C)
    b, predY = fitAndPredict(lmbda, Y)
    self.assertTrue(Evaluator.rootMeanSqError(Y, predY) < 0.1)

    #Now, shift the data
    s = numpy.random.rand(numFeatures)
    Y = Y + s
    b, predY = fitAndPredict(lmbda, Y)
    self.assertTrue(numpy.linalg.norm(b - s) < 0.1)
    self.assertTrue(Evaluator.rootMeanSqError(Y, predY) < 0.1)
def testStandardiseArray(self):
    """
    Check that standardiseArray output sums to zero with unit sum of squares
    per column, and that a later call reuses the same transformation.
    """
    numExamples = 10
    numFeatures = 3

    preprocessor = Standardiser()

    #Test an everyday matrix
    X = numpy.random.rand(numExamples, numFeatures)
    Xs = preprocessor.standardiseArray(X)

    # assertAlmostEqual: the assertAlmostEquals alias was removed in Python 3.12
    self.assertAlmostEqual(numpy.sum(Xs), 0, places=3)
    self.assertAlmostEqual(numpy.sum(Xs*Xs), numFeatures, places=3)

    #Now, test on a portion of a matrix
    Xss = preprocessor.standardiseArray(X[1:5, :])
    self.assertTrue((Xss == Xs[1:5, :]).all())
def learnModel(self, X, y, folds=3):
    """
    Train using the given examples and labels, however first conduct grid
    search in conjunction with cross validation to find the best number of
    wavelet features. Wavelet features are ranked by their sum of squares
    and, for each value in self.candidatesN, the top-ranked ones are
    evaluated together with all non-wavelet features.

    :param X: the examples, one per row
    :param y: the labels
    :param folds: the number of cross validation folds
    """
    # "is None" rather than "== None": once waveletInds is an array the
    # equality is element-wise and cannot be used as an if condition
    if self.waveletInds is None:
        self.waveletInds = numpy.arange(X.shape[1])

    nonWaveletInds = numpy.setdiff1d(numpy.arange(X.shape[1]), self.waveletInds)

    Xw = X[:, self.waveletInds]
    Xo = X[:, nonWaveletInds]

    # Rank wavelet features by decreasing energy of the raw columns
    featureInds = numpy.flipud(numpy.argsort(numpy.sum(Xw**2, 0)))
    meanAUCs = numpy.zeros(self.candidatesN.shape[0])
    stdAUCs = numpy.zeros(self.candidatesN.shape[0])

    #Standardise the data
    Xw = Standardiser().standardiseArray(Xw)
    Xo = Standardiser().standardiseArray(Xo)

    for i in range(self.candidatesN.shape[0]):
        newX = numpy.c_[Xw[:, featureInds[0:self.candidatesN[i]]], Xo]
        meanAUCs[i], stdAUCs[i] = self.decisionTree.evaluateStratifiedCv(newX, y, folds, metricMethod=Evaluator.auc)

    bestI = numpy.argmax(meanAUCs)
    self.featureInds = numpy.r_[self.waveletInds[featureInds[0:self.candidatesN[bestI]]], nonWaveletInds]
    logging.debug("Best learner found: " + str(self.decisionTree) + " N:" + str(self.candidatesN[bestI]))

    # Keep the standardiser on the instance for use on later examples
    self.standardiser = Standardiser()
    newX = self.standardiser.standardiseArray(X[:, self.featureInds])
    self.decisionTree.learnModel(newX, y)
def testCentreArray(self):
    """
    Check that centreArray subtracts the stored centre vector and that a
    later call on a subset of rows reuses the same vector.
    """
    numExamples = 10
    numFeatures = 3

    preprocessor = Standardiser()

    #Test an everyday matrix
    X = numpy.random.rand(numExamples, numFeatures)
    Xc = preprocessor.centreArray(X)
    centreV = preprocessor.getCentreVector()

    # assertAlmostEqual: the assertAlmostEquals alias was removed in Python 3.12
    self.assertAlmostEqual(numpy.sum(Xc), 0, places=3)
    self.assertTrue((X-centreV == Xc).all())

    #Now take out 3 rows of X, centre them and compare to the centred X
    Xs = X[0:3, :]
    Xsc = preprocessor.centreArray(Xs)
    self.assertTrue((Xsc == Xc[0:3, :]).all())
def testNormaliseArray(self):
    """
    Check that normaliseArray gives unit-norm columns, copes with an all-zero
    column, and that a later call on a subset of rows reuses the same norms.
    """
    numExamples = 10
    numFeatures = 3

    preprocessor = Standardiser()

    #Test an everyday matrix
    X = numpy.random.rand(numExamples, numFeatures)
    Xn = preprocessor.normaliseArray(X)
    normV = preprocessor.getNormVector()

    # assertAlmostEqual: the assertAlmostEquals alias was removed in Python 3.12
    self.assertAlmostEqual(numpy.sum(Xn*Xn), numFeatures, places=3)

    norms = numpy.sum(Xn*Xn, 0)

    for norm in norms:
        self.assertAlmostEqual(norm, 1, places=3)

    self.assertTrue((X/normV == Xn).all())

    #Zero one column
    preprocessor = Standardiser()
    X[:, 1] = 0
    Xn = preprocessor.normaliseArray(X)
    normV = preprocessor.getNormVector()
    self.assertAlmostEqual(numpy.sum(Xn*Xn), numFeatures-1, places=3)
    self.assertTrue((X/normV == Xn).all())

    #Now take out 3 rows of X, normalise and compare to normalised X
    Xs = X[0:3, :]
    Xsn = preprocessor.normaliseArray(Xs)
    self.assertTrue((Xsn == Xn[0:3, :]).all())
def testClassify(self):
    """
    Check that KernelRidgeRegression.classify only outputs labels in {-1, +1}.
    """
    numExamples = 10
    numFeatures = 20

    X = numpy.random.randn(numExamples, numFeatures)
    y = numpy.sign(numpy.random.randn(numExamples))
    logging.debug(y)

    preprocessor = Standardiser()
    X = preprocessor.standardiseArray(X)

    lmbda = 1.0
    kernel = LinearKernel()

    predictor = KernelRidgeRegression(kernel, lmbda)
    predictor.learnModel(X, y)
    classY, predY = predictor.classify(X)

    # Every predicted class must be one of the two labels
    self.assertTrue(numpy.logical_or(classY == 1, classY == -1).all())
def learnModel(self, X, y, folds=3):
    """
    Train using the given examples and labels, however first conduct grid
    search in conjunction with cross validation to find the best parameters.
    The grid covers the SVM penalty C and, for each value in
    self.candidatesN, the number of top-ranked wavelet features to keep
    (ranked by their sum of squares). Non-wavelet features are always kept.

    :param X: the examples, one per row
    :param y: the labels
    :param folds: the number of cross validation folds
    """
    #Hard coding this is bad
    # The numpy.float alias was removed in NumPy 1.24; builtin float is equivalent
    Cs = 2**numpy.arange(-2, 7, dtype=float)

    # "is None" rather than "== None": once waveletInds is an array the
    # equality is element-wise and cannot be used as an if condition
    if self.waveletInds is None:
        self.waveletInds = numpy.arange(X.shape[1])

    nonWaveletInds = numpy.setdiff1d(numpy.arange(X.shape[1]), self.waveletInds)

    Xw = X[:, self.waveletInds]
    Xo = X[:, nonWaveletInds]

    # Rank wavelet features by decreasing energy of the raw columns
    featureInds = numpy.flipud(numpy.argsort(numpy.sum(Xw**2, 0)))
    meanAUCs = numpy.zeros((Cs.shape[0], self.candidatesN.shape[0]))
    stdAUCs = numpy.zeros((Cs.shape[0], self.candidatesN.shape[0]))

    #Standardise the data
    Xw = Standardiser().standardiseArray(Xw)
    Xo = Standardiser().standardiseArray(Xo)

    for i in range(Cs.shape[0]):
        for j in range(self.candidatesN.shape[0]):
            self.linearSVM.setC(Cs[i])
            newX = numpy.c_[Xw[:, featureInds[0:self.candidatesN[j]]], Xo]
            meanAUCs[i, j], stdAUCs[i, j] = self.linearSVM.evaluateStratifiedCv(newX, y, folds, metricMethod=Evaluator.auc)

    (bestI, bestJ) = numpy.unravel_index(numpy.argmax(meanAUCs), meanAUCs.shape)
    self.linearSVM.setC(Cs[bestI])
    self.featureInds = numpy.r_[self.waveletInds[featureInds[0:self.candidatesN[bestJ]]], nonWaveletInds]
    logging.debug("Best learner found: " + str(self.linearSVM) + " N:" + str(self.candidatesN[bestJ]))

    # Keep the standardiser on the instance for use on later examples
    self.standardiser = Standardiser()
    newX = self.standardiser.standardiseArray(X[:, self.featureInds])
    self.linearSVM.learnModel(newX, y)
def testLearnModel2(self):
    """
    Train KernelShiftRegression on random data with more examples than
    features and log the size of the targets and of the training residual.
    This test makes no assertions beyond the calls succeeding.
    """
    numExamples = 200
    numFeatures = 100

    X = numpy.random.randn(numExamples, numFeatures)
    y = numpy.random.randn(numExamples)

    preprocessor = Standardiser()
    X = preprocessor.standardiseArray(X)

    kernel = LinearKernel()

    #Try using a low-rank matrix
    lmbda = 0.001
    predictor = KernelShiftRegression(kernel, lmbda)

    alpha, b = predictor.learnModel(X, y)
    predY = predictor.predict(X)

    logging.debug((numpy.linalg.norm(y)))
    logging.debug((numpy.linalg.norm(predY - y)))
def __init__(self, graph, predictor):
    """
    Create the class from a graph with labelled edges and an edge label
    predictor. A Standardiser is created and used to normalise the vertex
    features with normaliseArray.

    Note that the vertices of the input graph are overwritten with their
    normalised values.

    :param graph: the graph whose vertices are normalised in place
    :param predictor: the predictor stored for later use
    """
    self.graph = graph
    self.predictor = predictor
    self.errorMethod = Evaluator.balancedError

    #Note: We modify the vertices of the input graph!!!!
    # logging.warning: logging.warn is a deprecated alias
    logging.warning("About to modify (normalise) the vertices of the graph.")
    self.preprocessor = Standardiser()
    V = graph.getVertexList().getVertices(graph.getAllVertexIds())
    V = self.preprocessor.normaliseArray(V)
    graph.getVertexList().setVertices(V)
def learnModel(self, X, y, folds=3):
    """
    Train using the given examples and labels, however first conduct grid
    search in conjunction with cross validation to find the best parameters.
    The grid covers the SVM penalty C, the kernel parameter gamma and, for
    each value in self.candidatesN, the number of top-ranked wavelet features
    to keep (ranked by their sum of squares). Non-wavelet features are
    always kept.

    :param X: the examples, one per row
    :param y: the labels
    :param folds: the number of cross validation folds
    """
    #Hard coding this is bad
    # The numpy.float alias was removed in NumPy 1.24; builtin float is equivalent
    Cs = 2**numpy.arange(-2, 6, dtype=float)
    gammas = 2**numpy.arange(-5, 0, dtype=float)

    # "is None" rather than "== None": once waveletInds is an array the
    # equality is element-wise and cannot be used as an if condition
    if self.waveletInds is None:
        self.waveletInds = numpy.arange(X.shape[1])

    nonWaveletInds = numpy.setdiff1d(numpy.arange(X.shape[1]), self.waveletInds)

    Xw = X[:, self.waveletInds]
    Xo = X[:, nonWaveletInds]

    # Rank wavelet features by decreasing energy of the raw columns
    featureInds = numpy.flipud(numpy.argsort(numpy.sum(Xw**2, 0)))
    meanAUCs = numpy.zeros((Cs.shape[0], gammas.shape[0], self.candidatesN.shape[0]))
    stdAUCs = numpy.zeros((Cs.shape[0], gammas.shape[0], self.candidatesN.shape[0]))

    #Standardise the data
    Xw = Standardiser().standardiseArray(Xw)
    Xo = Standardiser().standardiseArray(Xo)

    for i in range(Cs.shape[0]):
        for j in range(gammas.shape[0]):
            for k in range(self.candidatesN.shape[0]):
                self.SVC.setC(Cs[i])
                self.SVC.setGamma(gammas[j])
                newX = numpy.c_[Xw[:, featureInds[0:self.candidatesN[k]]], Xo]
                meanAUCs[i, j, k], stdAUCs[i, j, k] = self.SVC.evaluateStratifiedCv(newX, y, folds, metricMethod=Evaluator.auc)

    (bestI, bestJ, bestK) = numpy.unravel_index(numpy.argmax(meanAUCs), meanAUCs.shape)
    self.SVC.setC(Cs[bestI])
    self.SVC.setGamma(gammas[bestJ])
    self.featureInds = numpy.r_[self.waveletInds[featureInds[0:self.candidatesN[bestK]]], nonWaveletInds]
    logging.debug("Best learner found: " + str(self.SVC) + " N:" + str(self.candidatesN[bestK]))

    # Keep the standardiser on the instance for use on later examples
    self.standardiser = Standardiser()
    newX = self.standardiser.standardiseArray(X[:, self.featureInds])
    self.SVC.learnModel(newX, y)
def __init__(self, examplesFileName):
    """
    Create the class by reading examples from a file. Instantiate the SVM
    and create a preprocessor which standardises the examples; the
    standardised examples replace the originals in the examples list.

    :param examplesFileName: the file passed to ExamplesList.readFromFile
    """
    self.examplesList = examples = ExamplesList.readFromFile(examplesFileName)
    examples.setDefaultExamplesName("X")
    examples.setLabelsName("y")

    # Report the label distribution and the error of always predicting
    # the most frequent label
    labels = examples.getSampledDataField("y").ravel()
    (freqs, items) = Util.histogram(labels)
    logging.info("Distribution of labels: " + str((freqs, items)))
    baseError = float(min(freqs))/examples.getNumExamples()
    logging.info("The base error rate is " + str(baseError))

    self.classifier = LibSVM()
    self.errorMethod = Evaluator.balancedError

    # Standardise the examples and write them back over the raw data
    self.preprocessor = Standardiser()
    rawX = examples.getDataField(examples.getDefaultExamplesName())
    X = self.preprocessor.standardiseArray(rawX)
    examples.overwriteDataField(examples.getDefaultExamplesName(), X)
class SvmEgoSimulator(AbstractDiffusionSimulator):
    """
    A class which combines SVM classification with the EgoSimulation. There are methods
    to run modelSelection, train the SVM and then run the simulation. The simulation itself
    is run using EgoSimulator.
    """
    # Class-level defaults for the attributes used by the methods below
    preprocessor = None
    examplesList = None
    classifier = None
    graph = None
    edgeWeight = 1

    def __init__(self, examplesFileName):
        """
        Create the class by reading examples from a file. Instantiate the SVM
        and create a preprocessor which standardises the examples; the
        standardised examples replace the originals in the examples list.
        """
        self.examplesList = examples = ExamplesList.readFromFile(examplesFileName)
        examples.setDefaultExamplesName("X")
        examples.setLabelsName("y")

        labels = examples.getSampledDataField("y").ravel()
        (freqs, items) = Util.histogram(labels)
        logging.info("Distribution of labels: " + str((freqs, items)))
        baseError = float(min(freqs))/examples.getNumExamples()
        logging.info("The base error rate is " + str(baseError))

        self.classifier = LibSVM()
        self.errorMethod = Evaluator.balancedError

        self.preprocessor = Standardiser()
        rawX = examples.getDataField(examples.getDefaultExamplesName())
        X = self.preprocessor.standardiseArray(rawX)
        examples.overwriteDataField(examples.getDefaultExamplesName(), X)

    def getPreprocessor(self):
        """
        Returns the preprocessor
        """
        return self.preprocessor

    def sampleExamples(self, sampleSize):
        """
        This function exists so that we can sample the same examples used in model selection
        and exclude them when running evaluateClassifier.
        """
        self.examplesList.randomSubData(sampleSize)

    def modelSelection(self, Cs, kernel, kernelParams, errorCosts, folds, sampleSize):
        """
        Perform model selection using an SVM on a random sample of examples.

        :returns: the selected C, kernel parameter and error cost, with the error
        """
        Parameter.checkInt(sampleSize, 0, self.examplesList.getNumExamples())
        Parameter.checkInt(folds, 0, sampleSize)
        Parameter.checkString(kernel, ["linear", "polynomial", "gaussian"])
        Parameter.checkList(Cs, Parameter.checkFloat, [0.0, float("inf")])
        Parameter.checkList(errorCosts, Parameter.checkFloat, [0.0, float("inf")])

        #Perform model selection
        self.examplesList.randomSubData(sampleSize)
        labels = self.examplesList.getSampledDataField("y").ravel()
        (freqs, items) = Util.histogram(labels)

        logging.info("Using " + str(sampleSize) + " examples for model selection")
        logging.info("Distribution of labels: " + str((freqs, items)))
        logging.info("List of Cs " + str(Cs))
        logging.info("List of kernels " + str(kernel))
        logging.info("List of kernelParams " + str(kernelParams))
        logging.info("List of errorCosts " + str(errorCosts))

        selection = self.classifier.cvModelSelection(self.examplesList, Cs, kernelParams, kernel, folds, errorCosts, self.errorMethod)
        CVal, kernelParamVal, errorCost, error = selection
        logging.info("Model selection returned C = " + str(CVal) + " kernelParam = " + str(kernelParamVal) + " errorCost = " + str(errorCost) + " with error " + str(error))
        return CVal, kernelParamVal, errorCost, error

    def evaluateClassifier(self, CVal, kernel, kernelParamVal, errorCost, folds, sampleSize, invert=True):
        """
        Evaluate the SVM with the given parameters. Often model selection is done before this step
        and in that case, invert=True uses a sample excluding those used for model selection.

        :returns: a tuple (means, vars) as returned by the classifier's evaluateCv
        """
        Parameter.checkFloat(CVal, 0.0, float('inf'))
        Parameter.checkFloat(errorCost, 0.0, float('inf'))
        Parameter.checkString(kernel, ["linear", "polynomial", "gaussian"])

        if kernel == "gaussian":
            Parameter.checkFloat(kernelParamVal, 0.0, float('inf'))
        elif kernel == "polynomial":
            Parameter.checkInt(kernelParamVal, 2, float('inf'))

        Parameter.checkInt(folds, 0, sampleSize)
        Parameter.checkInt(sampleSize, 0, self.examplesList.getNumExamples())

        if not invert:
            testIndices = Util.sampleWithoutReplacement(sampleSize, self.examplesList.getNumExamples())
        else:
            # Sample from the examples outside the current permutation indices
            everyIndex = numpy.array(list(range(0, self.examplesList.getNumExamples())))
            unused = numpy.setdiff1d(everyIndex, self.examplesList.getPermutationIndices())
            testIndices = numpy.random.permutation(unused)[0:sampleSize]

        numTest = str(testIndices.shape[0])
        logging.info("Using " + numTest + " examples for SVM evaluation")

        self.examplesList.setPermutationIndices(testIndices)
        self.classifier.setParams(C=CVal, kernel=kernel, kernelParam=kernelParamVal)
        self.classifier.setErrorCost(errorCost)

        (means, variances) = self.classifier.evaluateCv(self.examplesList, folds)

        def withVar(index):
            # Format one statistic as "mean(variance)"
            return str(means[index]) + "(" + str(variances[index]) + ")"

        logging.info("--- Classification evaluation ---")
        logging.info("Error on " + numTest + " examples is " + withVar(0))
        logging.info("Sensitivity (recall = TP/(TP+FN)): " + withVar(1))
        logging.info("Specificity (TN/TN+FP): " + withVar(2))
        logging.info("Error on positives: " + withVar(3))
        logging.info("Error on negatives: " + withVar(4))
        logging.info("Balanced error: " + withVar(5))

        return (means, variances)

    def trainClassifier(self, CVal, kernel, kernelParamVal, errorCost, sampleSize):
        """
        Train the SVM with the given parameters on a random sample of
        sampleSize examples and return the trained classifier.
        """
        Parameter.checkFloat(CVal, 0.0, float('inf'))
        Parameter.checkString(kernel, ["linear", "gaussian", "polynomial"])
        Parameter.checkFloat(kernelParamVal, 0.0, float('inf'))
        Parameter.checkFloat(errorCost, 0.0, float('inf'))
        Parameter.checkInt(sampleSize, 0, self.examplesList.getNumExamples())

        message = "Training SVM with C=" + str(CVal) + ", " + kernel + " kernel"
        message = message + ", param=" + str(kernelParamVal)
        message = message + ", sampleSize=" + str(sampleSize)
        message = message + ", errorCost=" + str(errorCost)
        logging.info(message)

        self.examplesList.randomSubData(sampleSize)
        self.classifier.setC(C=CVal)
        self.classifier.setKernel(kernel=kernel, kernelParam=kernelParamVal)
        self.classifier.setErrorCost(errorCost)

        X = self.examplesList.getSampledDataField(self.examplesList.getDefaultExamplesName())
        y = self.examplesList.getSampledDataField(self.examplesList.getLabelsName()).ravel()
        self.classifier.learnModel(X, y)

        return self.classifier

    def getWeights(self):
        """
        Returns the weights of the classifier
        """
        return self.classifier.getWeights()

    def runSimulation(self, maxIterations):
        """
        Run the diffusion simulation for maxIterations steps using EgoSimulator.

        :returns: the total information after each step (index 0 is the
            initial value) and the transmissions recorded by the simulator
        """
        Parameter.checkInt(maxIterations, 1, float('inf'))

        #Notice that the data is preprocessed in the same way as the survey data
        # NOTE(review): self.graph is not assigned in __init__ - presumably set
        # elsewhere before this is called; verify against callers
        egoSimulator = EgoSimulator(self.graph, self.classifier, self.preprocessor)

        totalInfo = numpy.zeros(maxIterations+1)
        totalInfo[0] = EgoUtils.getTotalInformation(self.graph)
        logging.info("Total number of people with information: " + str(totalInfo[0]))

        logging.info("--- Simulation Started ---")

        for step in range(0, maxIterations):
            logging.info("--- Iteration " + str(step) + " ---")

            self.graph = egoSimulator.advanceGraph()
            totalInfo[step+1] = EgoUtils.getTotalInformation(self.graph)
            logging.info("Total number of people with information: " + str(totalInfo[step+1]))

            #Compute distribution of ages etc. in alters
            alterIndices = egoSimulator.getAlters(step)
            numAlters = len(alterIndices)
            alterAges = numpy.zeros(numAlters)
            alterGenders = numpy.zeros(numAlters)

            # NOTE(review): egoQuestionIds is not defined in this class -
            # presumably provided by the base class; confirm
            for position, alterIndex in enumerate(alterIndices):
                vertex = self.graph.getVertex(alterIndex)
                alterAges[position] = vertex[self.egoQuestionIds.index(("Q5X", 0))]
                alterGenders[position] = vertex[self.egoQuestionIds.index(("Q4", 0))]

            (freqs, items) = Util.histogram(alterAges)
            logging.info("Distribution of ages " + str(freqs) + " " + str(items))
            (freqs, items) = Util.histogram(alterGenders)
            logging.info("Distribution of genders " + str(freqs) + " " + str(items))

        logging.info("--- Simulation Finished ---")

        return totalInfo, egoSimulator.getTransmissions()

    def getVertexFeatureDistribution(self, fIndex, vIndices=None):
        """
        Delegates to the graph's getVertexFeatureDistribution
        """
        return self.graph.getVertexFeatureDistribution(fIndex, vIndices)

    def getPreProcessor(self):
        """
        Returns the preprocessor (same as getPreprocessor)
        """
        return self.preprocessor

    def getClassifier(self):
        """
        Returns the classifier
        """
        return self.classifier
def testLearnModel(self):
    """
    Compare KernelRidgeRegression with a linear kernel against the closed
    form solutions of dual and primal linear ridge regression, on the
    training set, on a separate test set and for multi-label targets.
    """
    numExamples = 50
    numFeatures = 200
    tol = 10**-3

    X = numpy.random.randn(numExamples, numFeatures)
    y = numpy.random.randn(numExamples)
    X = Standardiser().standardiseArray(X)

    kernel = LinearKernel()

    def closedForm(regulariser, evalX):
        # Dual weights from the kernel matrix and primal ridge predictions on evalX
        K = numpy.dot(X, X.T)
        dual = numpy.dot(numpy.linalg.inv(K+regulariser*numpy.eye(numExamples)), y)
        primalInverse = numpy.linalg.inv(numpy.dot(X.T, X) + regulariser*numpy.eye(numFeatures))
        return dual, evalX.dot(primalInverse).dot(X.T).dot(y)

    #Compare Linear kernel with linear ridge regression
    for lmbda in (0.1, 0.5):
        predictor = KernelRidgeRegression(kernel, lmbda)
        alpha = predictor.learnModel(X, y)
        predY = predictor.predict(X)

        alpha2, predY2 = closedForm(lmbda, X)

        self.assertTrue(numpy.linalg.norm(alpha - alpha2) < tol)
        self.assertTrue(numpy.linalg.norm(predY - predY2) < tol)

    #Now test on an alternative test set (lmbda is still 0.5 here)
    numTestExamples = 50
    testX = numpy.random.randn(numTestExamples, numFeatures)
    predictor = KernelRidgeRegression(kernel, lmbda)
    alpha = predictor.learnModel(X, y)
    predY = predictor.predict(testX)

    alpha2, predY2 = closedForm(lmbda, testX)

    self.assertTrue(numpy.linalg.norm(alpha - alpha2) < tol)
    self.assertTrue(numpy.linalg.norm(predY - predY2) < tol)

    #Use the method against a multi-label example
    Y = numpy.random.randn(numExamples, numFeatures)
    alpha = predictor.learnModel(X, Y)

    self.assertTrue(alpha.shape == (numExamples, numFeatures))
def testAdvanceGraph3(self):
    """
    This test will learn from a set of ego and alter pairs, then we will make predictions on
    the pairs and see the results. Then we test if the same results are present in a simulation.
    """
    dataDir = PathDefaults.getDataDir() + "infoDiffusion/"
    matFileName = dataDir + "EgoAlterTransmissions1000.mat"
    examplesList = ExamplesList.readFromMatFile(matFileName)
    examplesList.setDefaultExamplesName("X")
    examplesList.setLabelsName("y")

    logging.debug(("Number of y = +1: " + str(sum(examplesList.getSampledDataField("y") == 1))))
    logging.debug(("Number of y = -1: " + str(sum(examplesList.getSampledDataField("y") == -1))))

    #Standardise the examples
    preprocessor = Standardiser()
    X = examplesList.getDataField(examplesList.getDefaultExamplesName())
    X = preprocessor.standardiseArray(X)
    examplesList.overwriteDataField(examplesList.getDefaultExamplesName(), X)

    classifier = MlpySVM(kernel='linear', kp=1, C=32.0)

    y = examplesList.getDataField("y")
    classifier.learnModel(X, y)
    predY = classifier.classify(X)
    logging.debug(("Number of y = +1: " + str(sum(examplesList.getSampledDataField("y") == 1))))
    logging.debug(("Number of y = -1: " + str(sum(examplesList.getSampledDataField("y") == -1))))

    sampledY = examplesList.getSampledDataField(examplesList.getLabelsName()).ravel()

    error = mlpy.err(sampledY, predY)
    sensitivity = mlpy.sens(sampledY, predY)
    specificity = mlpy.spec(sampledY, predY)
    errorP = mlpy.errp(sampledY, predY)
    errorN = mlpy.errn(sampledY, predY)

    logging.debug("--- Classification evaluation ---")
    logging.debug(("Error on " + str(examplesList.getNumExamples()) + " examples is " + str(error)))
    logging.debug(("Sensitivity (recall = TP/(TP+FN)): " + str(sensitivity)))
    logging.debug(("Specificity (TN/TN+FP): " + str(specificity)))
    logging.debug(("Error on positives: " + str(errorP)))
    logging.debug(("Error on negatives: " + str(errorN)))

    sGraph = EgoUtils.graphFromMatFile(matFileName)

    #Notice that the data is preprocessed in the same way as the survey data
    egoSimulator = EgoSimulator(sGraph, classifier, preprocessor)

    totalInfo = EgoUtils.getTotalInformation(sGraph)
    logging.debug(("Total number of people with information: " + str(totalInfo)))
    # assertEqual: the assertEquals alias was removed in Python 3.12
    self.assertEqual(totalInfo, 1000)

    sGraph = egoSimulator.advanceGraph()

    # Each positive prediction should add one informed person
    totalInfo = EgoUtils.getTotalInformation(sGraph)
    logging.debug(("Total number of people with information: " + str(totalInfo)))
    self.assertEqual(totalInfo, 1000 + sum(predY == 1))

    # The test expects the alter of pair i to be vertex 2*i + 1
    altersList = egoSimulator.getAlters(0)
    predictedAlters = numpy.nonzero(predY == 1)[0]

    self.assertTrue((altersList == predictedAlters*2+1).all())
def clusterFromIterator(self, graphListIterator, verbose=False):
    """
    Find a set of clusters for the graphs given by the iterator. For each
    weight matrix an eigen-decomposition is computed using the algorithm
    named by self.alg, and the leading eigenvectors are clustered with
    k-means. After the first graph, k-means is initialised from the
    centroids of the previous clustering.

    The difference between a weight matrix and the previous one should be
    positive.

    :param graphListIterator: an iterator over symmetric weight matrices
    :param verbose: if true, also return timings and bounds
    :returns: the list of cluster assignments, one per graph. If verbose is
        true, a tuple of that list, an array whose rows are (decomposition
        time, k-means time) per graph, and the list of computed bounds.
    :raises ValueError: if self.alg is not a recognised algorithm
    """
    clustersList = []
    decompositionTimeList = []
    kMeansTimeList = []
    boundList = []
    i = 0

    for subW in graphListIterator:
        if __debug__:
            Parameter.checkSymmetric(subW)

        if self.logStep and i % self.logStep == 0:
            logging.debug("Graph index: " + str(i))
        logging.debug("Clustering graph of size " + str(subW.shape))
        if self.alg!="efficientNystrom":
            ABBA = GraphUtils.shiftLaplacian(subW)

        # --- Eigen value decomposition ---
        startTime = time.time()
        if self.alg=="IASC":
            if i % self.T != 0:
                # Approximate update from the previous eigen-decomposition
                omega, Q = self.approxUpdateEig(subW, ABBA, omega, Q)

                if self.computeBound:
                    inds = numpy.flipud(numpy.argsort(omega))
                    Q = Q[:, inds]
                    omega = omega[inds]
                    bounds = self.pertBound(omega, Q, omegaKbot, AKbot, self.k2)

                    #Now use accurate values of norm of R and delta
                    rank = Util.rank(ABBA.todense())
                    gamma, U = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv = ABBA.shape[0])
                    bounds2 = self.realBound(omega, Q, gamma, AKbot, self.k2)
                    boundList.append([i, bounds[0], bounds[1], bounds2[0], bounds2[1]])
            else:
                # Every self.T graphs the decomposition is recomputed exactly
                logging.debug("Computing exact eigenvectors")
                self.storeInformation(subW, ABBA)

                if self.computeBound:
                    rank = Util.rank(ABBA.todense())
                    omega, Q = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv = ABBA.shape[0])
                    inds = numpy.flipud(numpy.argsort(omega))
                    # Keep the part beyond the top k2 eigenpairs for the bounds
                    omegaKbot = omega[inds[self.k2:]]
                    QKbot = Q[:, inds[self.k2:]]
                    AKbot = (QKbot*omegaKbot).dot(QKbot.T)
                else:
                    omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2, ABBA.shape[0]-1), which="LM", ncv = min(10*self.k2, ABBA.shape[0]))

        elif self.alg == "nystrom":
            omega, Q = Nystrom.eigpsd(ABBA, self.k3)
        elif self.alg == "exact":
            omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k1, ABBA.shape[0]-1), which="LM", ncv = min(15*self.k1, ABBA.shape[0]))
        elif self.alg == "efficientNystrom":
            omega, Q = EfficientNystrom.eigWeight(subW, self.k2, self.k1)
        elif self.alg == "randomisedSvd":
            Q, omega, R = RandomisedSVD.svd(ABBA, self.k4)
        else:
            raise ValueError("Invalid Algorithm: " + str(self.alg))

        decompositionTimeList.append(time.time()-startTime)

        if self.alg=="IASC":
            self.storeInformation(subW, ABBA)

        # --- Kmeans ---
        startTime = time.time()
        inds = numpy.flipud(numpy.argsort(omega))

        standardiser = Standardiser()
        #For some very strange reason we get an overflow when computing the
        #norm of the rows of Q even though its elements are bounded by 1.
        #We'll ignore it for now
        # NOTE(review): if the exception fires, V keeps its value from the
        # previous graph (or is unbound on the first one) - confirm intent
        try:
            V = standardiser.normaliseArray(Q[:, inds[0:self.k1]].real.T).T
        except FloatingPointError as e:
            # logging.warning: logging.warn is a deprecated alias
            logging.warning("FloatingPointError: " + str(e))
        V = VqUtils.whiten(V)
        if i == 0:
            centroids, distortion = vq.kmeans(V, self.k1, iter=self.nb_iter_kmeans)
        else:
            # Warm start from the previous clustering of the shared vertices
            centroids = self.findCentroids(V, clusters[:subW.shape[0]])
            if centroids.shape[0] < self.k1:
                # Top up with randomly chosen rows of V as extra centroids
                nb_missing_centroids = self.k1 - centroids.shape[0]
                random_centroids = V[numpy.random.randint(0, V.shape[0], nb_missing_centroids),:]
                centroids = numpy.vstack((centroids, random_centroids))
            centroids, distortion = vq.kmeans(V, centroids) #iter can only be 1
        clusters, distortion = vq.vq(V, centroids)
        kMeansTimeList.append(time.time()-startTime)

        clustersList.append(clusters)

        # Dump the local variables once memory use passes 10**9
        if ProfileUtils.memory() > 10**9:
            ProfileUtils.memDisplay(locals())

        i += 1

    if verbose:
        return clustersList, numpy.array((decompositionTimeList, kMeansTimeList)).T, boundList
    else:
        return clustersList
class EgoNetworkSimulator(AbstractDiffusionSimulator):
    """
    A class which combines Ego network prediction with simulating information
    transmission within a simulated social network.
    """
    def __init__(self, graph, predictor):
        """
        Store the graph and predictor, and normalise the vertices of the graph.

        Note that the input graph is modified: its vertex array is read,
        passed through Standardiser.normaliseArray and written back.

        :param graph: a graph with labelled edges
        :param predictor: the edge label predictor used for model selection,
            training and simulation
        """
        self.graph = graph
        self.predictor = predictor
        self.errorMethod = Evaluator.balancedError

        #Note: We modify the vertices of the input graph!!!!
        #logging.warn is a deprecated alias, so use logging.warning
        logging.warning("About to modify (normalise) the vertices of the graph.")
        self.preprocessor = Standardiser()
        V = graph.getVertexList().getVertices(graph.getAllVertexIds())
        V = self.preprocessor.normaliseArray(V)
        graph.getVertexList().setVertices(V)

    def getPreprocessor(self):
        """
        Returns the preprocessor
        """
        return self.preprocessor

    def sampleEdges(self, sampleSize):
        """
        This function exists so that we can sample the same examples used in
        model selection and exclude them when running evaluateClassifier.

        :param sampleSize: the number of edges to draw from a random permutation
        :returns: a new SparseGraph sharing the vertex list of self.graph and
            containing only the sampled edges and their values
        """
        edges = self.graph.getAllEdges()
        trainInds = numpy.random.permutation(edges.shape[0])[0:sampleSize]
        trainEdges = edges[trainInds, :]

        trainGraph = SparseGraph(self.graph.getVertexList(), self.graph.isUndirected())
        trainGraph.addEdges(trainEdges, self.graph.getEdgeValues(trainEdges))

        logging.info("Randomly sampled " + str(sampleSize) + " edges")
        return trainGraph

    def modelSelection(self, paramList, paramFunc, folds, errorFunc, sampleSize):
        """
        Perform model selection using an edge label predictor.

        sampleSize is only validated here: the whole graph is currently used
        for model selection (the sampling call is disabled).

        :returns: a tuple (best parameters, paramFunc, lowest mean error)
        """
        Parameter.checkInt(folds, 0, sampleSize)
        Parameter.checkInt(sampleSize, 0, self.graph.getNumEdges())

        #trainGraph = self.sampleEdges(sampleSize)
        trainGraph = self.graph

        #Perform model selection
        meanErrs, stdErrs = self.predictor.cvModelSelection(trainGraph, paramList, paramFunc, folds, errorFunc)
        #Index of the smallest mean error, computed once and reused below
        bestInd = numpy.argmin(meanErrs)

        logging.info("Model selection errors:" + str(meanErrs))
        logging.info("Model selection stds:" + str(stdErrs))
        logging.info("Model selection best parameters:" + str(paramList[bestInd]))

        return paramList[bestInd], paramFunc, meanErrs[bestInd]

    def evaluateClassifier(self, params, paramFuncs, folds, errorFunc, sampleSize, invert=True):
        """
        Evaluate the predictor with the given parameters using cross validation
        on a random sample of sampleSize edges.

        NOTE(review): the invert argument is accepted but not used in this
        method body.

        :returns: the result of self.predictor.cvError on the sampled graph
        """
        Parameter.checkInt(folds, 0, sampleSize)
        Parameter.checkInt(sampleSize, 0, self.graph.getNumEdges())

        trainGraph = self.sampleEdges(sampleSize)
        return self.predictor.cvError(trainGraph, params, paramFuncs, folds, errorFunc)

    def trainClassifier(self, params, paramFuncs, sampleSize):
        """
        Apply each parameter with its setter function, then learn the
        predictor on a random sample of sampleSize edges.

        :returns: the trained predictor
        """
        for j, param in enumerate(params):
            paramFuncs[j](param)

        trainGraph = self.sampleEdges(sampleSize)
        self.predictor.learnModel(trainGraph)

        return self.predictor

    def runSimulation(self, maxIterations):
        """
        Advance the graph maxIterations times with an EgoSimulator, logging the
        total information and the age/gender histograms of the alters at each
        iteration.

        :returns: a tuple (totalInfo, transmissions) where totalInfo has
            maxIterations+1 entries, the first being the initial total
        """
        Parameter.checkInt(maxIterations, 1, float('inf'))

        #Notice that the data is preprocessed in the same way as the survey data
        egoSimulator = EgoSimulator(self.graph, self.predictor, self.preprocessor)

        totalInfo = numpy.zeros(maxIterations+1)
        totalInfo[0] = EgoUtils.getTotalInformation(self.graph)
        logging.info("Total number of people with information: " + str(totalInfo[0]))

        logging.info("--- Simulation Started ---")

        for i in range(0, maxIterations):
            logging.info("--- Iteration " + str(i) + " ---")

            self.graph = egoSimulator.advanceGraph()
            totalInfo[i+1] = EgoUtils.getTotalInformation(self.graph)
            logging.info("Total number of people with information: " + str(totalInfo[i+1]))

            #Compute distribution of ages etc. in alters
            alterIndices = egoSimulator.getAlters(i)
            alterAges = numpy.zeros(len(alterIndices))
            alterGenders = numpy.zeros(len(alterIndices))

            #NOTE(review): self.egoQuestionIds is not assigned anywhere in this
            #class - presumably it is provided by the base class or the caller
            for j, alterIndex in enumerate(alterIndices):
                currentVertex = self.graph.getVertex(alterIndex)
                alterAges[j] = currentVertex[self.egoQuestionIds.index(("Q5X", 0))]
                alterGenders[j] = currentVertex[self.egoQuestionIds.index(("Q4", 0))]

            (freqs, items) = Util.histogram(alterAges)
            logging.info("Distribution of ages " + str(freqs) + " " + str(items))
            (freqs, items) = Util.histogram(alterGenders)
            logging.info("Distribution of genders " + str(freqs) + " " + str(items))

        logging.info("--- Simulation Finished ---")

        return totalInfo, egoSimulator.getTransmissions()

    def getVertexFeatureDistribution(self, fIndex, vIndices=None):
        """
        Delegate to the getVertexFeatureDistribution method of the graph.
        """
        return self.graph.getVertexFeatureDistribution(fIndex, vIndices)

    def getPreProcessor(self):
        """
        Returns the preprocessor (same as getPreprocessor).
        """
        return self.preprocessor

    def getClassifier(self):
        """
        Returns the predictor
        """
        return self.predictor

    #Class-level defaults, overridden per instance in __init__ where assigned
    preprocessor = None
    examplesList = None
    predictor = None
    graph = None
    edgeWeight = 1
#Vectorise the documents and show the vocabulary
X = vectoriser.fit_transform(documentList)
print(vectoriser.get_feature_names())

#Wrap the document-term matrix as a gensim corpus (documents are rows)
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)
#Map each feature index to its feature name
id2WordDict = dict(zip(range(len(vectoriser.get_feature_names())), vectoriser.get_feature_names()))

#Fit an LDA model with k topics and index the corpus in topic space
k = 10
logging.getLogger('gensim').setLevel(logging.INFO)
lda = LdaModel(corpus, num_topics=k, id2word=id2WordDict, chunksize=1000, distributed=False)
index = gensim.similarities.docsim.SparseMatrixSimilarity(lda[corpus], num_features=k)

#Query with a document consisting of the single word "graph", converted to
#a list of (feature index, value) pairs over its nonzero entries
newX = vectoriser.transform(["graph"])
newX = [(i, newX[0, i])for i in newX.nonzero()[1]]
result = lda[newX]
similarities = index[result]
#Sort (document index, similarity) pairs by decreasing similarity
similarities = sorted(enumerate(similarities), key=lambda item: -item[1])
print(similarities)

#Compute Hellinger distance
#Keep only the second element of each pair returned by the LDA model
result = [i[1] for i in result]
newX = scipy.sparse.csc_matrix(result)
distances = SparseUtils.hellingerDistances(index.index, newX)
print(1 - distances)

#Try cosine metric
#normaliseArray is applied to the transposed index matrix and the result is
#transposed back; presumably this normalises each document row - TODO confirm
X = Standardiser().normaliseArray(numpy.array(index.index.todense()).T).T
newX = numpy.array(newX.todense())
similarities = X.dot(newX.T).flatten()
print(similarities)
numExamples = 100
numFeatures = 3
std = 0.1

#Each cluster is described by (row slice, offset added to the first three
#columns, scatter colour). The order matters as it fixes the order of the
#random draws.
clusterSpecs = [
    (slice(0, 20), [1, 0.2, -1], "red"),
    (slice(20, 70), [-0.5, 1, -1], "blue"),
    (slice(70, None), [-0.3, 0.4, -0.1], "green"),
]

#Start from uniform noise, then overwrite each row range with a scaled
#Gaussian sample shifted by the offset of its cluster
V = numpy.random.rand(numExamples, numFeatures)
for clusterRows, clusterCentre, clusterColour in clusterSpecs:
    clusterSize = V[clusterRows, :].shape[0]
    V[clusterRows, :] = numpy.random.randn(clusterSize, numFeatures) * std
    V[clusterRows, 0:3] += numpy.array(clusterCentre)

#Centre the columns and pass the transposed array through normaliseArray
U = V - numpy.mean(V, 0)
U = Standardiser().normaliseArray(U.T).T

#Plot every cluster in its own colour on a 3d axis
fig = plt.figure(0)
ax = fig.add_subplot(111, projection='3d')
for clusterRows, clusterCentre, clusterColour in clusterSpecs:
    ax.scatter(U[clusterRows, 0], U[clusterRows, 1], U[clusterRows, 2], c=clusterColour)

UU = U.dot(U.T)
#s, X = numpy.linalg.eig(UU)
X, a, Y = numpy.linalg.svd(U)

#Now compute true cluster error
k = 3
kmeans = sklearn.cluster.KMeans(k)
kmeans.fit(U)
#Edge values at or below this tolerance are treated as "no transmission"
defaultTol = 10**-5
allEdges = decayGraph.getAllEdges()
edgeDecays = decayGraph.getEdgeValues(allEdges)

#-1 is no transmission, +1 is transmission
#The builtin int replaces the removed numpy.int alias (same dtype)
binaryEdges = numpy.array(edgeDecays > defaultTol, int)*2 - 1
logging.info("Total number of transmisions: " + str(numpy.sum(binaryEdges[binaryEdges==1])))

#Center the decays values
#logging.warn is a deprecated alias, so use logging.warning
logging.warning("Centering the edge decays for transmissions.")
meanDecay = numpy.mean(edgeDecays[binaryEdges==1])
logging.warning("Mean edge value :" + str(meanDecay))
#Reuse the mean computed above; edgeDecays has not changed since
edgeDecays[binaryEdges==1] = edgeDecays[binaryEdges==1] - meanDecay

#The vertices of decayGraph are replaced by their standardised values
logging.warning("About to modify (standardise) the vertices of the graph.")
preprocessor = Standardiser()
V = decayGraph.getVertexList().getVertices(decayGraph.getAllVertexIds())
V = preprocessor.standardiseArray(V)
decayGraph.getVertexList().setVertices(V)

#Take a subgraph
#edgeSampleSize = 20000
edgeSampleSize = 1000
transEdges = allEdges[0:edgeSampleSize, :]
transEdgeLabels = binaryEdges[0:edgeSampleSize]

#Build a graph over the same vertex list whose edge values are the +1/-1 labels
transGraph = SparseGraph(decayGraph.getVertexList(), False)
transGraph.addEdges(transEdges, transEdgeLabels)
logging.info("Created graph of binary transmissions")
logging.info("Transmission graph: " + str(transGraph))
dataDir = PathDefaults.getDataDir() + "metabolomic/"
X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData()

waveletStr = 'db4'
mode = "cpd"
maxLevel = 10
errors = numpy.zeros(maxLevel)
numFeatures = numpy.zeros(maxLevel)
level = 10
waveletStrs = ["haar", "db4", "db8"]

#Numbers of leading eigenvalues for which the cumulative fraction is computed
componentCounts = (1, 5, 10, 15, 20, 25, 50, 100, 150, 200)

#The variances are very similar across different wavelets
for waveletStr in waveletStrs:
    Xw = MetabolomicsUtils.getWaveletFeatures(X, waveletStr, level, mode)
    standardiser = Standardiser()
    Xw = standardiser.centreArray(Xw)

    #Eigenvalues of Xw Xw^T sorted in decreasing order
    w, V = numpy.linalg.eig(Xw.dot(Xw.T))
    w = numpy.flipud(numpy.sort(w))

    #Fraction of the eigenvalue sum captured by the first few eigenvalues
    variances = [numpy.sum(w[0:numComponents])/numpy.sum(w) for numComponents in componentCounts]
def saveResults(self, labelIndex):
    """
    Compute the results and save them for a particular hormone. Does so for
    an RBF SVR grid, a polynomial SVR grid and a Lasso grid, each written to
    its own results file through self.saveResult.

    :param labelIndex: index into self.YList and self.labelNames selecting
        the label to process
    """
    folds = 5
    #Second entry of the YList item is used to select the rows of X and ages
    exampleInds = self.YList[labelIndex][1]
    if isinstance(self.X, numpy.ndarray):
        X = self.X[exampleInds, :]
    else:
        #Otherwise self.X is indexed by label first
        X = self.X[labelIndex][exampleInds, :]

    #Append the ages as a final feature column
    X = numpy.c_[X, self.ages[exampleInds]]
    Y = self.YList[labelIndex][0]
    numExamples = X.shape[0]

    logging.debug("Shape of examples: " + str(X.shape))

    standardiserX = Standardiser()
    X = standardiserX.standardiseArray(X)

    #standardiserY is passed on to the metric and to saveResult
    standardiserY = Standardiser()
    Y = standardiserY.standardiseArray(Y)

    #We need to include the ROC curves
    indexList = Sampling.crossValidation(folds, numExamples)

    def splitFunction(trainX, trainY):
        return Sampling.crossValidation(folds, trainX.shape[0])

    #We need a metric to minimise
    def invMeanAUC(predY, testY):
        return 1 - self.meanAUC(predY, testY, labelIndex, standardiserY)

    metricMethods = [invMeanAUC]

    #Now create a learnerIterator based on the SVM
    #The builtin float replaces the removed numpy.float alias (same dtype)
    Cs = 2**numpy.arange(-8, 2, dtype=float)
    gammas = 2**numpy.arange(-10, 0, dtype=float)
    epsilons = 2**numpy.arange(-5, 0, dtype=float)

    fileName = self.resultsDir + self.labelNames[labelIndex] + "-svr_rbf-" + self.featuresName + ".dat"
    learnerIterator = []

    for C in Cs:
        for gamma in gammas:
            for epsilon in epsilons:
                learner = svm.SVR(C=C, gamma=gamma, epsilon=epsilon)
                #Expose fit under the learnModel name
                learner.learnModel = learner.fit
                learnerIterator.append(learner)

    self.saveResult(X, Y, indexList, splitFunction, learnerIterator, metricMethods, fileName, labelIndex, standardiserY)

    #Try the polynomial SVM
    fileName = self.resultsDir + self.labelNames[labelIndex] + "-svr_poly-" + self.featuresName + ".dat"
    degrees = numpy.array([2, 3])
    #Start from an empty list so that the RBF learners above are not
    #evaluated again and stored under the polynomial file name
    learnerIterator = []

    for C in Cs:
        for degree in degrees:
            for epsilon in epsilons:
                learner = svm.SVR(kernel='poly', C=C, degree=degree, epsilon=epsilon)
                learner.learnModel = learner.fit
                learnerIterator.append(learner)

    self.saveResult(X, Y, indexList, splitFunction, learnerIterator, metricMethods, fileName, labelIndex, standardiserY)

    #Now try Lasso and ElasticNet
    fileName = self.resultsDir + self.labelNames[labelIndex] + "-lasso-" + self.featuresName + ".dat"
    alphas = 2**numpy.arange(-9, 0, dtype=float)
    learnerIterator = []

    for alpha in alphas:
        learner = linear_model.Lasso(alpha = alpha)
        learner.learnModel = learner.fit
        learnerIterator.append(learner)

    self.saveResult(X, Y, indexList, splitFunction, learnerIterator, metricMethods, fileName, labelIndex, standardiserY)
class LinearSvmFGs(AbstractFunctionalPredictor):
    """
    A linear SVM which selects its C parameter and the number of wavelet
    features to keep using a grid search with stratified cross validation.
    """
    def __init__(self):
        super(LinearSvmFGs, self).__init__()
        self.linearSVM = LinearSVM()

    def learnModel(self, X, y, folds=3):
        """
        Train using the given examples and labels, however first conduct grid
        search in conjunction with cross validation to find the best parameters.
        We also conduct filtering with a variety of values.

        :param X: the examples, one per row
        :param y: the labels
        :param folds: the number of folds for the stratified cross validation
        """
        #Hard coding this is bad
        #The builtin float replaces the removed numpy.float alias (same dtype)
        Cs = 2**numpy.arange(-2, 7, dtype=float)
        #Cs = numpy.array([0.1, 2.0])

        #Identity test: comparing an index array to None with == is
        #elementwise and cannot be used as the condition of an if
        if self.waveletInds is None:
            self.waveletInds = numpy.arange(X.shape[1])

        nonWaveletInds = numpy.setdiff1d(numpy.arange(X.shape[1]), self.waveletInds)

        Xw = X[:, self.waveletInds]
        Xo = X[:, nonWaveletInds]

        #Rank the wavelet columns by decreasing sum of squares, computed
        #before the columns are standardised
        featureInds = numpy.flipud(numpy.argsort(numpy.sum(Xw**2, 0)))
        meanAUCs = numpy.zeros((Cs.shape[0], self.candidatesN.shape[0]))
        stdAUCs = numpy.zeros((Cs.shape[0], self.candidatesN.shape[0]))

        #Standardise the data
        Xw = Standardiser().standardiseArray(Xw)
        Xo = Standardiser().standardiseArray(Xo)

        for i in range(Cs.shape[0]):
            for j in range(self.candidatesN.shape[0]):
                self.linearSVM.setC(Cs[i])
                #Top ranked wavelet columns followed by all non wavelet columns
                newX = numpy.c_[Xw[:, featureInds[0:self.candidatesN[j]]], Xo]
                meanAUCs[i, j], stdAUCs[i, j] = self.linearSVM.evaluateStratifiedCv(newX, y, folds, metricMethod=Evaluator.auc)

        #Pick the (C, N) pair with the largest mean AUC
        (bestI, bestJ) = numpy.unravel_index(numpy.argmax(meanAUCs), meanAUCs.shape)
        self.linearSVM.setC(Cs[bestI])
        self.featureInds = numpy.r_[self.waveletInds[featureInds[0:self.candidatesN[bestJ]]], nonWaveletInds]
        logging.debug("Best learner found: " + str(self.linearSVM) + " N:" + str(self.candidatesN[bestJ]))

        #Train the final model on the selected columns of the original data
        self.standardiser = Standardiser()
        newX = self.standardiser.standardiseArray(X[:, self.featureInds])
        self.linearSVM.learnModel(newX, y)

    def predict(self, X):
        """
        Standardise the selected feature columns of X with the standardiser
        stored by learnModel and return the predictions of the linear SVM.
        """
        newX = self.standardiser.standardiseArray(X[:, self.featureInds])
        return self.linearSVM.predict(newX)

    @staticmethod
    def generate(waveletInds=None):
        """
        Generate a classifier which does a grid search.

        :param waveletInds: passed to setWaveletInds on each new classifier
        :returns: a function of no arguments creating a new LinearSvmFGs
        """
        def generatorFunc():
            linearSvm = LinearSvmFGs()
            linearSvm.setWaveletInds(waveletInds)
            return linearSvm
        return generatorFunc

    def setWeight(self, weight):
        """
        Set the weight of the underlying linear SVM.
        """
        self.linearSVM.setWeight(weight)
""" Compare the clustering methods in scikits.learn to see which ones are fastest and most accurate """ import time import numpy import sklearn.cluster as cluster from apgl.data.Standardiser import Standardiser import scipy.cluster.vq as vq numExamples = 10000 numFeatures = 500 X = numpy.random.rand(numExamples, numFeatures) X = Standardiser().standardiseArray(X) k = 10 numRuns = 10 maxIter = 100 tol = 10**-4 intialCentroids = X[0:k, :] #Quite fast print("Running scikits learn k means") clusterer = cluster.KMeans(k=k, n_init=numRuns, tol=tol, init=intialCentroids, max_iter=maxIter) start = time.clock()
def __init__(self, learningAlg, windowSize, preprocessor=None):
    """
    Store the learning algorithm, window size and preprocessor.

    :param learningAlg: the learning algorithm to use
    :param windowSize: the size of the window
    :param preprocessor: the preprocessor to use; when None (the default) a
        new Standardiser is created for this instance
    """
    self.windowSize = windowSize
    self.learningAlg = learningAlg
    #A default written as Standardiser() in the signature is evaluated once
    #when the function is defined, so every instance would share the same
    #preprocessor object. Create a separate one per instance instead.
    if preprocessor is None:
        preprocessor = Standardiser()
    self.preprocessor = preprocessor
    self.printStep = 50
numExamples = 100
numFeatures = 3
std = 0.1

#Row ranges of the three clusters, the offsets added to their first three
#columns and the colours they are plotted in (kept in generation order so
#that the random draws happen in the same sequence)
clusterRowsList = [slice(0, 20), slice(20, 70), slice(70, None)]
clusterOffsets = [[1, 0.2, -1], [-0.5, 1, -1], [-0.3, 0.4, -0.1]]
clusterColours = ["red", "blue", "green"]

#Uniform noise first, then each row range is replaced by a scaled Gaussian
#sample which is shifted by the offset of its cluster
V = numpy.random.rand(numExamples, numFeatures)
for rowRange, offset in zip(clusterRowsList, clusterOffsets):
    numRows = V[rowRange, :].shape[0]
    V[rowRange, :] = numpy.random.randn(numRows, numFeatures)*std
    V[rowRange, 0:3] += numpy.array(offset)

#Subtract the column means, then apply normaliseArray to the transpose
U = V - numpy.mean(V, 0)
U = Standardiser().normaliseArray(U.T).T

#3d scatter plot with one colour per cluster
fig = plt.figure(0)
ax = fig.add_subplot(111, projection='3d')
for rowRange, colour in zip(clusterRowsList, clusterColours):
    ax.scatter(U[rowRange, 0], U[rowRange, 1], U[rowRange, 2], c=colour)

UU = U.dot(U.T)
#s, X = numpy.linalg.eig(UU)
X, a, Y = numpy.linalg.svd(U)

#Now compute true cluster error
k = 3
kmeans = sklearn.cluster.KMeans(k)
kmeans.fit(U)
def saveResults(self, leafRankGenerators, standardise=True):
    """
    Compute the results for a particular hormone. Does so for all leafranks.

    The hormone (index 0) and the label column (index 2) are hard coded. For
    every entry of leafRankGenerators a TreeRankForest is evaluated with
    stratified cross validation and the mean AUC is printed; nothing is
    written to disk by this method.

    :param leafRankGenerators: a sequence whose items hold a name at index 1,
        which is used to build the file name
    :param standardise: whether to standardise the examples first
    """
    j = 0
    nonNaInds = self.YList[j][1]
    hormoneInd = self.hormoneInds[j]
    k = 2

    if isinstance(self.X, numpy.ndarray):
        X = self.X[nonNaInds, :]
    else:
        #Otherwise self.X is indexed by hormone first
        X = self.X[j][nonNaInds, :]

    #Append the ages as a final feature column
    X = numpy.c_[X, self.ages[nonNaInds]]
    if standardise:
        X = Standardiser().standardiseArray(X)
    Y = hormoneInd[k]

    logging.debug("Shape of examples: " + str(X.shape))
    logging.debug("Distribution of labels: " + str(numpy.bincount(Y)))

    #Go through all the leafRanks
    for leafRankGenerator in leafRankGenerators:
        #Compute TreeRankForest here
        fileName = self.resultsDir + "TreeRankForest-" + self.hormoneNames[j] + "_" + str(k) + "-" + leafRankGenerator[1] + "-" + self.featuresName + ".dat"

        try:
            logging.debug("Computing file " + fileName)
            #NOTE(review): the forest is always built from
            #self.leafRankGenerators[0][0] and not from the current item of
            #the leafRankGenerators argument - confirm this is intended
            treeRankForest = TreeRankForest(self.leafRankGenerators[0][0])
            treeRankForest.setMaxDepth(10)
            treeRankForest.setNumTrees(5)
            #Setting this low definitely helps
            treeRankForest.setFeatureSize(0.05)
            #The following 2 lines definitely improve stability and the AUC
            treeRankForest.setSampleSize(1.0)
            #Setting this to true results in slightly worse results
            treeRankForest.setSampleReplace(True)
            mean, var = treeRankForest.evaluateStratifiedCv(X, Y, self.folds, metricMethod=Evaluator.auc)
            print(mean)
        except Exception:
            #Log and propagate the error to the caller
            logging.debug("Caught an error in the code ... skipping")
            raise
        else:
            #NOTE(review): this runs after every successful evaluation, yet no
            #file existence check is made here - the message looks like a
            #leftover from a removed check
            logging.debug("File exists: " + fileName)
    return
#The set of edges indexed by zeros is the contact graph #The ones indexed by 1 is the infection graph edgeTypeIndex1 = 0 edgeTypeIndex2 = 1 sGraphContact = graph.getSparseGraph(edgeTypeIndex1) sGraphInfect = graph.getSparseGraph(edgeTypeIndex2) numpy.set_printoptions(precision=3, suppress=True) logging.info("Statistics over Verticies ") logging.info("===============================") #Other measures : infection period of tree, infection types #PCA to find variance of data, correlation matrix, center matrix X = graph.getVertexList().copy().getVertices(list(range(0, graph.getNumVertices()))) standardiser = Standardiser() X = standardiser.standardiseArray(X) centerArray = standardiser.getCentreVector() C = numpy.dot(X.T, X) print((Latex.array2DToRows(numpy.reshape(centerArray, (5, 8))))) C2 = numpy.abs(C - numpy.eye(X.shape[1])) C2[numpy.tril_indices(C.shape[0])] = 0 inds = numpy.flipud(numpy.argsort(C2, None)) numEls = 10 for i in range(numEls): corr = "%.3f" % C[numpy.unravel_index(inds[i], C2.shape)]