Example #1
    def testScaleArray(self):
        numExamples = 10
        numFeatures = 3
        X = numpy.random.rand(numExamples, numFeatures)

        preprocessor = Standardiser()
        Xs = preprocessor.scaleArray(X)

        minVals = numpy.amin(Xs, 0)
        maxVals = numpy.amax(Xs, 0)

        tol = 10**-6
        self.assertTrue(numpy.linalg.norm(minVals + numpy.ones(X.shape[1])) <= tol)
        self.assertTrue(numpy.linalg.norm(maxVals - numpy.ones(X.shape[1])) <= tol)

        #Now test standardisation on another matrix

        X = numpy.array([[2, 1], [-1, -2], [0.6, 0.3]])
        preprocessor = Standardiser()
        Xs = preprocessor.scaleArray(X)

        X2 = numpy.array([[2, 1], [-1, -2], [0.6, 0.3], [4, 2]])
        Xs2 = preprocessor.scaleArray(X2)

        self.assertTrue(numpy.linalg.norm(Xs2[0:3, :] - Xs) < tol)
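
A minimal NumPy sketch of the behaviour the test above checks, assuming scaleArray fits the per-column minimum and maximum on its first call and reuses them afterwards; MinMaxSketch is a hypothetical stand-in for apgl's Standardiser.

import numpy

class MinMaxSketch(object):
    def scaleArray(self, X):
        if not hasattr(self, "minVals"):
            #Fit the per-column range on the first call only
            self.minVals = numpy.amin(X, 0)
            self.maxVals = numpy.amax(X, 0)
        #Map each column linearly so the fitted min -> -1 and max -> +1
        return 2*(X - self.minVals)/(self.maxVals - self.minVals) - 1

X = numpy.random.rand(10, 3)
scaler = MinMaxSketch()
Xs = scaler.scaleArray(X)
assert numpy.allclose(numpy.amin(Xs, 0), -numpy.ones(3))
assert numpy.allclose(numpy.amax(Xs, 0), numpy.ones(3))

#A later call reuses the fitted parameters, as the test above asserts
X2 = numpy.r_[X, 2*numpy.random.rand(1, 3)]
assert numpy.allclose(scaler.scaleArray(X2)[0:10, :], Xs)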
Example #2
 def matrixSimilarity(self, V1, V2): 
     """
     Compute a vertex similarity matrix C, such that the ijth entry is the matching 
     score between V1_i and V2_j, where larger is a better match. 
     """  
     X = numpy.r_[V1, V2]
     standardiser = Standardiser()
     X = standardiser.normaliseArray(X)
     
     V1 = X[0:V1.shape[0], :]
     V2 = X[V1.shape[0]:, :]
     
     #print(X)
      
     #Extend arrays with zeros to make them the same size
     #if V1.shape[0] < V2.shape[0]: 
     #    V1 = Util.extendArray(V1, V2.shape, numpy.min(V1))
     #elif V2.shape[0] < V1.shape[0]: 
     #    V2 = Util.extendArray(V2, V1.shape, numpy.min(V2))
       
     #Compute C from the distances between vertices
     #The resulting similarity is bounded by 1
     D = Util.distanceMatrix(V1, V2)
     maxD = numpy.max(D)
     minD = numpy.min(D)
     if (maxD-minD) != 0: 
         C = (maxD - D)/(maxD-minD)
     else: 
         C = numpy.ones((V1.shape[0], V2.shape[0])) 
         
     return C
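
A self-contained sketch of the distance-to-similarity conversion above, with scipy.spatial.distance.cdist standing in for Util.distanceMatrix (an assumption: the apgl helper is taken to return pairwise Euclidean distances).

import numpy
from scipy.spatial.distance import cdist

def similarityFromDistances(V1, V2):
    D = cdist(V1, V2)                      #Pairwise distances, shape (n1, n2)
    maxD, minD = numpy.max(D), numpy.min(D)
    if (maxD - minD) != 0:
        #1 for the closest pair of vertices, 0 for the furthest
        return (maxD - D)/(maxD - minD)
    else:
        return numpy.ones((V1.shape[0], V2.shape[0]))

C = similarityFromDistances(numpy.random.rand(5, 3), numpy.random.rand(4, 3))
assert C.shape == (5, 4) and C.min() >= 0 and C.max() <= 1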
Example #3
    def cluster(self, graph):
        """
        Take a graph and cluster using the method in "On spectral clustering: analysis
        and an algorithm" by Ng et al., 2001.

        :param graph: the graph to cluster
        :type graph: :class:`apgl.graph.AbstractMatrixGraph`

        :returns:  An array of size graph.getNumVertices() of cluster membership 
        """
        L = graph.normalisedLaplacianSym()

        omega, Q = numpy.linalg.eig(L)
        inds = numpy.argsort(omega)

        #First normalise rows, then columns
        standardiser = Standardiser()
        V = standardiser.normaliseArray(Q[:, inds[0:self.k]].T).T
        V = vq.whiten(V)
        #Using kmeans2 here seems to result in a high variance
        #in the quality of clustering. Therefore stick to kmeans
        centroids, clusters = vq.kmeans(V, self.k, iter=self.numIterKmeans)
        clusters, distortion = vq.vq(V, centroids)

        return clusters
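
A rough standalone sketch of the same Ng et al. pipeline for a dense symmetric weight matrix W, using only numpy and scipy in place of the apgl graph object; the function name and the toy W below are illustrative.

import numpy
import scipy.cluster.vq as vq

def spectralClusterSketch(W, k, numIterKmeans=20):
    d = W.sum(0)
    Dinv = numpy.diag(1.0/numpy.sqrt(d + (d == 0)))       #Guard against isolated vertices
    L = numpy.eye(W.shape[0]) - Dinv.dot(W).dot(Dinv)     #Symmetric normalised Laplacian
    omega, Q = numpy.linalg.eigh(L)
    V = Q[:, numpy.argsort(omega)[0:k]]                   #k smallest eigenvectors
    rowNorms = numpy.sqrt(numpy.sum(V**2, 1))
    V = V/(rowNorms + (rowNorms == 0))[:, numpy.newaxis]  #Normalise the rows
    V = vq.whiten(V)
    centroids, distortion = vq.kmeans(V, k, iter=numIterKmeans)
    clusters, distortion = vq.vq(V, centroids)
    return clusters

W = numpy.random.rand(30, 30)
W = (W + W.T)/2                                           #A dense symmetric toy graph
print(spectralClusterSketch(W, 3))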
Example #4
    def loadData():
        """
        Return the raw spectra and the MDS transformed data as well as the DataFrame
        for the MDS data. 
        """
        utilsLib = importr('utils')

        dataDir = PathDefaults.getDataDir() +  "metabolomic/"
        fileName = dataDir + "data.RMN.total.6.txt"
        df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
        maxNMRIndex = 951
        X = df.rx(robjects.IntVector(range(1, maxNMRIndex)))
        X = numpy.array(X).T

        #Load age and standardise (missing values are assigned the mean)
        ages = numpy.array(df.rx(robjects.StrVector(["Age"]))).ravel()
        meanAge = numpy.mean(ages[numpy.logical_not(numpy.isnan(ages))])
        ages[numpy.isnan(ages)] = meanAge
        ages = Standardiser().standardiseArray(ages)

        Xs = X.copy()
        standardiser = Standardiser()
        Xs = standardiser.standardiseArray(X)

        fileName = dataDir + "data.sportsmen.log.AP.1.txt"
        df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
        maxNMRIndex = 419
        X2 = df.rx(robjects.IntVector(range(1, maxNMRIndex)))
        X2 = numpy.array(X2).T

        #Load the OPLS corrected files
        fileName = dataDir + "IGF1.log.OSC.1.txt"
        df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
        minNMRIndex = 22
        maxNMRIndex = 441
        Xopls1 = df.rx(robjects.IntVector(range(minNMRIndex, maxNMRIndex)))
        Xopls1 = numpy.array(Xopls1).T

        fileName = dataDir + "cort.log.OSC.1.txt"
        df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
        minNMRIndex = 20
        maxNMRIndex = 439
        Xopls2 = df.rx(robjects.IntVector(range(minNMRIndex, maxNMRIndex)))
        Xopls2 = numpy.array(Xopls2).T

        fileName = dataDir + "testo.log.OSC.1.txt"
        df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
        minNMRIndex = 22
        maxNMRIndex = 441
        Xopls3 = df.rx(robjects.IntVector(range(minNMRIndex, maxNMRIndex)))
        Xopls3 = numpy.array(Xopls3).T

        #Let's load all the label data here
        labelNames = MetabolomicsUtils.getLabelNames()
        YList = MetabolomicsUtils.createLabelList(df, labelNames)
        
        return X, X2, Xs, (Xopls1, Xopls2, Xopls3), YList, ages, df
Example #5
class DecisionTreeF(AbstractFunctionalPredictor):
    def __init__(self):
        super(DecisionTreeF, self).__init__()
        self.decisionTree = DecisionTree()

    def learnModel(self, X, y, folds=3):
        """
        Train using the given examples and labels, however first conduct grid
        search in conjunction with cross validation to find the best parameters.
        We also conduct filtering with a variety of values.
        """
        if self.waveletInds is None:
            self.waveletInds = numpy.arange(X.shape[1])

        nonWaveletInds = numpy.setdiff1d(numpy.arange(X.shape[1]),  self.waveletInds)

        Xw = X[:, self.waveletInds]
        Xo = X[:, nonWaveletInds]

        featureInds = numpy.flipud(numpy.argsort(numpy.sum(Xw**2, 0)))
        meanAUCs = numpy.zeros(self.candidatesN.shape[0])
        stdAUCs = numpy.zeros(self.candidatesN.shape[0])

        #Standardise the data
        Xw = Standardiser().standardiseArray(Xw)
        Xo = Standardiser().standardiseArray(Xo)

        for i in range(self.candidatesN.shape[0]):
            newX = numpy.c_[Xw[:, featureInds[0:self.candidatesN[i]]], Xo]
            meanAUCs[i], stdAUCs[i] = self.decisionTree.evaluateStratifiedCv(newX, y, folds, metricMethod=Evaluator.auc)

        bestI = numpy.argmax(meanAUCs)
        self.featureInds = numpy.r_[self.waveletInds[featureInds[0:self.candidatesN[bestI]]], nonWaveletInds]
        logging.debug("Best learner found: " + str(self.decisionTree) + " N:" + str(self.candidatesN[bestI]))

        self.standardiser = Standardiser()
        newX = self.standardiser.standardiseArray(X[:, self.featureInds])
        self.decisionTree.learnModel(newX, y)

    def predict(self, X):
        newX = self.standardiser.standardiseArray(X[:, self.featureInds])
        return self.decisionTree.predict(newX)

    @staticmethod
    def generate(waveletInds=None):
        """
        Generate a classifier which does a grid search.
        """
        def generatorFunc():
            decisionTree = DecisionTreeF()
            decisionTree.setWaveletInds(waveletInds)
            return decisionTree
        return generatorFunc

    def setWeight(self, weight):
        self.decisionTree.setWeight(weight)
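
A small self-contained sketch of the filtering step used in learnModel above: wavelet features are ranked by their energy (column sums of squares) and the top N columns are kept alongside the non-wavelet features, with cross validation left to choose N. The toy data and candidate list here are illustrative.

import numpy

X = numpy.random.rand(20, 10)
waveletInds = numpy.arange(6)                                     #First 6 columns are "wavelet" features
nonWaveletInds = numpy.setdiff1d(numpy.arange(X.shape[1]), waveletInds)
Xw, Xo = X[:, waveletInds], X[:, nonWaveletInds]

featureInds = numpy.flipud(numpy.argsort(numpy.sum(Xw**2, 0)))    #Highest energy first
for N in [2, 4, 6]:                                               #candidatesN
    newX = numpy.c_[Xw[:, featureInds[0:N]], Xo]                  #Top N wavelet + other features
    #...cross-validate a learner on newX here and keep the best N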
Example #6
    def testUnstandardiseArray(self):
        numExamples = 10
        numFeatures = 3

        tol = 10**-6
        preprocessor = Standardiser()

        #Test an everyday matrix
        X = numpy.random.rand(numExamples, numFeatures)
        Xs = preprocessor.standardiseArray(X)
        X2 = preprocessor.unstandardiseArray(Xs)

        self.assertTrue(numpy.linalg.norm(X2 - X) < tol)
Example #7
    def testLearnModel(self):
        numExamples = 50
        numFeatures = 200
        preprocessor = Standardiser()
        X = numpy.random.randn(numExamples, numFeatures)
        X = preprocessor.standardiseArray(X)
        c = numpy.random.rand(numFeatures)
        y = numpy.dot(X, c)

        tol = 0.05
        kernel = LinearKernel()
        lmbda = 0.0001
        predictor = KernelShiftRegression(kernel, lmbda)

        alpha, b = predictor.learnModel(X, y)
        predY = predictor.predict(X)

        self.assertTrue(Evaluator.rootMeanSqError(y, predY) < tol)

        #Try increasing y
        y = y + 5
        lmbda = 0.2
        predictor = KernelShiftRegression(kernel, lmbda)
        alpha, b = predictor.learnModel(X, y)
        predY = predictor.predict(X)

        self.assertTrue(numpy.abs(b - 5) < 0.1)
        self.assertTrue(Evaluator.rootMeanSqError(y, predY) < 0.1)

        #Try making prediction for multilabel Y
        C = numpy.random.rand(numFeatures, numFeatures)
        Y = numpy.dot(X, C)

        predictor = KernelShiftRegression(kernel, lmbda)
        alpha, b = predictor.learnModel(X, Y)
        predY = predictor.predict(X)

        self.assertTrue(Evaluator.rootMeanSqError(Y, predY) < 0.1)

        #Now, shift the data 
        s = numpy.random.rand(numFeatures)
        Y = Y + s

        predictor = KernelShiftRegression(kernel, lmbda)
        alpha, b = predictor.learnModel(X, Y)
        predY = predictor.predict(X)

        self.assertTrue(numpy.linalg.norm(b - s) < 0.1)
        self.assertTrue(Evaluator.rootMeanSqError(Y, predY) < 0.1)
Example #8
 def testStandardiseArray(self):
     numExamples = 10 
     numFeatures = 3 
     
     preprocessor = Standardiser()
     
     #Test an everyday matrix 
     X = numpy.random.rand(numExamples, numFeatures)
     Xs = preprocessor.standardiseArray(X)
     
     self.assertAlmostEquals(numpy.sum(Xs), 0, places=3)
     self.assertAlmostEquals(numpy.sum(Xs*Xs), numFeatures, places=3)
     
     #Now, test on a portion of a matrix 
     Xss = preprocessor.standardiseArray(X[1:5, :])
     self.assertTrue((Xss == Xs[1:5, :]).all())
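
A plain NumPy sketch of the invariants this test checks, assuming standardiseArray centres each column and then scales it to unit Euclidean norm (so the total sum of squares equals numFeatures).

import numpy

X = numpy.random.rand(10, 3)
Xc = X - numpy.mean(X, 0)                    #Zero mean per column
Xs = Xc/numpy.sqrt(numpy.sum(Xc**2, 0))      #Unit norm per column

assert abs(numpy.sum(Xs)) < 10**-6
assert abs(numpy.sum(Xs*Xs) - X.shape[1]) < 10**-6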
Example #9
    def learnModel(self, X, y, folds=3):
        """
        Train using the given examples and labels, however first conduct grid
        search in conjunction with cross validation to find the best parameters.
        We also conduct filtering with a variety of values.
        """
        if self.waveletInds is None:
            self.waveletInds = numpy.arange(X.shape[1])

        nonWaveletInds = numpy.setdiff1d(numpy.arange(X.shape[1]),  self.waveletInds)

        Xw = X[:, self.waveletInds]
        Xo = X[:, nonWaveletInds]

        featureInds = numpy.flipud(numpy.argsort(numpy.sum(Xw**2, 0)))
        meanAUCs = numpy.zeros(self.candidatesN.shape[0])
        stdAUCs = numpy.zeros(self.candidatesN.shape[0])

        #Standardise the data
        Xw = Standardiser().standardiseArray(Xw)
        Xo = Standardiser().standardiseArray(Xo)

        for i in range(self.candidatesN.shape[0]):
            newX = numpy.c_[Xw[:, featureInds[0:self.candidatesN[i]]], Xo]
            meanAUCs[i], stdAUCs[i] = self.decisionTree.evaluateStratifiedCv(newX, y, folds, metricMethod=Evaluator.auc)

        bestI = numpy.argmax(meanAUCs)
        self.featureInds = numpy.r_[self.waveletInds[featureInds[0:self.candidatesN[bestI]]], nonWaveletInds]
        logging.debug("Best learner found: " + str(self.decisionTree) + " N:" + str(self.candidatesN[bestI]))

        self.standardiser = Standardiser()
        newX = self.standardiser.standardiseArray(X[:, self.featureInds])
        self.decisionTree.learnModel(newX, y)
Example #10
 def testCentreArray(self):
     numExamples = 10 
     numFeatures = 3 
     
     preprocessor = Standardiser()
     
     #Test an everyday matrix 
     X = numpy.random.rand(numExamples, numFeatures)
     Xc = preprocessor.centreArray(X)
     centreV = preprocessor.getCentreVector()
     self.assertAlmostEquals(numpy.sum(Xc), 0, places=3)
     self.assertTrue((X-centreV == Xc).all())
     
     #Now take the first 3 rows of X, centre them and compare to the centred X
     Xs = X[0:3, :]
     Xsc = preprocessor.centreArray(Xs)
     self.assertTrue((Xsc == Xc[0:3, :]).all())
Example #11
 def testNormaliseArray(self):
     numExamples = 10 
     numFeatures = 3 
     
     preprocessor = Standardiser()
     
     #Test an everyday matrix 
     X = numpy.random.rand(numExamples, numFeatures)
     Xn = preprocessor.normaliseArray(X)
     normV = preprocessor.getNormVector()
     self.assertAlmostEquals(numpy.sum(Xn*Xn), numFeatures, places=3)
     
     norms = numpy.sum(Xn*Xn, 0)
     
     for i in range(0, norms.shape[0]): 
         self.assertAlmostEquals(norms[i], 1, places=3)
         
     self.assertTrue((X/normV == Xn).all())
     
     #Zero one column 
     preprocessor = Standardiser()
     X[:, 1] = 0 
     Xn = preprocessor.normaliseArray(X)
     normV = preprocessor.getNormVector()
     self.assertAlmostEquals(numpy.sum(Xn*Xn), numFeatures-1, places=3)
     self.assertTrue((X/normV == Xn).all())
     
     #Now take the first 3 rows of X, normalise and compare to the normalised X
     Xs = X[0:3, :]
     Xsn = preprocessor.normaliseArray(Xs)
     self.assertTrue((Xsn == Xn[0:3, :]).all())
Example #12
    def testClassify(self):
        numExamples = 10
        numFeatures = 20

        X = numpy.random.randn(numExamples, numFeatures)
        y = numpy.sign(numpy.random.randn(numExamples))
        logging.debug(y)

        preprocessor = Standardiser()
        X = preprocessor.standardiseArray(X)

        tol = 10**-5
        lmbda = 1.0
        kernel = LinearKernel()

        predictor = KernelRidgeRegression(kernel, lmbda)
        predictor.learnModel(X, y)
        classY, predY = predictor.classify(X)

        self.assertTrue(numpy.logical_or(classY == 1, classY == -1).all() ) 
Example #13
    def learnModel(self, X, y, folds=3):
        """
        Train using the given examples and labels, however first conduct grid
        search in conjunction with cross validation to find the best parameters.
        We also conduct filtering with a variety of values. 
        """
        #Hard coding this is bad
        Cs = 2**numpy.arange(-2, 7, dtype=numpy.float)
        #Cs = numpy.array([0.1, 2.0])

        if self.waveletInds is None:
            self.waveletInds = numpy.arange(X.shape[1])

        nonWaveletInds = numpy.setdiff1d(numpy.arange(X.shape[1]),  self.waveletInds)

        Xw = X[:, self.waveletInds]
        Xo = X[:, nonWaveletInds]

        featureInds = numpy.flipud(numpy.argsort(numpy.sum(Xw**2, 0)))
        meanAUCs = numpy.zeros((Cs.shape[0], self.candidatesN.shape[0]))
        stdAUCs = numpy.zeros((Cs.shape[0], self.candidatesN.shape[0]))

        #Standardise the data
        Xw = Standardiser().standardiseArray(Xw)
        Xo = Standardiser().standardiseArray(Xo)

        for i in range(Cs.shape[0]):
            for j in range(self.candidatesN.shape[0]):
                self.linearSVM.setC(Cs[i])
                newX = numpy.c_[Xw[:, featureInds[0:self.candidatesN[j]]], Xo]
                meanAUCs[i, j], stdAUCs[i, j] = self.linearSVM.evaluateStratifiedCv(newX, y, folds, metricMethod=Evaluator.auc)

        (bestI, bestJ) = numpy.unravel_index(numpy.argmax(meanAUCs), meanAUCs.shape)
        self.linearSVM.setC(Cs[bestI])
        self.featureInds = numpy.r_[self.waveletInds[featureInds[0:self.candidatesN[bestJ]]], nonWaveletInds]
        logging.debug("Best learner found: " + str(self.linearSVM) + " N:" + str(self.candidatesN[bestJ]))

        self.standardiser = Standardiser()
        newX = self.standardiser.standardiseArray(X[:, self.featureInds])
        self.linearSVM.learnModel(newX, y)
Example #14
    def testLearnModel2(self):
        numExamples = 200
        numFeatures = 100

        X = numpy.random.randn(numExamples, numFeatures)
        y = numpy.random.randn(numExamples)

        preprocessor = Standardiser()
        X = preprocessor.standardiseArray(X)

        tol = 10**-3
        kernel = LinearKernel()

        #Try using a low-rank matrix 
        lmbda = 0.001
        predictor = KernelShiftRegression(kernel, lmbda)

        alpha, b = predictor.learnModel(X, y)
        predY = predictor.predict(X)

        logging.debug((numpy.linalg.norm(y)))
        logging.debug((numpy.linalg.norm(predY - y)))
Example #15
    def __init__(self, graph, predictor):
        """
        Create the class by reading a graph with labelled edges. Instantiate the predictor
        and create a preprocessor to standardise examples to have zero mean and unit variance.
        """
        self.graph = graph
        self.predictor = predictor
        self.errorMethod = Evaluator.balancedError

        #Note: We modify the vertices of the input graph!!!!
        logging.warn("About to modify (normalise) the vertices of the graph.")
        self.preprocessor = Standardiser()
        V = graph.getVertexList().getVertices(graph.getAllVertexIds())
        V = self.preprocessor.normaliseArray(V)
        graph.getVertexList().setVertices(V)
Example #16
    def learnModel(self, X, y, folds=3):
        """
        Train using the given examples and labels, however first conduct grid
        search in conjunction with cross validation to find the best parameters.
        We also conduct filtering with a variety of values.
        """
        #Hard coding this is bad
        Cs = 2**numpy.arange(-2, 6, dtype=numpy.float)
        gammas = 2**numpy.arange(-5, 0, dtype=numpy.float)

        if self.waveletInds is None:
            self.waveletInds = numpy.arange(X.shape[1])

        nonWaveletInds = numpy.setdiff1d(numpy.arange(X.shape[1]),  self.waveletInds)

        Xw = X[:, self.waveletInds]
        Xo = X[:, nonWaveletInds]

        featureInds = numpy.flipud(numpy.argsort(numpy.sum(Xw**2, 0)))
        meanAUCs = numpy.zeros((Cs.shape[0], gammas.shape[0], self.candidatesN.shape[0]))
        stdAUCs = numpy.zeros((Cs.shape[0], gammas.shape[0], self.candidatesN.shape[0]))

        #Standardise the data
        Xw = Standardiser().standardiseArray(Xw)
        Xo = Standardiser().standardiseArray(Xo)

        for i in range(Cs.shape[0]):
            for j in range(gammas.shape[0]): 
                for k in range(self.candidatesN.shape[0]):
                    self.SVC.setC(Cs[i])
                    self.SVC.setGamma(gammas[j])
                    newX = numpy.c_[Xw[:, featureInds[0:self.candidatesN[k]]], Xo]
                    meanAUCs[i, j, k], stdAUCs[i, j, k] = self.SVC.evaluateStratifiedCv(newX, y, folds, metricMethod=Evaluator.auc)

        (bestI, bestJ, bestK) = numpy.unravel_index(numpy.argmax(meanAUCs), meanAUCs.shape)
        self.SVC.setC(Cs[bestI])
        self.SVC.setGamma(gammas[bestJ])
        self.featureInds = numpy.r_[self.waveletInds[featureInds[0:self.candidatesN[bestK]]], nonWaveletInds]
        logging.debug("Best learner found: " + str(self.SVC) + " N:" + str(self.candidatesN[bestK]))

        self.standardiser = Standardiser()
        newX = self.standardiser.standardiseArray(X[:, self.featureInds])
        self.SVC.learnModel(newX, y)
Example #17
    def __init__(self, examplesFileName):
        """
        Create the class by reading examples from a Matlab file. Instantiate the SVM
        and create a preprocessor to standardise examples to have zero mean and unit variance.
        """
        self.examplesList = ExamplesList.readFromFile(examplesFileName)
        self.examplesList.setDefaultExamplesName("X")
        self.examplesList.setLabelsName("y")

        (freqs, items) = Util.histogram(self.examplesList.getSampledDataField("y").ravel())
        logging.info("Distribution of labels: " + str((freqs, items)))
        logging.info("The base error rate is " + str(float(min(freqs))/self.examplesList.getNumExamples()))
        
        self.classifier = LibSVM()
        self.errorMethod = Evaluator.balancedError

        self.preprocessor = Standardiser()
        X = self.preprocessor.standardiseArray(self.examplesList.getDataField(self.examplesList.getDefaultExamplesName()))
        self.examplesList.overwriteDataField(self.examplesList.getDefaultExamplesName(), X)
Example #18
class SvmEgoSimulator(AbstractDiffusionSimulator):
    """
    A class which combines SVM classification with the EgoSimulation. There are methods
    to run modelSelection, train the SVM and then run the simulation. The simulation itself
    is run using EgoSimulator. 
    """
    def __init__(self, examplesFileName):
        """
        Create the class by reading examples from a Matlab file. Instantiate the SVM
        and create a preprocessor to standardise examples to have zero mean and unit variance.
        """
        self.examplesList = ExamplesList.readFromFile(examplesFileName)
        self.examplesList.setDefaultExamplesName("X")
        self.examplesList.setLabelsName("y")

        (freqs, items) = Util.histogram(self.examplesList.getSampledDataField("y").ravel())
        logging.info("Distribution of labels: " + str((freqs, items)))
        logging.info("The base error rate is " + str(float(min(freqs))/self.examplesList.getNumExamples()))
        
        self.classifier = LibSVM()
        self.errorMethod = Evaluator.balancedError

        self.preprocessor = Standardiser()
        X = self.preprocessor.standardiseArray(self.examplesList.getDataField(self.examplesList.getDefaultExamplesName()))
        self.examplesList.overwriteDataField(self.examplesList.getDefaultExamplesName(), X)

    def getPreprocessor(self):
        """
        Returns the preprocessor
        """
        return self.preprocessor

    def sampleExamples(self, sampleSize):
        """
        This function exists so that we can sample the same examples used in model
        selection and exclude them when running evaluateClassifier. 
        """
        self.examplesList.randomSubData(sampleSize)

    def modelSelection(self, Cs, kernel, kernelParams, errorCosts, folds, sampleSize):
        """
        Perform model selection using an SVM
        """
        Parameter.checkInt(sampleSize, 0, self.examplesList.getNumExamples())
        Parameter.checkInt(folds, 0, sampleSize)
        Parameter.checkString(kernel, ["linear", "polynomial", "gaussian"])
        Parameter.checkList(Cs, Parameter.checkFloat, [0.0, float("inf")])
        Parameter.checkList(errorCosts, Parameter.checkFloat, [0.0, float("inf")])

        #Perform model selection
        self.examplesList.randomSubData(sampleSize)
        (freqs, items) = Util.histogram(self.examplesList.getSampledDataField("y").ravel())
        logging.info("Using "  + str(sampleSize) + " examples for model selection")
        logging.info("Distribution of labels: " + str((freqs, items)))
        logging.info("List of Cs " + str(Cs))
        logging.info("List of kernels " + str(kernel))
        logging.info("List of kernelParams " + str(kernelParams))
        logging.info("List of errorCosts " + str(errorCosts))

        CVal, kernelParamVal, errorCost, error = self.classifier.cvModelSelection(self.examplesList, Cs, kernelParams, kernel, folds, errorCosts, self.errorMethod)
        logging.info("Model selection returned C = " + str(CVal) + " kernelParam = " + str(kernelParamVal) + " errorCost = " + str(errorCost)  + " with error " + str(error))
        return CVal, kernelParamVal, errorCost, error

    def evaluateClassifier(self, CVal, kernel, kernelParamVal, errorCost, folds, sampleSize, invert=True):
        """
        Evaluate the SVM with the given parameters. Often model selection is done before this step
        and in that case, invert=True uses a sample excluding those used for model selection. 
        """
        Parameter.checkFloat(CVal, 0.0, float('inf'))
        Parameter.checkFloat(errorCost, 0.0, float('inf'))
        Parameter.checkString(kernel, ["linear", "polynomial", "gaussian"])
        
        if kernel == "gaussian":
            Parameter.checkFloat(kernelParamVal, 0.0, float('inf'))
        elif kernel == "polynomial":
            Parameter.checkInt(kernelParamVal, 2, float('inf'))

        Parameter.checkInt(folds, 0, sampleSize)
        Parameter.checkInt(sampleSize, 0, self.examplesList.getNumExamples())

        if invert:
            allIndices = numpy.array(list(range(0, self.examplesList.getNumExamples())))
            testIndices = numpy.setdiff1d(allIndices, self.examplesList.getPermutationIndices())
            testIndices = numpy.random.permutation(testIndices)[0:sampleSize]
        else:
            testIndices = Util.sampleWithoutReplacement(sampleSize, self.examplesList.getNumExamples())

        logging.info("Using " + str(testIndices.shape[0]) + " examples for SVM evaluation")

        self.examplesList.setPermutationIndices(testIndices)
        self.classifier.setParams(C=CVal, kernel=kernel, kernelParam=kernelParamVal)
        self.classifier.setErrorCost(errorCost)
        
        (means, vars) = self.classifier.evaluateCv(self.examplesList, folds)

        logging.info("--- Classification evaluation ---")
        logging.info("Error on " + str(testIndices.shape[0]) + " examples is " + str(means[0]) + "(" + str(vars[0]) + ")")
        logging.info("Sensitivity (recall = TP/(TP+FN)): " + str(means[1])  + "(" + str(vars[1]) + ")")
        logging.info("Specificity (TN/TN+FP): "  + str(means[2])  + "(" + str(vars[2]) + ")")
        logging.info("Error on positives: "  + str(means[3])  + "(" + str(vars[3]) + ")")
        logging.info("Error on negatives: "  + str(means[4])  + "(" + str(vars[4]) + ")")
        logging.info("Balanced error: "  + str(means[5])  + "(" + str(vars[5]) + ")")

        return (means, vars)

    def trainClassifier(self, CVal, kernel, kernelParamVal, errorCost, sampleSize):
        Parameter.checkFloat(CVal, 0.0, float('inf'))
        Parameter.checkString(kernel, ["linear", "gaussian", "polynomial"])
        Parameter.checkFloat(kernelParamVal, 0.0, float('inf'))
        Parameter.checkFloat(errorCost, 0.0, float('inf'))
        Parameter.checkInt(sampleSize, 0, self.examplesList.getNumExamples())

        logging.info("Training SVM with C=" + str(CVal) + ", " + kernel + " kernel" + ", param=" + str(kernelParamVal) + ", sampleSize=" + str(sampleSize) + ", errorCost=" + str(errorCost))

        self.examplesList.randomSubData(sampleSize)
        self.classifier.setC(C=CVal)
        self.classifier.setKernel(kernel=kernel, kernelParam=kernelParamVal)
        self.classifier.setErrorCost(errorCost)

        X = self.examplesList.getSampledDataField(self.examplesList.getDefaultExamplesName())
        y = self.examplesList.getSampledDataField(self.examplesList.getLabelsName())
        y = y.ravel()
        self.classifier.learnModel(X, y)

        return self.classifier

    def getWeights(self):
        return self.classifier.getWeights()


    def runSimulation(self, maxIterations):
        Parameter.checkInt(maxIterations, 1, float('inf'))

        #Notice that the data is preprocessed in the same way as the survey data
        egoSimulator = EgoSimulator(self.graph, self.classifier, self.preprocessor)

        totalInfo = numpy.zeros(maxIterations+1)
        totalInfo[0] = EgoUtils.getTotalInformation(self.graph)
        logging.info("Total number of people with information: " + str(totalInfo[0]))

        logging.info("--- Simulation Started ---")

        for i in range(0, maxIterations):
            logging.info("--- Iteration " + str(i) + " ---")

            self.graph = egoSimulator.advanceGraph()
            totalInfo[i+1] = EgoUtils.getTotalInformation(self.graph)
            logging.info("Total number of people with information: " + str(totalInfo[i+1]))

            #Compute distribution of ages etc. in alters
            alterIndices = egoSimulator.getAlters(i)
            alterAges = numpy.zeros(len(alterIndices))
            alterGenders = numpy.zeros(len(alterIndices))

            for j in range(0, len(alterIndices)):
                currentVertex = self.graph.getVertex(alterIndices[j])
                alterAges[j] = currentVertex[self.egoQuestionIds.index(("Q5X", 0))]
                alterGenders[j] = currentVertex[self.egoQuestionIds.index(("Q4", 0))]

            (freqs, items) = Util.histogram(alterAges)
            logging.info("Distribution of ages " + str(freqs) + " " + str(items))
            (freqs, items) = Util.histogram(alterGenders)
            logging.info("Distribution of genders " + str(freqs) + " " + str(items))
            
        logging.info("--- Simulation Finished ---")

        return totalInfo, egoSimulator.getTransmissions()

    def getVertexFeatureDistribution(self, fIndex, vIndices=None):
        return self.graph.getVertexFeatureDistribution(fIndex, vIndices)

    def getPreProcessor(self):
        return self.preprocessor

    def getClassifier(self):
        return self.classifier 

    preprocessor = None
    examplesList = None
    classifier = None
    graph = None
    edgeWeight = 1 
Example #19
    def testLearnModel(self):
        numExamples = 50
        numFeatures = 200

        X = numpy.random.randn(numExamples, numFeatures)
        y = numpy.random.randn(numExamples)

        preprocessor = Standardiser()
        X = preprocessor.standardiseArray(X)

        tol = 10**-3
        kernel = LinearKernel()

        #Compare Linear kernel with linear ridge regression 
        lmbda = 0.1
        predictor = KernelRidgeRegression(kernel, lmbda)

        alpha = predictor.learnModel(X, y)
        predY = predictor.predict(X)

        K = numpy.dot(X, X.T)
        alpha2 = numpy.dot(numpy.linalg.inv(K+lmbda*numpy.eye(numExamples)), y)
        predY2 = X.dot(numpy.linalg.inv(numpy.dot(X.T, X) + lmbda*numpy.eye(numFeatures))).dot(X.T).dot(y)



        #logging.debug(numpy.linalg.norm(alpha - alpha2))

        self.assertTrue(numpy.linalg.norm(alpha - alpha2) < tol)
        self.assertTrue(numpy.linalg.norm(predY - predY2) < tol)

        lmbda = 0.5
        predictor = KernelRidgeRegression(kernel, lmbda)

        alpha = predictor.learnModel(X, y)
        predY = predictor.predict(X)

        K = numpy.dot(X, X.T)
        alpha2 = numpy.dot(numpy.linalg.inv(K+lmbda*numpy.eye(numExamples)), y)
        predY2 = X.dot(numpy.linalg.inv(numpy.dot(X.T, X) + lmbda*numpy.eye(numFeatures))).dot(X.T).dot(y)

        self.assertTrue(numpy.linalg.norm(alpha - alpha2) < tol)
        self.assertTrue(numpy.linalg.norm(predY - predY2) < tol)

        #Now test on an alternative test set
        numTestExamples = 50
        testX = numpy.random.randn(numTestExamples, numFeatures)
        predictor = KernelRidgeRegression(kernel, lmbda)

        alpha = predictor.learnModel(X, y)
        predY = predictor.predict(testX)

        K = numpy.dot(X, X.T)
        alpha2 = numpy.dot(numpy.linalg.inv(K+lmbda*numpy.eye(numExamples)), y)
        predY2 = testX.dot(numpy.linalg.inv(numpy.dot(X.T, X) + lmbda*numpy.eye(numFeatures))).dot(X.T).dot(y)

        self.assertTrue(numpy.linalg.norm(alpha - alpha2) < tol)
        self.assertTrue(numpy.linalg.norm(predY - predY2) < tol)

        #Use the method against a multi-label example
        Y = numpy.random.randn(numExamples, numFeatures)

        alpha = predictor.learnModel(X, Y)

        self.assertTrue(alpha.shape == (numExamples, numFeatures))
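
The comparisons above rest on the standard dual/primal ridge identity; a quick standalone check of it on hypothetical toy sizes:

import numpy

numpy.random.seed(21)
X = numpy.random.randn(15, 8)
y = numpy.random.randn(15)
lmbda = 0.5

K = X.dot(X.T)
#Dual (kernel) predictions: K (K + lambda I)^-1 y
dualPred = K.dot(numpy.linalg.solve(K + lmbda*numpy.eye(15), y))
#Primal (ridge regression) predictions: X (X^T X + lambda I)^-1 X^T y
primalPred = X.dot(numpy.linalg.solve(X.T.dot(X) + lmbda*numpy.eye(8), X.T.dot(y)))
assert numpy.linalg.norm(dualPred - primalPred) < 10**-8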
Example #20
    def testAdvanceGraph3(self):
        """ 
        This test learns from a set of ego and alter pairs, then makes predictions on
        the pairs and inspects the results. We then test whether the same results are present in a simulation.
        """
        dataDir = PathDefaults.getDataDir() + "infoDiffusion/"
        matFileName = dataDir +  "EgoAlterTransmissions1000.mat"
        examplesList = ExamplesList.readFromMatFile(matFileName)
        examplesList.setDefaultExamplesName("X")
        examplesList.setLabelsName("y")
        
        logging.debug(("Number of y = +1: " + str(sum(examplesList.getSampledDataField("y") == 1))))
        logging.debug(("Number of y = -1: " + str(sum(examplesList.getSampledDataField("y") == -1))))
        
        #Standardise the examples 
        preprocessor = Standardiser()
        X = examplesList.getDataField(examplesList.getDefaultExamplesName())
        X = preprocessor.standardiseArray(X)
        examplesList.overwriteDataField(examplesList.getDefaultExamplesName(), X)
        
        classifier = MlpySVM(kernel='linear', kp=1, C=32.0)

        y = examplesList.getDataField("y")
        classifier.learnModel(X, y)
        predY = classifier.classify(X)
        logging.debug(("Number of y = +1: " + str(sum(examplesList.getSampledDataField("y") == 1))))
        logging.debug(("Number of y = -1: " + str(sum(examplesList.getSampledDataField("y") == -1))))

        sampledY = examplesList.getSampledDataField(examplesList.getLabelsName()).ravel()

        error = mlpy.err(sampledY, predY)
        sensitivity = mlpy.sens(sampledY, predY)
        specificity = mlpy.spec(sampledY, predY)
        errorP = mlpy.errp(sampledY, predY)
        errorN = mlpy.errn(sampledY, predY)
        
        logging.debug("--- Classification evaluation ---")
        logging.debug(("Error on " + str(examplesList.getNumExamples()) + " examples is " + str(error)))
        logging.debug(("Sensitivity (recall = TP/(TP+FN)): " + str(sensitivity)))
        logging.debug(("Specificity (TN/TN+FP): "  + str(specificity)))
        logging.debug(("Error on positives: "  + str(errorP)))
        logging.debug(("Error on negatives: "  + str(errorN)))
        
        sGraph = EgoUtils.graphFromMatFile(matFileName)

        #Notice that the data is preprocessed in the same way as the survey data 
        egoSimulator = EgoSimulator(sGraph, classifier, preprocessor)
        
        totalInfo = EgoUtils.getTotalInformation(sGraph)
        logging.debug(("Total number of people with information: " + str(totalInfo)))
        self.assertEquals(totalInfo, 1000)
        
        sGraph = egoSimulator.advanceGraph()
        
        totalInfo = EgoUtils.getTotalInformation(sGraph)
        logging.debug(("Total number of people with information: " + str(totalInfo)))
        self.assertEquals(totalInfo, 1000 + sum(predY == 1))
        
        altersList = egoSimulator.getAlters(0)
        predictedAlters = numpy.nonzero(predY == 1)[0]
        
        self.assertTrue((altersList == predictedAlters*2+1).all())
Example #21
    def clusterFromIterator(self, graphListIterator, verbose=False):
        """
        Find a set of clusters for the graphs given by the iterator. If verbose is
        True, each iteration is timed and the timings and bounds are returned as
        lists.
        
        The difference between a weight matrix and the previous one should be
        positive.
        """
        clustersList = []
        decompositionTimeList = [] 
        kMeansTimeList = [] 
        boundList = []
        i = 0

        for subW in graphListIterator:
            if __debug__:
                Parameter.checkSymmetric(subW)

            if self.logStep and i % self.logStep == 0:
                logging.debug("Graph index: " + str(i))
            logging.debug("Clustering graph of size " + str(subW.shape))
            if self.alg!="efficientNystrom": 
                ABBA = GraphUtils.shiftLaplacian(subW)

            # --- Eigen value decomposition ---
            startTime = time.time()
            if self.alg=="IASC": 
                if i % self.T != 0:
                    omega, Q = self.approxUpdateEig(subW, ABBA, omega, Q)   
                    
                    if self.computeBound:
                        inds = numpy.flipud(numpy.argsort(omega))
                        Q = Q[:, inds]
                        omega = omega[inds]
                        bounds = self.pertBound(omega, Q, omegaKbot, AKbot, self.k2)
                        #boundList.append([i, bounds[0], bounds[1]])
                        
                        #Now use accurate values of norm of R and delta   
                        rank = Util.rank(ABBA.todense())
                        gamma, U = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv = ABBA.shape[0])
                        #logging.debug("gamma=" + str(gamma))
                        bounds2 = self.realBound(omega, Q, gamma, AKbot, self.k2)                  
                        boundList.append([i, bounds[0], bounds[1], bounds2[0], bounds2[1]])      
                else: 
                    logging.debug("Computing exact eigenvectors")
                    self.storeInformation(subW, ABBA)

                    if self.computeBound: 
                        #omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2*2, ABBA.shape[0]-1), which="LM", ncv = min(10*self.k2, ABBA.shape[0]))
                        rank = Util.rank(ABBA.todense())
                        omega, Q = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv = ABBA.shape[0])
                        inds = numpy.flipud(numpy.argsort(omega))
                        omegaKbot = omega[inds[self.k2:]]  
                        QKbot = Q[:, inds[self.k2:]] 
                        AKbot = (QKbot*omegaKbot).dot(QKbot.T)
                        
                        omegaSort = numpy.flipud(numpy.sort(omega))
                    else: 
                        omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2, ABBA.shape[0]-1), which="LM", ncv = min(10*self.k2, ABBA.shape[0]))
                            
            elif self.alg == "nystrom":
                omega, Q = Nystrom.eigpsd(ABBA, self.k3)
            elif self.alg == "exact": 
                omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k1, ABBA.shape[0]-1), which="LM", ncv = min(15*self.k1, ABBA.shape[0]))
            elif self.alg == "efficientNystrom":
                omega, Q = EfficientNystrom.eigWeight(subW, self.k2, self.k1)
            elif self.alg == "randomisedSvd": 
                Q, omega, R = RandomisedSVD.svd(ABBA, self.k4)
            else:
                raise ValueError("Invalid Algorithm: " + str(self.alg))

            decompositionTimeList.append(time.time()-startTime)                  
                  
            if self.alg=="IASC":
                self.storeInformation(subW, ABBA)
            
            # --- Kmeans ---
            startTime = time.time()
            inds = numpy.flipud(numpy.argsort(omega))

            standardiser = Standardiser()
            #For some very strange reason we get an overflow when computing the
            #norm of the rows of Q even though its elements are bounded by 1.
            #We'll ignore it for now
            try:
                V = standardiser.normaliseArray(Q[:, inds[0:self.k1]].real.T).T
            except FloatingPointError as e:
                logging.warn("FloatingPointError: " + str(e))
            V = VqUtils.whiten(V)
            if i == 0:
                centroids, distortion = vq.kmeans(V, self.k1, iter=self.nb_iter_kmeans)
            else:
                centroids = self.findCentroids(V, clusters[:subW.shape[0]])
                if centroids.shape[0] < self.k1:
                    nb_missing_centroids = self.k1 - centroids.shape[0]
                    random_centroids = V[numpy.random.randint(0, V.shape[0], nb_missing_centroids),:]
                    centroids = numpy.vstack((centroids, random_centroids))
                centroids, distortion = vq.kmeans(V, centroids) #iter can only be 1
            clusters, distortion = vq.vq(V, centroids)
            kMeansTimeList.append(time.time()-startTime)

            clustersList.append(clusters)

            #logging.debug("subW.shape: " + str(subW.shape))
            #logging.debug("len(clusters): " + str(len(clusters)))
            #from apgl.util.ProfileUtils import ProfileUtils
            #logging.debug("Total memory usage: " + str(ProfileUtils.memory()/10**6) + "MB")
            if ProfileUtils.memory() > 10**9:
                ProfileUtils.memDisplay(locals())

            i += 1

        if verbose:
            return clustersList, numpy.array((decompositionTimeList, kMeansTimeList)).T, boundList
        else:
            return clustersList
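
A minimal sketch of the k-means warm start used above, assuming the previous assignment covers the same rows as V (findCentroids is taken to compute per-cluster means): centroids are seeded from the previous clusters, padded with random rows when clusters have disappeared, then refined by a single vq.kmeans call.

import numpy
import scipy.cluster.vq as vq

def warmStartKMeans(V, prevClusters, k):
    #Mean of each surviving cluster from the previous assignment
    labels = numpy.unique(prevClusters)
    centroids = numpy.array([V[prevClusters == c].mean(0) for c in labels])
    if centroids.shape[0] < k:
        #Pad missing clusters with randomly chosen rows of V
        extra = V[numpy.random.randint(0, V.shape[0], k - centroids.shape[0]), :]
        centroids = numpy.vstack((centroids, extra))
    centroids, distortion = vq.kmeans(V, centroids)   #Refine from the warm start
    clusters, distortion = vq.vq(V, centroids)
    return clusters

V = numpy.random.rand(50, 4)
prevClusters = numpy.random.randint(0, 5, 50)         #Assignment from the previous graph
print(warmStartKMeans(V, prevClusters, 5))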
Example #22
class EgoNetworkSimulator(AbstractDiffusionSimulator):
    """
    A class which combines Ego network prediction with simulating information transmission
    within a simulated social network.
    """
    def __init__(self, graph, predictor):
        """
        Create the class by reading a graph with labelled edges. Instantiate the predictor
        and create a preprocessor to standardise examples to have zero mean and unit variance.
        """
        self.graph = graph
        self.predictor = predictor
        self.errorMethod = Evaluator.balancedError

        #Note: We modify the vertices of the input graph!!!!
        logging.warn("About to modify (normalise) the vertices of the graph.")
        self.preprocessor = Standardiser()
        V = graph.getVertexList().getVertices(graph.getAllVertexIds())
        V = self.preprocessor.normaliseArray(V)
        graph.getVertexList().setVertices(V)

    def getPreprocessor(self):
        """
        Returns the preprocessor
        """
        return self.preprocessor

    def sampleEdges(self, sampleSize):
        """
        This function exists so that we can sample the same examples used in model
        selection and exclude them when running evaluateClassifier.
        """
        edges = self.graph.getAllEdges()
        trainInds = numpy.random.permutation(edges.shape[0])[0:sampleSize]
        trainEdges = edges[trainInds, :]

        trainGraph = SparseGraph(self.graph.getVertexList(), self.graph.isUndirected())
        trainGraph.addEdges(trainEdges, self.graph.getEdgeValues(trainEdges))

        logging.info("Randomly sampled " + str(sampleSize) + " edges")

        return trainGraph

    def modelSelection(self, paramList, paramFunc, folds, errorFunc, sampleSize):
        """
        Perform model selection using an edge label predictor. 
        """
        Parameter.checkInt(folds, 0, sampleSize)
        Parameter.checkInt(sampleSize, 0, self.graph.getNumEdges()) 

        #trainGraph = self.sampleEdges(sampleSize)
        trainGraph = self.graph

        #Perform model selection
        meanErrs, stdErrs = self.predictor.cvModelSelection(trainGraph, paramList, paramFunc, folds, errorFunc)
        logging.info("Model selection errors:" + str(meanErrs))
        logging.info("Model selection stds:" + str(stdErrs))
        logging.info("Model selection best parameters:" + str(paramList[numpy.argmin(meanErrs)]))

        return paramList[numpy.argmin(meanErrs)], paramFunc, meanErrs[numpy.argmin(meanErrs)] 

    def evaluateClassifier(self, params, paramFuncs, folds, errorFunc, sampleSize, invert=True):
        """
        Evaluate the predictor with the given parameters. Often model selection is done before this step
        and in that case, invert=True uses a sample excluding those used for model selection.

        Return a set of errors for each
        """
        Parameter.checkInt(folds, 0, sampleSize)
        Parameter.checkInt(sampleSize, 0, self.graph.getNumEdges())

        trainGraph = self.sampleEdges(sampleSize)

        return self.predictor.cvError(trainGraph, params, paramFuncs, folds, errorFunc)

    def trainClassifier(self, params, paramFuncs, sampleSize):
        
        for j in range(len(params)):
            paramFuncs[j](params[j])

        trainGraph = self.sampleEdges(sampleSize)
        self.predictor.learnModel(trainGraph)

        return self.predictor

    def runSimulation(self, maxIterations):
        Parameter.checkInt(maxIterations, 1, float('inf'))

        #Notice that the data is preprocessed in the same way as the survey data
        egoSimulator = EgoSimulator(self.graph, self.predictor, self.preprocessor)

        totalInfo = numpy.zeros(maxIterations+1)
        totalInfo[0] = EgoUtils.getTotalInformation(self.graph)
        logging.info("Total number of people with information: " + str(totalInfo[0]))

        logging.info("--- Simulation Started ---")

        for i in range(0, maxIterations):
            logging.info("--- Iteration " + str(i) + " ---")

            self.graph = egoSimulator.advanceGraph()
            totalInfo[i+1] = EgoUtils.getTotalInformation(self.graph)
            logging.info("Total number of people with information: " + str(totalInfo[i+1]))

            #Compute distribution of ages etc. in alters
            alterIndices = egoSimulator.getAlters(i)
            alterAges = numpy.zeros(len(alterIndices))
            alterGenders = numpy.zeros(len(alterIndices))

            for j in range(0, len(alterIndices)):
                currentVertex = self.graph.getVertex(alterIndices[j])
                alterAges[j] = currentVertex[self.egoQuestionIds.index(("Q5X", 0))]
                alterGenders[j] = currentVertex[self.egoQuestionIds.index(("Q4", 0))]

            (freqs, items) = Util.histogram(alterAges)
            logging.info("Distribution of ages " + str(freqs) + " " + str(items))
            (freqs, items) = Util.histogram(alterGenders)
            logging.info("Distribution of genders " + str(freqs) + " " + str(items))

        logging.info("--- Simulation Finished ---")

        return totalInfo, egoSimulator.getTransmissions()

    def getVertexFeatureDistribution(self, fIndex, vIndices=None):
        return self.graph.getVertexFeatureDistribution(fIndex, vIndices)

    def getPreProcessor(self):
        return self.preprocessor

    def getClassifier(self):
        return self.predictor

    preprocessor = None
    examplesList = None
    predictor = None
    graph = None
    edgeWeight = 1
Example #23
X = vectoriser.fit_transform(documentList)

print(vectoriser.get_feature_names()) 

corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

id2WordDict = dict(zip(range(len(vectoriser.get_feature_names())), vectoriser.get_feature_names()))   

k = 10
logging.getLogger('gensim').setLevel(logging.INFO)
lda = LdaModel(corpus, num_topics=k, id2word=id2WordDict, chunksize=1000, distributed=False) 
index = gensim.similarities.docsim.SparseMatrixSimilarity(lda[corpus], num_features=k)          

newX = vectoriser.transform(["graph"])
newX = [(i, newX[0, i]) for i in newX.nonzero()[1]]
result = lda[newX]             
similarities = index[result]
similarities = sorted(enumerate(similarities), key=lambda item: -item[1])
print(similarities)

#Compute Hellinger distance
result = [i[1] for i in result]
newX = scipy.sparse.csc_matrix(result)
distances = SparseUtils.hellingerDistances(index.index, newX)
print(1 - distances)

#Try cosine metric 
X = Standardiser().normaliseArray(numpy.array(index.index.todense()).T).T
newX = numpy.array(newX.todense())
similarities = X.dot(newX.T).flatten()
print(similarities)
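
For reference, a tiny sketch of the Hellinger distance between two discrete distributions, which the SparseUtils.hellingerDistances call above is assumed to compute row by row against the index: H(p, q) = sqrt(0.5)*||sqrt(p) - sqrt(q)||, a value in [0, 1].

import numpy

def hellinger(p, q):
    return numpy.sqrt(0.5*numpy.sum((numpy.sqrt(p) - numpy.sqrt(q))**2))

p = numpy.array([0.5, 0.3, 0.2])
q = numpy.array([0.4, 0.4, 0.2])
print(hellinger(p, q))     #Small value: the two topic distributions are close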
Example #24
numExamples = 100
numFeatures = 3
std = 0.1

V = numpy.random.rand(numExamples, numFeatures)
V[0:20, :] = numpy.random.randn(20, numFeatures) * std
V[0:20, 0:3] += numpy.array([1, 0.2, -1])

V[20:70, :] = numpy.random.randn(50, numFeatures) * std
V[20:70, 0:3] += numpy.array([-0.5, 1, -1])

V[70:, :] = numpy.random.randn(30, numFeatures) * std
V[70:, 0:3] += numpy.array([-0.3, 0.4, -0.1])

U = V - numpy.mean(V, 0)
U = Standardiser().normaliseArray(U.T).T

fig = plt.figure(0)
ax = fig.add_subplot(111, projection='3d')
ax.scatter(U[0:20, 0], U[0:20, 1], U[0:20, 2], c="red")
ax.scatter(U[20:70, 0], U[20:70, 1], U[20:70, 2], c="blue")
ax.scatter(U[70:, 0], U[70:, 1], U[70:, 2], c="green")

UU = U.dot(U.T)
#s, X = numpy.linalg.eig(UU)
X, a, Y = numpy.linalg.svd(U)

#Now compute true cluster error
k = 3
kmeans = sklearn.cluster.KMeans(k)
kmeans.fit(U)
Example #25
defaultTol = 10**-5
allEdges = decayGraph.getAllEdges()
edgeDecays = decayGraph.getEdgeValues(allEdges)
#-1 is no transmission, +1 is transmission 
binaryEdges = numpy.array(edgeDecays > defaultTol, numpy.int)*2 -1

logging.info("Total number of transmisions: " + str(numpy.sum(binaryEdges[binaryEdges==1])))

#Centre the decay values
logging.warn("Centering the edge decays for transmissions.")
meanDecay = numpy.mean(edgeDecays[binaryEdges==1]) 
logging.warn("Mean edge value :" + str(meanDecay))
edgeDecays[binaryEdges==1] = edgeDecays[binaryEdges==1] - numpy.mean(edgeDecays[binaryEdges==1])

logging.warn("About to modify (standardise) the vertices of the graph.")
preprocessor = Standardiser()
V = decayGraph.getVertexList().getVertices(decayGraph.getAllVertexIds())
V = preprocessor.standardiseArray(V)
decayGraph.getVertexList().setVertices(V)

#Take a subgraph
#edgeSampleSize = 20000
edgeSampleSize = 1000
transEdges = allEdges[0:edgeSampleSize, :]
transEdgeLabels = binaryEdges[0:edgeSampleSize]

transGraph = SparseGraph(decayGraph.getVertexList(), False)
transGraph.addEdges(transEdges, transEdgeLabels)
logging.info("Created graph of binary transmissions")
logging.info("Transmission graph: " + str(transGraph))
Example #26
dataDir = PathDefaults.getDataDir() +  "metabolomic/"
X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData()

waveletStr = 'db4'
mode = "cpd"
maxLevel = 10
errors = numpy.zeros(maxLevel)
numFeatures = numpy.zeros(maxLevel)

level = 10 
waveletStrs = ["haar", "db4", "db8"]

#The variances are very similar across different wavelets 
for waveletStr in waveletStrs:
    Xw = MetabolomicsUtils.getWaveletFeatures(X, waveletStr, level, mode)
    standardiser = Standardiser()
    Xw = standardiser.centreArray(Xw)
    w, V = numpy.linalg.eig(Xw.dot(Xw.T))
    w = numpy.flipud(numpy.sort(w))

    variances = []
    for numComponents in [1, 5, 10, 15, 20, 25, 50, 100, 150, 200]:
        variances.append(numpy.sum(w[0:numComponents])/numpy.sum(w))
Example #27
    def saveResults(self, labelIndex):
        """
        Compute the results and save them for a particular hormone. Does so for all
        leafranks
        """
        folds = 5
        if type(self.X) == numpy.ndarray:
            X = self.X[self.YList[labelIndex][1], :]
        else:
            X = self.X[labelIndex][self.YList[labelIndex][1], :]

        X = numpy.c_[X, self.ages[self.YList[labelIndex][1]]]
        Y = self.YList[labelIndex][0]
        numExamples = X.shape[0]

        logging.debug("Shape of examples: " + str(X.shape))

        standardiserX = Standardiser()
        X = standardiserX.standardiseArray(X)

        standardiserY = Standardiser()
        Y = standardiserY.standardiseArray(Y)

        #We need to include the ROC curves
        indexList = Sampling.crossValidation(folds, numExamples)
        splitFunction = lambda trainX, trainY: Sampling.crossValidation(folds, trainX.shape[0])
    
        #We need a metric to minimise 
        def invMeanAUC(predY, testY):
            return 1 - self.meanAUC(predY, testY, labelIndex, standardiserY)

        metricMethods = [invMeanAUC]

        #Now create a learnerIterator based on the SVM
        Cs = 2**numpy.arange(-8, 2, dtype=numpy.float)
        gammas = 2**numpy.arange(-10, 0, dtype=numpy.float)
        epsilons = 2**numpy.arange(-5, 0, dtype=numpy.float)

        fileName = self.resultsDir + self.labelNames[labelIndex] + "-svr_rbf-" + self.featuresName +  ".dat"
        learnerIterator = []

        for C in Cs:
            for gamma in gammas:
                for epsilon in epsilons:
                    learner = svm.SVR(C=C, gamma=gamma, epsilon=epsilon)
                    learner.learnModel = learner.fit
                    learnerIterator.append(learner)

        self.saveResult(X, Y, indexList, splitFunction, learnerIterator, metricMethods, fileName, labelIndex, standardiserY)

        #Try the polynomial SVM
        fileName = self.resultsDir + self.labelNames[labelIndex] + "-svr_poly-" + self.featuresName +  ".dat"
        degrees = numpy.array([2, 3])

        for C in Cs:
            for degree in degrees:
                for epsilon in epsilons:
                    learner = svm.SVR(kernel='poly', C=C, degree=degree, epsilon=epsilon)
                    learner.learnModel = learner.fit
                    learnerIterator.append(learner)

        self.saveResult(X, Y, indexList, splitFunction, learnerIterator, metricMethods, fileName, labelIndex, standardiserY)
            
        #Now try Lasso and ElasticNet
        fileName = self.resultsDir + self.labelNames[labelIndex] + "-lasso-" + self.featuresName +  ".dat"
        alphas = 2**numpy.arange(-9, 0, dtype=numpy.float)
        learnerIterator = []

        for alpha in alphas:
            learner = linear_model.Lasso(alpha = alpha)
            learner.learnModel = learner.fit
            learnerIterator.append(learner)

        self.saveResult(X, Y, indexList, splitFunction, learnerIterator, metricMethods, fileName, labelIndex, standardiserY)
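
A tiny sketch of the adapter trick used above, assuming a recent scikit-learn: estimators are given a learnModel alias for fit so they drop into code expecting the apgl-style learnModel/predict interface.

from sklearn import svm

learner = svm.SVR(C=1.0, gamma=0.1, epsilon=0.1)
learner.learnModel = learner.fit      #Alias only; predict already matches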
Example #28
class LinearSvmFGs(AbstractFunctionalPredictor):
    def __init__(self):
        super(LinearSvmFGs, self).__init__()
        self.linearSVM = LinearSVM()

    def learnModel(self, X, y, folds=3):
        """
        Train using the given examples and labels, however first conduct grid
        search in conjunction with cross validation to find the best parameters.
        We also conduct filtering with a variety of values. 
        """
        #Hard coding this is bad
        Cs = 2**numpy.arange(-2, 7, dtype=numpy.float)
        #Cs = numpy.array([0.1, 2.0])

        if self.waveletInds is None:
            self.waveletInds = numpy.arange(X.shape[1])

        nonWaveletInds = numpy.setdiff1d(numpy.arange(X.shape[1]),  self.waveletInds)

        Xw = X[:, self.waveletInds]
        Xo = X[:, nonWaveletInds]

        featureInds = numpy.flipud(numpy.argsort(numpy.sum(Xw**2, 0)))
        meanAUCs = numpy.zeros((Cs.shape[0], self.candidatesN.shape[0]))
        stdAUCs = numpy.zeros((Cs.shape[0], self.candidatesN.shape[0]))

        #Standardise the data
        Xw = Standardiser().standardiseArray(Xw)
        Xo = Standardiser().standardiseArray(Xo)

        for i in range(Cs.shape[0]):
            for j in range(self.candidatesN.shape[0]):
                self.linearSVM.setC(Cs[i])
                newX = numpy.c_[Xw[:, featureInds[0:self.candidatesN[j]]], Xo]
                meanAUCs[i, j], stdAUCs[i, j] = self.linearSVM.evaluateStratifiedCv(newX, y, folds, metricMethod=Evaluator.auc)

        (bestI, bestJ) = numpy.unravel_index(numpy.argmax(meanAUCs), meanAUCs.shape)
        self.linearSVM.setC(Cs[bestI])
        self.featureInds = numpy.r_[self.waveletInds[featureInds[0:self.candidatesN[bestJ]]], nonWaveletInds]
        logging.debug("Best learner found: " + str(self.linearSVM) + " N:" + str(self.candidatesN[bestJ]))

        self.standardiser = Standardiser()
        newX = self.standardiser.standardiseArray(X[:, self.featureInds])
        self.linearSVM.learnModel(newX, y)
        
    def predict(self, X):
        newX = self.standardiser.standardiseArray(X[:, self.featureInds])
        return self.linearSVM.predict(newX)

    @staticmethod
    def generate(waveletInds=None):
        """
        Generate a classifier which does a grid search.
        """
        def generatorFunc():
            linearSvm = LinearSvmFGs()
            linearSvm.setWaveletInds(waveletInds)
            return linearSvm
        return generatorFunc

    def setWeight(self, weight):
        self.linearSVM.setWeight(weight)
Example #29
"""
Compare the clustering methods in scikits.learn to see which ones are fastest
and most accurate 
"""
import time
import numpy
import sklearn.cluster as cluster
from apgl.data.Standardiser import Standardiser
import scipy.cluster.vq as vq

numExamples = 10000
numFeatures = 500

X = numpy.random.rand(numExamples, numFeatures)
X = Standardiser().standardiseArray(X)

k = 10
numRuns = 10
maxIter = 100
tol = 10**-4

initialCentroids = X[0:k, :]

#Quite fast
print("Running scikits learn k means")
clusterer = cluster.KMeans(k=k,
                           n_init=numRuns,
                           tol=tol,
                           init=initialCentroids,
                           max_iter=maxIter)
start = time.clock()
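
The listing stops just after starting the timer; a hedged sketch of how the comparison presumably continues, timing scikit-learn's KMeans against scipy.cluster.vq.kmeans on the same kind of standardised data (recent scikit-learn takes n_clusters rather than the k keyword used above).

import time
import numpy
import scipy.cluster.vq as vq
import sklearn.cluster as cluster

X = numpy.random.rand(1000, 50)
k = 10

start = time.time()
cluster.KMeans(n_clusters=k, n_init=10, max_iter=100, tol=10**-4).fit(X)
print("sklearn KMeans: " + str(time.time() - start) + "s")

start = time.time()
vq.kmeans(vq.whiten(X), k, iter=10)
print("scipy vq.kmeans: " + str(time.time() - start) + "s")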
Example #30
 def __init__(self, learningAlg, windowSize, preprocessor=Standardiser()):
     
     self.windowSize = windowSize
     self.learningAlg = learningAlg
     self.preprocessor = preprocessor 
     self.printStep = 50 
Example #31
numExamples = 100 
numFeatures = 3
std = 0.1 

V = numpy.random.rand(numExamples, numFeatures)
V[0:20 ,:] = numpy.random.randn(20, numFeatures)*std 
V[0:20 ,0:3] += numpy.array([1, 0.2, -1]) 

V[20:70 ,:] = numpy.random.randn(50, numFeatures)*std  
V[20:70, 0:3] += numpy.array([-0.5, 1, -1])

V[70: ,:] = numpy.random.randn(30, numFeatures)*std  
V[70:, 0:3] += numpy.array([-0.3, 0.4, -0.1])

U = V - numpy.mean(V, 0)
U = Standardiser().normaliseArray(U.T).T

fig = plt.figure(0)
ax = fig.add_subplot(111, projection='3d')
ax.scatter(U[0:20, 0], U[0:20, 1], U[0:20, 2], c="red")
ax.scatter(U[20:70, 0], U[20:70, 1], U[20:70, 2], c="blue")
ax.scatter(U[70:, 0], U[70:, 1], U[70:, 2], c="green")

UU = U.dot(U.T)
#s, X = numpy.linalg.eig(UU)
X, a, Y = numpy.linalg.svd(U)

#Now compute true cluster error 
k = 3
kmeans = sklearn.cluster.KMeans(k)
kmeans.fit(U)
Example #32
    def saveResults(self, leafRankGenerators, standardise=True):
        """
        Compute the results and save them for a particular hormone. Does so for all
        leafranks
        """
        j = 0
        nonNaInds = self.YList[j][1]
        hormoneInd = self.hormoneInds[j]

        k = 2
        if type(self.X) == numpy.ndarray:
            X = self.X[nonNaInds, :]
        else:
            X = self.X[j][nonNaInds, :]
        X = numpy.c_[X, self.ages[nonNaInds]]
        if standardise:
            X = Standardiser().standardiseArray(X)
        Y = hormoneInd[k]

        waveletInds = numpy.arange(X.shape[1]-1)

        logging.debug("Shape of examples: " + str(X.shape))
        logging.debug("Distribution of labels: " + str(numpy.bincount(Y)))

        #pca = decomp.PCA(n_components=40)
        #X = pca.fit_transform(X)
        #print(X.shape)

        #Go through all the leafRanks
        for i in range(len(leafRankGenerators)):
            #Compute TreeRankForest here
            fileName = self.resultsDir + "TreeRankForest-" + self.hormoneNames[j] + "_" + str(k) + "-" +  leafRankGenerators[i][1]  + "-" + self.featuresName +  ".dat"
            try:
                logging.debug("Computing file " + fileName)
                #treeRankForest = TreeRankForest(self.funcLeafRankGenerators[0][0](waveletInds))
                treeRankForest = TreeRankForest(self.leafRankGenerators[0][0])
                treeRankForest.setMaxDepth(10)
                treeRankForest.setNumTrees(5)
                #Setting this low definitely helps 
                #treeRankForest.setFeatureSize(1.0)
                treeRankForest.setFeatureSize(0.05)
                #The following 2 lines definitely improve stability and the AUC 
                treeRankForest.setSampleSize(1.0)
                #Setting this to true results in slightly worse results 
                treeRankForest.setSampleReplace(True)
                mean, var = treeRankForest.evaluateStratifiedCv(X, Y, self.folds, metricMethod=Evaluator.auc)
                print(mean)

                #treeRank = TreeRank(self.leafRankGenerators[0][0])
                #treeRank.setMaxDepth(self.maxDepth)
                #(bestParams, allMetrics, bestMetaDicts) = treeRank.evaluateCvOuter(X, Y, self.folds)
                #print(str(allMetrics))


                #Util.savePickle(cvResults, fileName)
            except:
                logging.debug("Caught an error in the code ... skipping")
                raise
            else:
                logging.debug("File exists: " + fileName)
        return
Example #33
#The set of edges indexed by zeros is the contact graph
#The ones indexed by 1 is the infection graph
edgeTypeIndex1 = 0
edgeTypeIndex2 = 1
sGraphContact = graph.getSparseGraph(edgeTypeIndex1)
sGraphInfect = graph.getSparseGraph(edgeTypeIndex2)

numpy.set_printoptions(precision=3, suppress=True)

logging.info("Statistics over Verticies ")
logging.info("===============================")
#Other measures :  infection period of tree, infection types
#PCA to find variance of data, correlation matrix, center matrix
X = graph.getVertexList().copy().getVertices(list(range(0, graph.getNumVertices())))

standardiser = Standardiser()
X = standardiser.standardiseArray(X)
centerArray = standardiser.getCentreVector()

C = numpy.dot(X.T, X)

print((Latex.array2DToRows(numpy.reshape(centerArray, (5, 8)))))

C2 = numpy.abs(C - numpy.eye(X.shape[1]))
C2[numpy.tril_indices(C.shape[0])] = 0

inds = numpy.flipud(numpy.argsort(C2, None))
numEls = 10

for i in range(numEls):
    corr = "%.3f" % C[numpy.unravel_index(inds[i], C2.shape)]