def random2Choice(V, n=1):
    """
    Make random binary choices from a vector V of two values which are
    unnormalised probabilities, and return the chosen indices. For example if
    V = [1, 2] then the probabilities of indices 0 and 1 are [1/3, 2/3].
    If V is a matrix, each row is taken as a pair of unnormalised
    probabilities and choices are made independently for each row.

    :param V: A vector of length 2, or a matrix with 2 columns.
    :type V: :class:`numpy.ndarray`

    :param n: The number of random choices to make (per row for a matrix).
    :type n: :class:`int`

    :returns: An integer array of 0/1 indices with shape (n,) for a vector V, or (V.shape[0], n) for a matrix V.

    :raises ValueError: If V is not binary, or is not 1 or 2 dimensional.
    """
    Parameter.checkClass(V, numpy.ndarray)

    if V.ndim == 1 and V.shape[0] != 2:
        raise ValueError("Function only works on binary probabilities")
    if V.ndim == 2 and V.shape[1] != 2:
        raise ValueError("Function only works on binary probabilities")

    if V.ndim == 1:
        cumV = numpy.cumsum(V)
        # Uniform draws scaled to the total weight
        p = numpy.random.rand(n) * cumV[-1]
        # Index 1 is chosen when the draw is at least the first weight
        cumV2 = numpy.ones(n) * cumV[0] - p
        # Builtin int: the numpy.int alias was removed in NumPy 1.24
        return numpy.array(cumV2 <= 0, int)
    elif V.ndim == 2:
        cumV = numpy.cumsum(V, 1)
        # One row of draws per row of V, each scaled by that row's total
        P = numpy.random.rand(V.shape[0], n) * numpy.array([cumV[:, -1]]).T
        cumV2 = numpy.outer(cumV[:, 0], numpy.ones(n)) - P
        return numpy.array(cumV2 <= 0, int)
    else:
        raise ValueError("Invalid number of dimensions")
def classify(self, X):
    """
    Predict a label for each row of X.

    For every example the per-feature conditional columns stored in
    self.condMatrices are multiplied together and the label from
    self.labelSet with the largest product is returned. A feature value that
    is not in the corresponding entry of self.featureSets contributes a
    uniform vector instead. The winning product for each example is stored
    in self.pys.

    :param X: A matrix with examples as rows.
    :type X: :class:`numpy.ndarray`

    :returns: A vector with one predicted label per example.

    :raises ValueError: If both self.featureSets and self.condMatrices are empty.
    """
    Parameter.checkClass(X, numpy.ndarray)

    if len(self.featureSets) == 0 and len(self.condMatrices) == 0:
        raise ValueError("Must train before classification.")

    numExamples, numFeatures = X.shape[0], X.shape[1]
    y = numpy.zeros((numExamples))
    pys = numpy.zeros((numExamples))

    for i in range(numExamples):
        numLabels = self.labelSet.shape[0]
        # Running product over features, one entry per candidate label
        labelProbs = numpy.ones((numLabels))

        for j in range(numFeatures):
            value = X[i, j]
            if value in self.featureSets[j]:
                fIndex = self.featureSets[j].tolist().index(value)
                pYgivenXj = self.condMatrices[j][:, fIndex]
            else:
                # Unseen feature value: assume uniform probability
                pYgivenXj = numpy.ones((numLabels))/numLabels

            labelProbs = labelProbs * pYgivenXj

        best = numpy.argmax(labelProbs)
        y[i] = self.labelSet[best]
        pys[i] = labelProbs[best]

    logging.info("Classified " + str(numExamples) + " examples and " + str(numFeatures) + " features.")
    self.pys = pys

    return y
def setDiff(self, graph):
    """
    Find the edges in the current graph which are not present in the input
    graph.

    :param graph: the input graph.
    :type graph: :class:`apgl.graph.SparseGraph`

    :returns: A new graph with edges from the current graph and not in the input graph.

    :raises ValueError: If the graphs differ in number of vertices or in directedness.
    """
    Parameter.checkClass(graph, SparseGraph)
    if graph.getNumVertices() != self.getNumVertices():
        raise ValueError("Can only add edges from graph with same number of vertices")
    if self.undirected != graph.undirected:
        raise ValueError("Both graphs must be either undirected or directed")

    diff = self.nativeAdjacencyMatrix() - graph.nativeAdjacencyMatrix()
    # For difference entries in {-1, 0, 1}, (d + d*d)/2 keeps 1 and maps
    # both 0 and -1 to 0
    remaining = (diff + diff.multiply(diff)) / 2
    remaining.prune()

    result = SparseGraph(self.vList, self.undirected)
    result.W = remaining
    return result
def concat(self, graph):
    """
    Take a new graph and concatenate it to the current one. Returns a new
    graph of the concatenated graphs with this graph's vertices first in the
    new list of vertices. No edges are created between the two graphs: the
    new weight matrix is block diagonal.

    :param graph: the input graph.
    :type graph: :class:`apgl.graph.SparseGraph`

    :returns: A new graph containing the vertices and edges of both graphs.

    :raises ValueError: If the vertex lists are of different types or the graphs differ in directedness.
    """
    Parameter.checkClass(graph, SparseGraph)
    if type(graph.getVertexList()) != type(self.getVertexList()):
        raise ValueError("Vertex lists must be of same type")
    if graph.isUndirected() != self.isUndirected():
        raise ValueError("Graphs must be of the same directed type")

    numVertices = self.getNumVertices() + graph.getNumVertices()
    vList = GeneralVertexList(numVertices)
    vList.setVertices(self.getVertexList().getVertices(), list(range(self.getNumVertices())))
    vList.setVertices(graph.getVertexList().getVertices(), list(range(self.getNumVertices(), numVertices)))

    # Both inputs share the same directedness (checked above), so carry it
    # over to the result instead of relying on the constructor default.
    newGraph = SparseGraph(vList, self.isUndirected())

    W = scipy.sparse.bmat([[self.W, None], [None, graph.W]], format="csr")
    newGraph.setWeightMatrixSparse(W)

    return newGraph
def sequenceVectorStats(self, graph, subgraphIndices, treeStats=False, eigenStats=True):
    """
    Compute vector statistics on a sequence of subgraphs of a graph.

    :param graph: The graph from which subgraphs are taken.

    :param subgraphIndices: A list in which each element is a list of vertex indices defining one subgraph.

    :param treeStats: Passed on to vectorStatistics.
    :type treeStats: :class:`bool`

    :param eigenStats: Passed on to vectorStatistics.

    :returns: A list with one dict of vector statistics per subgraph.
    """
    Parameter.checkClass(graph, AbstractMatrixGraph)
    for inds in subgraphIndices:
        Parameter.checkList(inds, Parameter.checkInt, [0, graph.getNumVertices()])
    Parameter.checkBoolean(treeStats)

    numGraphs = len(subgraphIndices)
    statsDictList = []

    for i, vertexInds in enumerate(subgraphIndices):
        Util.printIteration(i, self.vectorPrintStep, numGraphs)
        subgraph = graph.subgraph(vertexInds)
        stats = self.vectorStatistics(subgraph, treeStats, eigenStats)
        statsDictList.append(stats)

    return statsDictList
def add(self, graph):
    """
    Add the edge weights of the input graph to the current one. Results in a
    union of the edges.

    :param graph: the input graph.
    :type graph: :class:`apgl.graph.SparseGraph`

    :returns: A new graph with same vertex list and addition of edge weights

    :raises ValueError: If the graphs differ in number of vertices or in directedness.
    """
    Parameter.checkClass(graph, SparseGraph)
    if graph.getNumVertices() != self.getNumVertices():
        raise ValueError("Can only add edges from graph with same number of vertices")
    if self.undirected != graph.undirected:
        raise ValueError("Both graphs must be either undirected or directed")

    # The ideal way is to add both weight matrices together, but this
    # results in a csr, so the sum is written entry by entry instead.
    rowInds, colInds = numpy.nonzero(graph.W)

    result = SparseGraph(self.vList, self.undirected)
    result.W = self.W.copy()

    # Only positions where the input graph has a non-zero weight can change
    for r, c in zip(rowInds, colInds):
        result.W[r, c] = self.W[r, c] + graph.W[r, c]

    return result
def sequenceScalarStats(self, graph, subgraphIndices, slowStats=True, treeStats=False):
    """
    Compute scalar statistics on a sequence of subgraphs of a graph.

    :param graph: The graph from which subgraphs are taken.

    :param subgraphIndices: A list in which each element is a list of vertex indices defining one subgraph.

    :param slowStats: Passed on to scalarStatistics.
    :type slowStats: :class:`bool`

    :param treeStats: Passed on to scalarStatistics.
    :type treeStats: :class:`bool`

    :returns: A matrix with self.numStats columns in which row i holds the statistics of subgraph i.
    """
    Parameter.checkClass(graph, AbstractMatrixGraph)
    for inds in subgraphIndices:
        Parameter.checkList(inds, Parameter.checkInt, [0, graph.getNumVertices()])
    Parameter.checkBoolean(slowStats)
    Parameter.checkBoolean(treeStats)

    numGraphs = len(subgraphIndices)
    statsMatrix = numpy.zeros((numGraphs, self.numStats))

    for i, vertexInds in enumerate(subgraphIndices):
        Util.printIteration(i, self.printStep, numGraphs)
        subgraph = graph.subgraph(vertexInds)
        statsMatrix[i, :] = self.scalarStatistics(subgraph, slowStats, treeStats)

    return statsMatrix
def parallelVfcvRbf(self, X, y, idx, type="C_SVC"):
    """
    Run model selection over the train/test splits in idx using the
    "gaussian" kernel. The parameter grid is built from getCs() and
    getGammas(), plus getEpsilons() when type is not "C_SVC", and is handed
    to parallelModelSelect.

    :param X: The examples as rows
    :type X: :class:`numpy.ndarray`

    :param y: The labels
    :type y: :class:`numpy.ndarray`

    :param idx: A list of train/test splits

    :param type: "C_SVC" to search over C and gamma only; any other value also searches over epsilon.

    :returns: Whatever parallelModelSelect returns for this grid.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(y, numpy.ndarray)
    # Not used below; len() raises here if idx is not sized
    folds = len(idx)

    self.setKernel("gaussian")

    paramDict = {"setC": self.getCs(), "setGamma": self.getGammas()}
    if type != "C_SVC":
        paramDict["setEpsilon"] = self.getEpsilons()

    return self.parallelModelSelect(X, y, idx, paramDict)
def evaluate(self, X1, X2):
    """
    Evaluate the kernel between every row of X1 and every row of X2. Entry
    (i, j) of the result is exp(-||X1[i] - X2[j]||^2 / (2*sigma^2)).

    :param X1: First set of examples.
    :type X1: :class:`numpy.ndarray`

    :param X2: Second set of examples.
    :type X2: :class:`numpy.ndarray`

    :returns: A matrix with one row per example of X1 and one column per example of X2.

    :raises ValueError: If X1 and X2 have a different number of columns.
    """
    Parameter.checkClass(X1, numpy.ndarray)
    Parameter.checkClass(X2, numpy.ndarray)

    if X1.shape[1] != X2.shape[1]:
        raise ValueError("Invalid matrix dimentions: " + str(X1.shape) + " " + str(X2.shape))

    # Squared norms of each example
    sqNorms1 = numpy.sum(X1**2, 1)
    sqNorms2 = numpy.sum(X2**2, 1)
    ones1 = numpy.ones((X1.shape[0], 1))
    ones2 = numpy.ones((X2.shape[0], 1))

    # 2<x, y> - ||x||^2 - ||y||^2 is the negated squared distance
    crossTerms = 2*numpy.dot(X1, X2.T)
    negSqDists = crossTerms - numpy.outer(sqNorms1, ones2) - numpy.outer(ones1, sqNorms2)

    return numpy.exp(negSqDists / (2*self.sigma**2))
def parallelPenaltyGridRbf(svm, X, y, fullX, gridPoints, pdfX, pdfY1X, pdfYminus1X):
    """
    Find out the "ideal" penalty for every (C, gamma) pair of the svm by
    running computeIdealPenalty in a pool of worker processes.

    :param svm: An object exposing the arrays Cs and gammas.

    :param X: The examples as rows
    :type X: :class:`numpy.ndarray`

    :param y: The labels
    :type y: :class:`numpy.ndarray`

    :param fullX: Passed through unchanged to computeIdealPenalty, as are gridPoints, pdfX, pdfY1X and pdfYminus1X.

    :returns: A matrix whose entry (i, j) is the result for svm.Cs[i] and svm.gammas[j].
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(y, numpy.ndarray)

    chunkSize = 10
    numCs = svm.Cs.shape[0]
    numGammas = svm.gammas.shape[0]
    idealPenalties = numpy.zeros((numCs, numGammas))

    # One task per grid cell, in row-major order
    paramList = [
        (X, y, fullX, svm.Cs[i], svm.gammas[j], gridPoints, pdfX, pdfY1X, pdfYminus1X)
        for i in range(numCs)
        for j in range(numGammas)
    ]

    pool = multiprocessing.Pool()
    try:
        # imap yields results in submission order, matching the loops below
        resultsIterator = pool.imap(computeIdealPenalty, paramList, chunkSize)

        for i in range(numCs):
            for j in range(numGammas):
                idealPenalties[i, j] = next(resultsIterator)
    finally:
        # Always release the workers, even if a task raised
        pool.terminate()

    return idealPenalties
def randomChoice(V, n=1):
    """
    Make random choices from a vector V of values which are unnormalised
    probabilities, and return the chosen indices. For example if
    V = [1, 2, 4] then the probabilities of the indices respectively are
    [1/7, 2/7, 4/7]. If V is a matrix, then the rows are taken as
    probabilities, and choices are made for each row.

    :param V: A vector, or a matrix with one set of unnormalised probabilities per row.
    :type V: :class:`numpy.ndarray`

    :param n: The number of random choices to make (per row for a matrix).
    :type n: :class:`int`

    :returns: -1 if V has no entries along its first axis; otherwise an array of indices with shape (n,) for a vector V, or (V.shape[0], n) for a matrix V.

    :raises ValueError: If V is not 1 or 2 dimensional.
    """
    Parameter.checkClass(V, numpy.ndarray)

    if V.shape[0] == 0:
        return -1

    if V.ndim == 1:
        cumV = numpy.cumsum(V)
        # Uniform draws scaled to the total weight, located in the cumulative sums
        p = numpy.random.rand(n) * cumV[-1]
        return numpy.searchsorted(cumV, p)
    elif V.ndim == 2:
        cumV = numpy.cumsum(V, 1)
        P = numpy.random.rand(V.shape[0], n) * numpy.array([cumV[:, -1]]).T

        # Builtin int: the numpy.int alias was removed in NumPy 1.24
        inds = numpy.zeros(P.shape, int)
        for i in range(P.shape[0]):
            inds[i, :] = numpy.searchsorted(cumV[i, :], P[i, :])

        return inds
    else:
        raise ValueError("Invalid number of dimensions")
def evaluate(self, X1, X2):
    """
    Evaluate the kernel between every row of X1 and every row of X2. Entry
    (i, j) of the result is exp(-||X1[i] - X2[j]||^2 / (2*sigma^2)).

    :param X1: First set of examples.
    :type X1: :class:`numpy.ndarray`

    :param X2: Second set of examples.
    :type X2: :class:`numpy.ndarray`

    :returns: A matrix with one row per example of X1 and one column per example of X2.

    :raises ValueError: If X1 and X2 have a different number of columns.
    """
    Parameter.checkClass(X1, numpy.ndarray)
    Parameter.checkClass(X2, numpy.ndarray)

    if X1.shape[1] != X2.shape[1]:
        raise ValueError("Invalid matrix dimentions: " + str(X1.shape) + " " + str(X2.shape))

    # Squared norms of each example
    sqNorms1 = numpy.sum(X1**2, 1)
    sqNorms2 = numpy.sum(X2**2, 1)
    ones1 = numpy.ones((X1.shape[0], 1))
    ones2 = numpy.ones((X2.shape[0], 1))

    # 2<x, y> - ||x||^2 - ||y||^2 is the negated squared distance
    crossTerms = 2*numpy.dot(X1, X2.T)
    negSqDists = crossTerms - numpy.outer(sqNorms1, ones2) - numpy.outer(ones1, sqNorms2)

    return numpy.exp(negSqDists / (2*self.sigma**2))
def learnModel(self, X, Y):
    """
    Learn a model from the examples X and labels Y by building a data frame
    with a "class ~ ." formula and passing it to learnModelDataFrame.
    Garbage collection is then triggered on both the Python and the R side.

    :param X: A matrix with examples as rows
    :type X: :class:`numpy.ndarray`

    :param Y: The labels; a 1D vector is converted to a single column.
    :type Y: :class:`numpy.ndarray`

    :raises ValueError: If Y contains fewer than 2 distinct values.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkArray(X)
    Parameter.checkArray(Y)

    # Only a lower bound on the number of distinct labels is enforced here
    if numpy.unique(Y).shape[0] < 2:
        raise ValueError("Vector of labels must be binary, currently numpy.unique(Y) = " + str(numpy.unique(Y)))

    #If Y is 1D make it 2D
    if Y.ndim == 1:
        Y = numpy.array([Y]).T

    XY = self._getDataFrame(X, Y)
    formula = robjects.Formula('class ~ .')
    self.learnModelDataFrame(formula, XY)

    # Collect garbage in Python and in R
    gc.collect()
    robjects.r('gc(verbose=TRUE)')
    robjects.r('memory.profile()')
    gc.collect()

    if self.printMemStats:
        logging.debug(self.getLsos()())
        # NOTE: locals() is passed on, so the set of local names in this
        # method is part of what gets reported
        logging.debug(ProfileUtils.memDisplay(locals()))
def setDiff(self, graph):
    """
    Find the edges in the current graph which are not present in the input
    graph.

    :param graph: the input graph.
    :type graph: :class:`apgl.graph.PySparseGraph`

    :returns: A new graph with edges from the current graph and not in the input graph.

    :raises ValueError: If the graphs differ in number of vertices or in directedness.
    """
    Parameter.checkClass(graph, PySparseGraph)
    if graph.getNumVertices() != self.getNumVertices():
        raise ValueError("Can only add edges from graph with same number of vertices")
    if self.undirected != graph.undirected:
        raise ValueError("Both graphs must be either undirected or directed")

    A1 = self.nativeAdjacencyMatrix()
    A2 = graph.nativeAdjacencyMatrix()

    # Only positions that are non-zero in this graph can survive
    (rows, cols) = PySparseUtils.nonzero(A1)
    numEntries = len(rows)
    ownVals = numpy.zeros(numEntries)
    otherVals = numpy.zeros(numEntries)

    # take() fills the given arrays with the entries at (rows, cols)
    A1.take(ownVals, rows, cols)
    A2.take(otherVals, rows, cols)
    A1.put(ownVals - otherVals, rows, cols)

    result = PySparseGraph(self.vList, self.undirected)
    result.W = A1
    return result
def setDiff(self, graph):
    """
    Find the edges in the current graph which are not present in the input
    graph.

    :param graph: the input graph.
    :type graph: :class:`apgl.graph.PySparseGraph`

    :returns: A new graph with edges from the current graph and not in the input graph.

    :raises ValueError: If the graphs differ in number of vertices or in directedness.
    """
    Parameter.checkClass(graph, PySparseGraph)
    if graph.getNumVertices() != self.getNumVertices():
        raise ValueError("Can only add edges from graph with same number of vertices")
    if self.undirected != graph.undirected:
        raise ValueError("Both graphs must be either undirected or directed")

    A1 = self.nativeAdjacencyMatrix()
    A2 = graph.nativeAdjacencyMatrix()

    # Only positions that are non-zero in this graph can survive
    (rows, cols) = PySparseUtils.nonzero(A1)
    numEntries = len(rows)
    ownVals = numpy.zeros(numEntries)
    otherVals = numpy.zeros(numEntries)

    # take() fills the given arrays with the entries at (rows, cols)
    A1.take(ownVals, rows, cols)
    A2.take(otherVals, rows, cols)
    A1.put(ownVals - otherVals, rows, cols)

    result = PySparseGraph(self.vList, self.undirected)
    result.W = A1
    return result
def setDiff(self, graph):
    """
    Find the edges in the current graph which are not present in the input
    graph. The result is built from the adjacency matrices of the two
    graphs.

    :param graph: the input graph.
    :type graph: :class:`apgl.graph.DenseGraph`

    :returns: The graph which is the set difference of the edges of this graph and graph.

    :raises ValueError: If the graphs differ in number of vertices or in directedness.
    """
    Parameter.checkClass(graph, DenseGraph)
    if graph.getNumVertices() != self.getNumVertices():
        raise ValueError("Can only add edges from graph with same number of vertices")
    if self.undirected != graph.undirected:
        raise ValueError("Both graphs must be either undirected or directed")

    diff = self.adjacencyMatrix() - graph.adjacencyMatrix()
    # For difference entries in {-1, 0, 1}, (d + |d**2|)/2 keeps 1 and maps
    # both 0 and -1 to 0
    remaining = (diff + numpy.abs(diff**2)) / 2

    result = DenseGraph(self.vList, self.undirected)
    result.W = remaining
    return result
def multiply(self, graph):
    """
    Multiply the edge weights of the input graph to the current one. Results
    in an intersection of the edges.

    :param graph: the input graph.
    :type graph: :class:`apgl.graph.PySparseGraph`

    :returns: A new graph with edge weights which are multiples of the current and graph

    :raises ValueError: If the graphs differ in number of vertices or in directedness.
    """
    Parameter.checkClass(graph, PySparseGraph)
    if graph.getNumVertices() != self.getNumVertices():
        raise ValueError("Can only add edges from graph with same number of vertices")
    if self.undirected != graph.undirected:
        raise ValueError("Both graphs must be either undirected or directed")

    # A product can only be non-zero where both matrices are non-zero, so it
    # is enough to visit the non-zeros of the matrix with fewer of them
    sparser = self.W if self.W.nnz < graph.W.nnz else graph.W
    (rows, cols) = PySparseUtils.nonzero(sparser)

    numEntries = len(rows)
    ownVals = numpy.zeros(numEntries)
    otherVals = numpy.zeros(numEntries)
    self.W.take(ownVals, rows, cols)
    graph.W.take(otherVals, rows, cols)

    result = PySparseGraph(self.vList, self.undirected)
    result.W.put(ownVals * otherVals, rows, cols)
    return result
def scalarStatistics(self, graph):
    """
    Compute harmonic geodesic distance statistics on groups of vertices of
    the graph. Each statistic is left at -1 when its group of vertices is
    empty.

    :param graph: The graph to analyse; the columns of its vertex array are addressed through self.fInds.
    :type graph: :class:`AbstractMatrixGraph`

    :returns: A vector of length self.numStats, with -1 in every position that was not computed.
    """
    Parameter.checkClass(graph, AbstractMatrixGraph)

    statsArray = numpy.ones(self.numStats)*-1

    #Find geodesic distance between MSMs
    logging.debug("Running Floyd-Warshall")
    P = graph.floydWarshall(False)
    V = graph.getVertexList().getVertices(list(range(graph.getNumVertices())))

    bisexual = CsvConverters.orientConv('HB')
    msmIndices = list(numpy.nonzero(V[:, self.fInds["orient"]]==bisexual)[0])
    if len(msmIndices) != 0:
        statsArray[self.msmGeodesicIndex] = graph.harmonicGeodesicDistance(P, msmIndices)

    male = CsvConverters.genderConv('M')
    menIndices = list(numpy.nonzero(V[:, self.fInds["gender"]]==male)[0])
    if len(menIndices) != 0:
        menGraph = graph.subgraph(menIndices)
        statsArray[self.menSubgraphGeodesicIndex] = menGraph.harmonicGeodesicDistance()

    contactTrIndices = list(numpy.nonzero(V[:, self.fInds["contactTrace"]]==1)[0])
    if len(contactTrIndices) != 0:
        ctGraph = graph.subgraph(contactTrIndices)
        statsArray[self.ctSubgraphGeodesicIndex] = ctGraph.harmonicGeodesicDistance()

    # Vertices sorted by out-degree; the last numInds are the most connected
    degreeSequence = graph.outDegreeSequence()
    sortedInds = numpy.argsort(degreeSequence)
    numInds = int(float(graph.getNumVertices())*self.topConnect)
    # sortedInds[-0:] would select every vertex rather than none, so an
    # empty selection is skipped and the statistic stays at -1
    if numInds > 0:
        topConnectInds = sortedInds[-numInds:]
        statsArray[self.mostConnectedGeodesicIndex] = graph.harmonicGeodesicDistance(P, topConnectInds)

    return statsArray
def predict(self, X):
    """
    Make a prediction for a set of examples given as the rows of the matrix
    X, by walking down the learnt tree level by level.

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`

    :return: A vector of scores corresponding to each example.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkArray(X)

    numExamples = X.shape[0]
    scores = numpy.zeros(numExamples)

    # Every example starts at the root
    rootNode = self.tree.getVertex((0, 0))
    rootNode.setTestInds(numpy.arange(numExamples))

    for depth in range(self.maxDepth+1):
        for position in range(2**depth):
            if not self.tree.vertexExists((depth, position)):
                continue

            self.classifyNode(self.tree, X, depth, position)
            node = self.tree.getVertex((depth, position))

            # Examples that ended up in a leaf receive that leaf's score
            if node.isLeafNode():
                leafInds = node.getTestInds()
                scores[leafInds] = node.getScore()

    return scores
def __init__(self, fileName):
    """
    Lock a job whose results are saved as fileName. The lock file name is
    fileName with ".lock" appended.

    :param fileName: The name of the results file.
    :type fileName: :class:`str`
    """
    Parameter.checkClass(fileName, str)

    self.fileName = fileName
    self.lockFileName = fileName + ".lock"
def random2Choice(V, n=1):
    """
    Make random binary choices from a vector V of two values which are
    unnormalised probabilities, and return the chosen indices. For example if
    V = [1, 2] then the probabilities of indices 0 and 1 are [1/3, 2/3].
    If V is a matrix, each row is taken as a pair of unnormalised
    probabilities and choices are made independently for each row.

    :param V: A vector of length 2, or a matrix with 2 columns.
    :type V: :class:`numpy.ndarray`

    :param n: The number of random choices to make (per row for a matrix).
    :type n: :class:`int`

    :returns: An integer array of 0/1 indices with shape (n,) for a vector V, or (V.shape[0], n) for a matrix V.

    :raises ValueError: If V is not binary, or is not 1 or 2 dimensional.
    """
    Parameter.checkClass(V, numpy.ndarray)

    if V.ndim == 1 and V.shape[0] != 2:
        raise ValueError("Function only works on binary probabilities")
    if V.ndim == 2 and V.shape[1] != 2:
        raise ValueError("Function only works on binary probabilities")

    if V.ndim == 1:
        cumV = numpy.cumsum(V)
        # Uniform draws scaled to the total weight
        p = numpy.random.rand(n) * cumV[-1]
        # Index 1 is chosen when the draw is at least the first weight
        cumV2 = numpy.ones(n) * cumV[0] - p
        # Builtin int: the numpy.int alias was removed in NumPy 1.24
        return numpy.array(cumV2 <= 0, int)
    elif V.ndim == 2:
        cumV = numpy.cumsum(V, 1)
        # One row of draws per row of V, each scaled by that row's total
        P = numpy.random.rand(V.shape[0], n) * numpy.array([cumV[:, -1]]).T
        cumV2 = numpy.outer(cumV[:, 0], numpy.ones(n)) - P
        return numpy.array(cumV2 <= 0, int)
    else:
        raise ValueError("Invalid number of dimensions")
def randomChoice(V, n=1):
    """
    Make random choices from a vector V of values which are unnormalised
    probabilities, and return the chosen indices. For example if
    V = [1, 2, 4] then the probabilities of the indices respectively are
    [1/7, 2/7, 4/7]. If V is a matrix, then the rows are taken as
    probabilities, and choices are made for each row.

    :param V: A vector, or a matrix with one set of unnormalised probabilities per row.
    :type V: :class:`numpy.ndarray`

    :param n: The number of random choices to make (per row for a matrix).
    :type n: :class:`int`

    :returns: -1 if V has no entries along its first axis; otherwise an array of indices with shape (n,) for a vector V, or (V.shape[0], n) for a matrix V.

    :raises ValueError: If V is not 1 or 2 dimensional.
    """
    Parameter.checkClass(V, numpy.ndarray)

    if V.shape[0] == 0:
        return -1

    if V.ndim == 1:
        cumV = numpy.cumsum(V)
        # Uniform draws scaled to the total weight, located in the cumulative sums
        p = numpy.random.rand(n) * cumV[-1]
        return numpy.searchsorted(cumV, p)
    elif V.ndim == 2:
        cumV = numpy.cumsum(V, 1)
        P = numpy.random.rand(V.shape[0], n) * numpy.array([cumV[:, -1]]).T

        # Builtin int: the numpy.int alias was removed in NumPy 1.24
        inds = numpy.zeros(P.shape, int)
        for i in range(P.shape[0]):
            inds[i, :] = numpy.searchsorted(cumV[i, :], P[i, :])

        return inds
    else:
        raise ValueError("Invalid number of dimensions")
def learnModel(self, X, Y):
    """
    Learn the weight matrix U which matches X and Y by solving the
    regularised normal equations (X'X + lmbda*I) U = X'Y. The result is
    stored in self.U.

    :param X: A matrix with examples as rows; must have at least one row and one column.
    :type X: :class:`numpy.ndarray`

    :param Y: The targets, with one row (or entry) per example.
    :type Y: :class:`numpy.ndarray`

    :returns: The learnt weights self.U.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkInt(X.shape[0], 1, float('inf'))
    Parameter.checkInt(X.shape[1], 1, float('inf'))

    numExamples = X.shape[0]
    numFeatures = X.shape[1]

    logging.debug("Training with " + str(numExamples) + " examples and " + str(numFeatures) + " features")

    I = numpy.eye(numFeatures)
    XX = numpy.dot(X.T, X)
    XY = numpy.dot(X.T, Y)

    # Solve the linear system directly rather than forming an explicit
    # inverse and multiplying by it
    self.U = numpy.linalg.solve(XX + self.lmbda*I, XY)

    logging.debug("Trace of X'X " + str(numpy.trace(XX)))
    logging.debug("Error " + str(numpy.linalg.norm(numpy.dot(X, self.U) - Y)))

    return self.U
def multiply(self, graph):
    """
    Multiply the edge weights of the input graph to the current one. Results
    in an intersection of the edges.

    :param graph: the input graph.
    :type graph: :class:`apgl.graph.PySparseGraph`

    :returns: A new graph with edge weights which are multiples of the current and graph

    :raises ValueError: If the graphs differ in number of vertices or in directedness.
    """
    Parameter.checkClass(graph, PySparseGraph)
    if graph.getNumVertices() != self.getNumVertices():
        raise ValueError("Can only add edges from graph with same number of vertices")
    if self.undirected != graph.undirected:
        raise ValueError("Both graphs must be either undirected or directed")

    # A product can only be non-zero where both matrices are non-zero, so it
    # is enough to visit the non-zeros of the matrix with fewer of them
    sparser = self.W if self.W.nnz < graph.W.nnz else graph.W
    (rows, cols) = PySparseUtils.nonzero(sparser)

    numEntries = len(rows)
    ownVals = numpy.zeros(numEntries)
    otherVals = numpy.zeros(numEntries)
    self.W.take(ownVals, rows, cols)
    graph.W.take(otherVals, rows, cols)

    result = PySparseGraph(self.vList, self.undirected)
    result.W.put(ownVals * otherVals, rows, cols)
    return result
def __init__(self, kernel, tau1, tau2):
    """
    Store the kernel and the two tau parameters after validating them.

    :param kernel: The kernel object to use.
    :type kernel: :class:`AbstractKernel`

    :param tau1: Checked with Parameter.checkFloat against the bounds 0.0 and infinity.

    :param tau2: Checked with Parameter.checkFloat against the bounds 0.0 and infinity.
    """
    upperBound = float('inf')
    Parameter.checkFloat(tau1, 0.0, upperBound)
    Parameter.checkFloat(tau2, 0.0, upperBound)
    Parameter.checkClass(kernel, AbstractKernel)

    self.kernel = kernel
    self.tau1 = tau1
    self.tau2 = tau2
def __init__(self, kernelX, tau1, tau2):
    """
    Store the kernel and the two tau parameters after validating them.

    :param kernelX: The kernel object to use.
    :type kernelX: :class:`AbstractKernel`

    :param tau1: Checked with Parameter.checkFloat against the bounds 0.0 and 1.0.

    :param tau2: Checked with Parameter.checkFloat against the bounds 0.0 and 1.0.
    """
    for tau in (tau1, tau2):
        Parameter.checkFloat(tau, 0.0, 1.0)
    Parameter.checkClass(kernelX, AbstractKernel)

    self.tau1 = tau1
    self.tau2 = tau2
    self.kernelX = kernelX
def predict(self, X):
    """
    Return the scores computed by predictScores for the examples X.

    :param X: The examples to score.
    :type X: :class:`numpy.ndarray`

    :returns: The result of predictScores(X).
    """
    Parameter.checkClass(X, numpy.ndarray)
    return self.predictScores(X)
def evaluateLearn(X, y, idx, learnModel, predict, metricMethod, progress=True):
    """
    Evaluate a learning algorithm on a list of training/test splits. For
    each split the model is trained on the training part, predictions are
    made on the test part and metricMethod(predY, testY) is recorded.

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`

    :param y: A vector of labels
    :type y: :class:`ndarray`

    :param idx: A list of training/test splits
    :type idx: :class:`list`

    :param learnModel: A function such that learnModel(X, y) finds a mapping from X to y
    :type learnModel: :class:`function`

    :param predict: A function such that predict(X) makes predictions for X
    :type predict: :class:`function`

    :param metricMethod: A function such that metricMethod(predY, testY) returns the quality of predicted labels predY
    :type metricMethod: :class:`function`

    :param progress: Whether to print the iteration number for each split.
    :type progress: :class:`bool`

    :returns: A vector with the value of the metric for each split.

    :raises ValueError: If y is not 1 dimensional.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkArray(X, softCheck=True)
    Parameter.checkInt(X.shape[0], 1, float('inf'))
    Parameter.checkClass(y, numpy.ndarray)
    Parameter.checkArray(y, softCheck=True)

    if y.ndim != 1:
        raise ValueError("Dimention of y must be 1")

    numSplits = len(idx)
    metrics = numpy.zeros(numSplits)
    logging.debug("EvaluateLearn: Using " + str(numSplits) + " splits on " + str(X.shape[0]) + " examples")

    for i, (idxtr, idxts) in enumerate(idx):
        if progress:
            Util.printConciseIteration(i, 1, len(idx))

        trainX = X[idxtr, :]
        testX = X[idxts, :]
        trainY = y[idxtr]
        testY = y[idxts]

        learnModel(trainX, trainY)
        predY = predict(testX)
        gc.collect()

        metrics[i] = metricMethod(predY, testY)

    return metrics
def vectorStatistics(self, graph, treeStats=False, eigenStats=True):
    """
    Find a series of statistics for the given input graph which can be
    represented as vector values.

    :param graph: The graph to analyse.
    :type graph: :class:`AbstractMatrixGraph`

    :param treeStats: Whether to add the distributions of tree sizes and depths.
    :type treeStats: :class:`bool`

    :param eigenStats: Whether to compute the eigenvalue statistics; when False (or the graph has no vertices) they are empty arrays.

    :returns: A dict with keys "inDegreeDist", "outDegreeDist", "hopCount", "triangleDist", "maxEigVector" and "eigenDist". "componentsDist" is present only for an undirected graph with at least one component, and "treeSizesDist"/"treeDepthsDist" only when treeStats is set.
    """
    Parameter.checkClass(graph, AbstractMatrixGraph)
    Parameter.checkBoolean(treeStats)

    statsDict = {}

    statsDict["inDegreeDist"] = graph.inDegreeDistribution()
    statsDict["outDegreeDist"] = graph.degreeDistribution()

    logging.debug("Computing hop counts")
    P = graph.findAllDistances(False)
    statsDict["hopCount"] = graph.hopCount(P)

    logging.debug("Computing triangle count")
    if graph.getNumVertices() != 0:
        statsDict["triangleDist"] = numpy.bincount(graph.triangleSequence())
    else:
        statsDict["triangleDist"] = numpy.array([])

    #Get the distribution of component sizes
    logging.debug("Finding distribution of component sizes")
    if graph.isUndirected():
        components = graph.findConnectedComponents()
        if len(components) != 0:
            # Builtin int: the numpy.int alias was removed in NumPy 1.24
            componentSizes = numpy.array([len(c) for c in components], int)
            statsDict["componentsDist"] = numpy.bincount(componentSizes)

    if graph.getNumVertices() != 0 and eigenStats:
        logging.debug("Computing eigenvalues/vectors")
        W = graph.getWeightMatrix()
        #Make sure weight matrix is symmetric
        W = (W + W.T) / 2
        eigenDistribution, V = numpy.linalg.eig(W)
        i = numpy.argmax(eigenDistribution)
        statsDict["maxEigVector"] = V[:, i]
        # Positive eigenvalues only, largest first
        statsDict["eigenDist"] = numpy.flipud(numpy.sort(eigenDistribution[eigenDistribution > 0]))
        gc.collect()
    else:
        statsDict["maxEigVector"] = numpy.array([])
        statsDict["eigenDist"] = numpy.array([])

    if treeStats:
        logging.debug("Computing statistics on trees")
        trees = graph.findTrees()
        statsDict["treeSizesDist"] = numpy.bincount([len(x) for x in trees])
        treeDepths = [GraphUtils.treeDepth((graph.subgraph(x))) for x in trees]
        statsDict["treeDepthsDist"] = numpy.bincount(treeDepths)

    return statsDict
def __init__(self, alterRegressor, egoRegressor):
    """
    Store the two regressors. The alterRegressor must be a primal method,
    since the number of alters for each ego vary, and hence the dual vectors
    are not constant in size.

    :param alterRegressor: The regressor used for alters.
    :type alterRegressor: :class:`AbstractPredictor`

    :param egoRegressor: The regressor used for egos.
    :type egoRegressor: :class:`AbstractPredictor`
    """
    for regressor in (alterRegressor, egoRegressor):
        Parameter.checkClass(regressor, AbstractPredictor)

    self.egoRegressor = egoRegressor
    self.alterRegressor = alterRegressor
def predictScores(self, X):
    """
    Make predictions using the learnt tree by calling the R 'predict'
    function on the model returned by getModel().

    :param X: The examples to score.
    :type X: :class:`numpy.ndarray`

    :returns: The scores as a flattened numpy array.
    """
    Parameter.checkClass(X, numpy.ndarray)

    predictFunc = robjects.r['predict']
    dataFrame = self.baseLib.data_frame(X)
    rawScores = predictFunc(self.getModel(), dataFrame)
    scoreMatrix = self.baseLib.matrix(rawScores)

    return numpy.asarray(scoreMatrix).ravel()
def standardiseArray(self, X):
    """
    Standardise an array by applying centreArray followed by
    normaliseArray.

    :param X: The array to standardise.
    :type X: :class:`numpy.ndarray`

    :returns: The centred and normalised array.
    """
    Parameter.checkClass(X, numpy.ndarray)

    standardised = self.normaliseArray(self.centreArray(X))
    logging.debug("Standardised array of shape " + str(standardised.shape))

    return standardised
def predictROC(self, X, Y):
    """
    Make predictions using the learnt tree and compute the ROC curve via
    treeRankLib.getROC on a data frame built from X and Y.

    :param X: The examples.
    :type X: :class:`numpy.ndarray`

    :param Y: The labels.
    :type Y: :class:`numpy.ndarray`

    :returns: The ROC curve as a numpy array.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)

    dataFrame = self._getDataFrame(X, Y)
    rocCurve = self.treeRankLib.getROC(self.getModel(), dataFrame)

    return numpy.array(rocCurve)
def binaryError(testY, predY):
    """
    Work out the error on a set of labels: the fraction of entries at which
    testY and predY differ.

    :param testY: The true labels.
    :type testY: :class:`numpy.ndarray`

    :param predY: The predicted labels.
    :type predY: :class:`numpy.ndarray`

    :returns: The number of differing labels divided by the number of predictions.

    :raises ValueError: If testY and predY have different lengths along the first axis.
    """
    Parameter.checkClass(testY, numpy.ndarray)
    Parameter.checkClass(predY, numpy.ndarray)

    if testY.shape[0] != predY.shape[0]:
        raise ValueError("Labels vector much be same dimensions as predicted labels")

    numMistakes = numpy.sum(testY != predY)
    numPredictions = float(predY.shape[0])

    return numMistakes/numPredictions
def project(self, testX):
    """
    Project the examples onto the first k columns of self.U.

    :param testX: The examples to project given as rows of a matrix.
    :type testX: :class:`numpy.ndarray`

    :returns: The projected examples.
    """
    Parameter.checkClass(testX, numpy.ndarray)

    directions = self.U[:, :self.k]
    return numpy.dot(testX, directions)
def load(cls, filename):
    """
    Load the graph object from the corresponding file. Data is loaded in a
    zip format as created using save(). The archive is extracted into a
    temporary directory, which is removed again before returning whether or
    not loading succeeded.

    :param filename: The name of the file to load, without the .zip extension.
    :type filename: :class:`str`

    :returns: A graph corresponding to the one saved in filename.
    """
    Parameter.checkClass(filename, str)
    import shutil
    import zipfile

    (path, filename) = os.path.split(filename)
    if path == "":
        path = "./"

    tempPath = tempfile.mkdtemp()
    originalPath = os.getcwd()

    try:
        os.chdir(path)

        # The context manager closes the archive even if extraction fails
        with zipfile.ZipFile(filename + '.zip', 'r') as myzip:
            myzip.extractall(tempPath)

        # The loaders below are given file names relative to the extraction directory
        os.chdir(tempPath)

        #Deal with legacy files
        try:
            W = cls.loadMatrix(cls._wFilename)
            # NOTE(review): Util.loadPickle presumably unpickles the file, and
            # the vertex list class is looked up by a name stored in it -
            # only load archives from trusted sources.
            metaDict = Util.loadPickle(cls._metaFilename)
            vList = globals()[metaDict["vListType"]].load(cls._verticesFilename)
            undirected = metaDict["undirected"]
        except IOError:
            W = cls.loadMatrix(filename + cls._matExt)
            vList = VertexList.load(filename)
            undirected = Util.loadPickle(filename + cls._boolExt)

        graph = cls(vList, undirected)
        graph.W = W
    finally:
        os.chdir(originalPath)
        # Best-effort removal of the directory and everything extracted into
        # it; os.rmdir would fail on a non-empty directory and hide the
        # original error.
        shutil.rmtree(tempPath, ignore_errors=True)

    return graph
def setOutDegSequence(self, outDegSequence):
    """
    Set the (out)degree sequence of this object. Each entry is validated
    with Parameter.checkInt using the bounds 0 and the length of the
    sequence.

    :param outDegSequence: a vector of degrees for each vertex in the graph.
    :type outDegSequence: :class:`numpy.ndarray`

    :raises ValueError: If the sequence is not one dimensional.
    """
    Parameter.checkClass(outDegSequence, numpy.ndarray)

    if outDegSequence.ndim != 1:
        raise ValueError("Degree sequence must be one dimensional")

    degreeBounds = [0, outDegSequence.shape[0]]
    Parameter.checkList(outDegSequence, Parameter.checkInt, degreeBounds)

    self.outDegSequence = outDegSequence
def _getDataFrame(self, X, Y):
    """
    Create a DataFrame from numpy arrays X and Y. The columns of X come
    first, followed by those of Y, and the last column is named "class".

    :param X: The examples.
    :type X: :class:`numpy.ndarray`

    :param Y: The labels.
    :type Y: :class:`numpy.ndarray`

    :returns: The combined data frame.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)

    exampleFrame = self.baseLib.data_frame(robjects.vectors.Matrix(X))
    labelFrame = self.baseLib.data_frame(robjects.vectors.Matrix(Y))

    combined = exampleFrame.cbind(labelFrame)
    lastColumn = len(combined.names)-1
    combined.names[lastColumn] = "class"

    return combined
def vectorStatistics(self, graph, treeStats=False, eigenStats=True):
    """
    Find a series of statistics for the given input graph which can be
    represented as vector values.

    :param graph: The graph to analyse.
    :type graph: :class:`AbstractMatrixGraph`

    :param treeStats: Whether to add the distributions of tree sizes and depths.
    :type treeStats: :class:`bool`

    :param eigenStats: Whether to compute the eigenvalue statistics; when False (or the graph has no vertices) they are empty arrays.

    :returns: A dict with keys "inDegreeDist", "outDegreeDist", "hopCount", "triangleDist", "maxEigVector" and "eigenDist". "componentsDist" is present only for an undirected graph with at least one component, and "treeSizesDist"/"treeDepthsDist" only when treeStats is set.
    """
    Parameter.checkClass(graph, AbstractMatrixGraph)
    Parameter.checkBoolean(treeStats)

    statsDict = {}

    statsDict["inDegreeDist"] = graph.inDegreeDistribution()
    statsDict["outDegreeDist"] = graph.degreeDistribution()

    logging.debug("Computing hop counts")
    P = graph.findAllDistances(False)
    statsDict["hopCount"] = graph.hopCount(P)

    logging.debug("Computing triangle count")
    if graph.getNumVertices() != 0:
        statsDict["triangleDist"] = numpy.bincount(graph.triangleSequence())
    else:
        statsDict["triangleDist"] = numpy.array([])

    #Get the distribution of component sizes
    logging.debug("Finding distribution of component sizes")
    if graph.isUndirected():
        components = graph.findConnectedComponents()
        if len(components) != 0:
            # Builtin int: the numpy.int alias was removed in NumPy 1.24
            componentSizes = numpy.array([len(c) for c in components], int)
            statsDict["componentsDist"] = numpy.bincount(componentSizes)

    if graph.getNumVertices() != 0 and eigenStats:
        logging.debug("Computing eigenvalues/vectors")
        W = graph.getWeightMatrix()
        #Make sure weight matrix is symmetric
        W = (W + W.T)/2
        eigenDistribution, V = numpy.linalg.eig(W)
        i = numpy.argmax(eigenDistribution)
        statsDict["maxEigVector"] = V[:, i]
        # Positive eigenvalues only, largest first
        statsDict["eigenDist"] = numpy.flipud(numpy.sort(eigenDistribution[eigenDistribution > 0]))
        gc.collect()
    else:
        statsDict["maxEigVector"] = numpy.array([])
        statsDict["eigenDist"] = numpy.array([])

    if treeStats:
        logging.debug("Computing statistics on trees")
        trees = graph.findTrees()
        statsDict["treeSizesDist"] = numpy.bincount([len(x) for x in trees])
        treeDepths = [GraphUtils.treeDepth((graph.subgraph(x))) for x in trees]
        statsDict["treeDepthsDist"] = numpy.bincount(treeDepths)

    return statsDict
def setOutDegSequence(self, outDegSequence):
    '''
    Set the (out)degree sequence of this object. Each entry is validated
    with Parameter.checkInt using the bounds 0 and the length of the
    sequence.

    :param outDegSequence: a vector of degrees for each vertex in the graph.
    :type outDegSequence: :class:`numpy.ndarray`

    :raises ValueError: If the sequence is not one dimensional.
    '''
    Parameter.checkClass(outDegSequence, numpy.ndarray)

    if outDegSequence.ndim != 1:
        raise ValueError("Degree sequence must be one dimensional")

    degreeBounds = [0, outDegSequence.shape[0]]
    Parameter.checkList(outDegSequence, Parameter.checkInt, degreeBounds)

    self.outDegSequence = outDegSequence
def setVertexList(self, vList):
    """
    Assign a new VertexList object to this graph. The number of vertices in
    the VertexList must be the same as in the graph's current vertex list.

    :param vList: The new vertex list; it is checked against the VertexList class.
    :type vList: :class:`apgl.graph.VertexList`

    :raises ValueError: If vList has a different number of vertices.
    """
    Parameter.checkClass(vList, VertexList)

    sameSize = vList.getNumVertices() == self.vList.getNumVertices()
    if not sameSize:
        raise ValueError("Can only set to a VertexList with same number of vertices.")

    self.vList = vList
def save(self, filename):
    """
    Save the graph object to the corresponding filename under the .zip
    extension. The weight matrix, the vertex list and a small metadata
    dict (version, undirected flag and vertex list class name) are written
    to a temporary directory, zipped, and the zip file is moved to its
    destination. The temporary directory is removed before returning
    whether or not saving succeeded.

    :param filename: The name of the file to save, without the .zip extension.
    :type filename: :class:`str`

    :returns: The name of the saved zip file.
    """
    Parameter.checkClass(filename, str)
    import zipfile

    (path, filename) = os.path.split(filename)
    if path == "":
        path = "./"

    zipFilename = filename + '.zip'
    savedFilename = path + "/" + zipFilename
    # Resolve the destination while still in the caller's working directory;
    # after the chdir below a relative path would point inside tempPath.
    destination = os.path.join(os.path.abspath(path), zipFilename)

    tempPath = tempfile.mkdtemp()
    originalPath = os.getcwd()

    try:
        # The savers below are given file names relative to the temp directory
        os.chdir(tempPath)

        self.saveMatrix(self.W, self._wFilename)
        vListFilename = self.vList.save(self._verticesFilename)

        metaDict = {}
        metaDict["version"] = apgl.__version__
        metaDict["undirected"] = self.undirected
        metaDict["vListType"] = self.vList.__class__.__name__
        Util.savePickle(metaDict, self._metaFilename)

        # The context manager closes the archive even if a write fails
        with zipfile.ZipFile(zipFilename, 'w') as myzip:
            myzip.write(self._wFilename)
            myzip.write(vListFilename)
            myzip.write(self._metaFilename)

        shutil.move(zipFilename, destination)
    finally:
        os.chdir(originalPath)
        # Best-effort removal of the directory and any intermediate files;
        # os.rmdir would fail on a non-empty directory and hide the
        # original error.
        shutil.rmtree(tempPath, ignore_errors=True)

    return savedFilename
def summary(self, graph):
    """
    Compute a summary statistic on the input HIV graph.

    For every entry t of self.times the subgraph induced by
    graph.infectedIndsAt(t) is extracted, and its number of vertices and
    number of edges are recorded.

    :param graph: the graph to summarise.
    :type graph: :class:`HIVGraph`

    :returns: An array with one row per entry of self.times; column 0 holds the number of vertices and column 1 the number of edges of the subgraph.
    """
    Parameter.checkClass(graph, HIVGraph)

    numTimes = self.times.shape[0]
    summaryArray = numpy.zeros((numTimes, 2))

    for row in range(numTimes):
        infectedInds = graph.infectedIndsAt(self.times[row])
        infectedGraph = graph.subgraph(infectedInds)
        summaryArray[row, 0] = infectedGraph.getNumVertices()
        summaryArray[row, 1] = infectedGraph.getNumEdges()

    return summaryArray
def generate(self, graph, requireEmpty=True):
    """
    Create a Configuration Model graph. Note that the degree sequence(s)
    given in the constructor cannot be guaranteed. The algorithm randomly
    pairs free "spokes" and tries to connect them. If two vertices are
    already connected the corresponding spokes are not used again.

    In the case that requireEmpty is False then a non-empty graph can be
    used and the given degree sequence(s) is(are) the difference(s) in
    degrees between the output graph and input one.

    :param graph: a graph to populate with edges
    :type graph: :class:`apgl.graph.AbstractMatrixGraph`

    :param requireEmpty: if this is set to true then we require an empty graph.
    :type requireEmpty: :class:`bool`

    :returns: The modified input graph.

    :raises ValueError: if a non-empty graph is given while requireEmpty is set, if the number of vertices differs from the length of the degree sequence, or if an in-degree sequence is combined with an undirected graph.
    """
    Parameter.checkClass(graph, AbstractMatrixGraph)
    if requireEmpty and graph.getNumEdges() != 0:
        raise ValueError("Graph must have no edges")
    if graph.getNumVertices() != self.outDegSequence.shape[0]:
        raise ValueError(
            "Graph must have same number of vertices as degree sequence")

    # Compare against None by identity: "!=" / "==" on a numpy array is an
    # elementwise operation and cannot be used as a condition.
    hasInDegSequence = self.getInDegSequence() is not None

    if hasInDegSequence and graph.isUndirected():
        raise ValueError(
            "In-degree sequence must be used in conjunction with directed graphs"
        )

    if not hasInDegSequence:
        expandedInds = Util.expandIntArray(self.outDegSequence)
        numpy.random.shuffle(expandedInds)

        # Join consecutive pairs of shuffled spokes; with an odd number of
        # spokes the last one has no partner and is left unused.
        for i in range(0, len(expandedInds) - 1, 2):
            graph.addEdge(expandedInds[i], expandedInds[i + 1])
    else:
        expandedOutInds = Util.expandIntArray(self.outDegSequence)
        expandedInInds = Util.expandIntArray(self.inDegSequence)
        numpy.random.shuffle(expandedOutInds)
        numpy.random.shuffle(expandedInInds)

        # zip stops at the shorter list, so surplus spokes are ignored.
        for outInd, inInd in zip(expandedOutInds, expandedInInds):
            graph.addEdge(outInd, inInd)

    return graph
def expandIntArray(v):
    """
    Take a vector of non-negative integers and expand it into a vector in
    which index i is repeated v[i] times. For example, with
    v = [1, 3, 2, 4], the expanded vector is [0, 1, 1, 1, 2, 2, 3, 3, 3, 3].

    :param v: the vector of counts.
    :type v: :class:`numpy.ndarray`

    :returns: An integer array of length sum(v) containing the repeated indices in increasing order.
    """
    Parameter.checkClass(v, numpy.ndarray)
    Parameter.checkList(v, Parameter.checkInt, [0, float('inf')])

    # numpy.repeat replaces the manual fill loop and avoids the numpy.int
    # alias, which no longer exists in recent numpy releases.
    return numpy.repeat(numpy.arange(v.shape[0]), v)
def setVertex(self, index, value):
    """
    Overwrite the row of self.V belonging to one vertex.

    :param index: the index of the vertex to assign a value.
    :type index: :class:`int`

    :param value: the value to assign to the vertex.
    :type value: :class:`numpy.ndarray`

    :raises ValueError: if the length of value differs from the number of columns of self.V.
    """
    Parameter.checkIndex(index, 0, self.V.shape[0])
    Parameter.checkClass(value, numpy.ndarray)

    givenLength = value.shape[0]
    requiredLength = self.V.shape[1]
    if givenLength != requiredLength:
        raise ValueError("All vertices must be arrays of length " + str(self.V.shape[1]))

    self.V[index, :] = value
def setInDegSequence(self, inDegSequence):
    """
    Store the (in)degree sequence used by this object.

    The sequence has to be a one dimensional array with as many entries as
    the out-degree sequence already held by this object. Its entries are
    validated through Parameter.checkList using Parameter.checkInt and the
    bounds [0, number of entries].

    :param inDegSequence: a vector of degrees for each vertex in the graph.
    :type inDegSequence: :class:`numpy.ndarray`

    :raises ValueError: if the array is not one dimensional or its length differs from that of the out-degree sequence.
    """
    Parameter.checkClass(inDegSequence, numpy.ndarray)

    isVector = inDegSequence.ndim == 1
    if not isVector:
        raise ValueError("Degree sequence must be one dimensional")

    numEntries = inDegSequence.shape[0]
    if numEntries != self.outDegSequence.shape[0]:
        raise ValueError(
            "In-degree sequence must be same length as out-degree sequence"
        )

    Parameter.checkList(inDegSequence, Parameter.checkInt, [0, numEntries])

    self.inDegSequence = inDegSequence
def matrixPowerh(A, n):
    """
    Compute the matrix power of A using the exponent n. The computation
    evaluates the eigendecomposition of A and then powers the eigenvalues
    accordingly. This version assumes that A is hermitian.

    Eigenvalues whose magnitude is below 10**-10 are set to zero and are not
    raised to the power n.

    Warning: if at least one eigenvalue is negative, n should be an integer.

    :param A: a hermitian matrix.
    :type A: :class:`numpy.ndarray`

    :param n: the exponent.

    :returns: The matrix V diag(lmbda**n) V^H, where lmbda and V are the eigenvalues and eigenvectors returned by scipy.linalg.eigh.
    """
    Parameter.checkClass(A, numpy.ndarray)
    tol = 10**-10

    lmbda, V = scipy.linalg.eigh(A)

    # Split the spectrum once so that every eigenvalue is either zeroed or
    # powered (a value with magnitude exactly tol is powered).
    negligible = numpy.abs(lmbda) < tol
    significant = ~negligible
    lmbda[negligible] = 0
    lmbda[significant] = lmbda[significant]**n

    # eigh returns an orthonormal (unitary) basis, so the inverse of V is its
    # conjugate transpose. For real input this equals V.T; for complex
    # hermitian input the conjugation is required.
    return (V * lmbda).dot(V.conj().T)
def add(self, graph):
    """
    Sum the weight matrices of this graph and the input graph, which gives
    a union of the edges. Neither input is modified.

    :param graph: the input graph.
    :type graph: :class:`apgl.graph.DenseGraph`

    :returns: A new graph with same vertex list and addition of edge weights

    :raises ValueError: if the two graphs have a different number of vertices.
    """
    Parameter.checkClass(graph, DenseGraph)

    otherSize = graph.getNumVertices()
    ownSize = self.getNumVertices()
    if otherSize != ownSize:
        raise ValueError(
            "Can only add edges from graph with same number of vertices")

    summed = DenseGraph(self.vList, self.undirected)
    summed.W = self.W + graph.W
    return summed
def multiply(self, graph):
    """
    Multiply the edge weights of the input graph with the current one,
    element by element. Results in an intersection of the edges. Neither
    input is modified.

    :param graph: the input graph.
    :type graph: :class:`apgl.graph.DenseGraph`

    :returns: A new graph with the same vertex list whose edge weights are the products of the weights of the current graph and the input graph.

    :raises ValueError: if the two graphs have a different number of vertices.
    """
    Parameter.checkClass(graph, DenseGraph)
    if graph.getNumVertices() != self.getNumVertices():
        # The message previously talked about adding edges (copied from add).
        raise ValueError(
            "Can only multiply edges from graph with same number of vertices")

    newGraph = DenseGraph(self.vList, self.undirected)
    newGraph.W = self.W * graph.W
    return newGraph
def setWeightMatrix(self, W):
    """
    Replace the weight matrix of this graph. The new matrix must be a
    square ndarray with one row and one column per vertex of the vertex
    list, and it must be symmetric when the graph is undirected. Edges are
    represented by non-zero values.

    :param W: The weight matrix.
    :type W: :class:`ndarray`

    :raises ValueError: if W has the wrong shape, or is not symmetric for an undirected graph.
    """
    Parameter.checkClass(W, numpy.ndarray)

    numVertices = self.vList.getNumVertices()
    expectedShape = (numVertices, numVertices)
    if W.shape != expectedShape:
        raise ValueError("Weight matrix has wrong shape : " + str(W.shape))

    if self.undirected:
        asymmetric = (W != W.T).any()
        if asymmetric:
            raise ValueError("Weight matrix of undirected graph must be symmetric")

    self.W = W
def evaluate(self, X1, X2):
    """
    Evaluate the kernel between every row of X1 and every row of X2 as the
    matrix product of X1 with the transpose of X2. Rows are examples and
    both matrices need the same number of columns.

    :param X1: First set of examples.
    :type X1: :class:`numpy.ndarray`

    :param X2: Second set of examples.
    :type X2: :class:`numpy.ndarray`

    :returns: The product of X1 and the transpose of X2.

    :raises ValueError: if the number of columns of X1 and X2 differ.
    """
    Parameter.checkClass(X1, numpy.ndarray)
    Parameter.checkClass(X2, numpy.ndarray)

    numFeatures1 = X1.shape[1]
    numFeatures2 = X2.shape[1]
    if numFeatures1 != numFeatures2:
        shapes = str(X1.shape) + " " + str(X2.shape)
        raise ValueError("Invalid matrix dimentions: " + shapes)

    K = numpy.dot(X1, X2.T)
    return K
def checkClass(self):
    """
    Check that Parameter.checkClass accepts objects of the stated class and
    raises ValueError for objects of a different class.
    """
    vertexList = VertexList(10, 1)
    number = 2
    flag = True
    sparseGraph = SparseGraph(vertexList)

    # Matching (object, class) pairs must be accepted without an error.
    matching = [
        (vertexList, VertexList),
        (number, int),
        (flag, bool),
        (sparseGraph, SparseGraph),
    ]
    for obj, expectedClass in matching:
        Parameter.checkClass(obj, expectedClass)

    # Mismatching pairs must be rejected.
    mismatching = [
        (vertexList, SparseGraph),
        (number, VertexList),
    ]
    for obj, wrongClass in mismatching:
        self.assertRaises(ValueError, Parameter.checkClass, obj, wrongClass)
def generate2(self, graph, requireEmpty=True):
    """
    An alternative way of generating random edges which might work better
    than generate. A dense random matrix is thresholded with self.p to
    obtain a 0/1 matrix, which is then handed to graph.setWeightMatrix as a
    sparse CSR matrix of floats.

    For undirected graphs only the strict upper triangle is kept and
    mirrored, so the result is symmetric and has no self edges.

    :param graph: a graph to populate with edges
    :type graph: :class:`apgl.graph.AbstractMatrixGraph`

    :param requireEmpty: if this is set to true then we require an empty graph.
    :type requireEmpty: :class:`bool`

    :returns: The modified input graph.

    :raises ValueError: if requireEmpty is set and the graph already has edges.
    """
    Parameter.checkClass(graph, AbstractMatrixGraph)
    if requireEmpty and graph.getNumEdges() != 0:
        raise ValueError("Graph must have no edges")

    numVertices = graph.getNumVertices()

    # Builtin int/float are used because the numpy.int / numpy.float aliases
    # have been removed from recent numpy releases.
    W = numpy.random.rand(numVertices, numVertices) < self.p
    W = numpy.array(W, int)

    if graph.isUndirected():
        W = numpy.triu(W, 1)
        W = W + W.T

    graph.setWeightMatrix(scipy.sparse.csr_matrix(W, dtype=float))
    return graph
def generate(self, graph):
    """
    Create a random graph using the input graph according to the
    Barabasi-Albert model. Note that the input graph is modified.

    Vertices self.ell, ..., numVertices - 1 are processed in order, and each
    of them is connected to self.m vertices drawn from a list in which every
    vertex appears once per unit of degree (preferential attachment).

    :param graph: the empty input graph.
    :type graph: :class:`apgl.graph.AbstractMatrixGraph`

    :returns: The modified input graph.

    :raises ValueError: if the graph already has edges.
    """
    Parameter.checkClass(graph, AbstractMatrixGraph)
    if graph.getNumEdges() != 0:
        raise ValueError("Graph must have no edges")

    #Keep a list of node indices with degrees
    #First start off with ell vertices assume they each have degree 1, without
    #adding edges. This is a bit weird but seems the way to do it.
    vertexList = list(range(0, self.ell))

    #Now perform preferential attachment, making sure we add m edges at each
    #iteration.
    for i in range(self.ell, graph.getNumVertices()):
        #Visit the degree-weighted candidates in a random order
        perm = numpy.random.permutation(len(vertexList))
        numEdgesAdded = 0
        j = 0

        while numEdgesAdded != self.m:
            vertexIndex = vertexList[perm[j]]

            #Only connect to a candidate that is not already a neighbour of i
            if graph.getEdge(i, vertexIndex) is None:
                graph.addEdge(i, vertexIndex)
                #Both end points gained one unit of degree
                vertexList.append(i)
                vertexList.append(vertexIndex)
                numEdgesAdded += 1

            j += 1

    return graph
def generate(self, graph, requireEmpty=True):
    """
    Create an Erdos-Renyi graph from the given input graph.

    A random sparse matrix with density self.p is drawn and all of its
    stored entries are set to 1. For undirected graphs the strict upper
    triangle is mirrored to obtain a symmetric matrix. Self edges are kept
    only if self.selfEdges is set. If requireEmpty is False the weight
    matrix already held by the graph is added to the generated one.

    :param graph: an empty graph to populate with edges
    :type graph: :class:`apgl.graph.AbstractMatrixGraph`

    :param requireEmpty: whether to allow non empty graphs.
    :type requireEmpty: :class:`bool`

    :returns: The modified input graph.

    :raises ValueError: if requireEmpty is set and the graph already has edges.
    """
    Parameter.checkClass(graph, AbstractMatrixGraph)
    if requireEmpty and graph.getNumEdges() != 0:
        raise ValueError("Graph must have no edges")

    numVertices = graph.getNumVertices()

    #This function seems slightly weird- sometimes the last cols are empty
    W = scipy.sparse.rand(numVertices, numVertices, self.p, format="csr")
    #Turn the random values into unit weights. Dividing the sparse matrix by
    #itself is avoided: it also divides the implicit zeros (0/0) and gives a
    #dense result containing NaN.
    W.data[:] = 1.0

    if graph.isUndirected():
        diagW = W.diagonal()
        W = scipy.sparse.triu(W, 1)
        W = W + W.T

        if self.selfEdges:
            W.setdiag(diagW)

    if not self.selfEdges:
        W.setdiag(numpy.zeros(numVertices))

    if not requireEmpty:
        W = W + graph.getWeightMatrix()

    graph.setWeightMatrix(W)
    return graph
def add(self, graph):
    """
    Combine the edge weights of this graph with those of the input graph,
    which gives a union of the edges. The result is a new graph; the weight
    matrix of this graph is copied before it is changed.

    :param graph: the input graph.
    :type graph: :class:`apgl.graph.PySparseGraph`

    :returns: A new graph with same vertex list and addition of edge weights

    :raises ValueError: if the graphs differ in their number of vertices or in whether they are undirected.
    """
    Parameter.checkClass(graph, PySparseGraph)

    otherSize = graph.getNumVertices()
    ownSize = self.getNumVertices()
    if otherSize != ownSize:
        raise ValueError(
            "Can only add edges from graph with same number of vertices")

    directionMismatch = self.undirected != graph.undirected
    if directionMismatch:
        raise ValueError(
            "Both graphs must be either undirected or directed")

    summed = PySparseGraph(self.vList, self.undirected)
    summed.W = self.W.copy()
    # NOTE(review): shift(1, graph.W) presumably adds 1 * graph.W to the
    # copy in place - confirm against the sparse matrix library in use.
    summed.W.shift(1, graph.W)
    return summed