def profileDot2(self):
    density = 0.01
    m = 10000
    n = 10000
    a_sppy = sppy.rand((m, n), density, storagetype='row')
    a_sppy_T = sppy.csarray(a_sppy.T, storagetype="col")

    ProfileUtils.profile('a_sppy.dot(a_sppy_T)', globals(), locals())
def profileDot(self):
    #Create random sparse matrix and numpy array
    #Test speed of array creation
    numpy.random.seed(21)
    m = 1000000
    n = 1000000
    numInds = 10000000
    inds = numpy.random.randint(0, m*n, numInds)
    inds = numpy.unique(inds)
    vals = numpy.random.randn(inds.shape[0])
    rowInds, colInds = numpy.unravel_index(inds, (m, n), order="F")
    rowInds = numpy.array(rowInds, numpy.int32)
    colInds = numpy.array(colInds, numpy.int32)

    A = csarray((m, n), storageType="rowMajor")
    A.put(vals, rowInds, colInds, True)
    A.compress()

    p = 500
    W = numpy.random.rand(n, p)

    ProfileUtils.profile('A.dot(W)', globals(), locals())

    #Compare versus scipy
    #B = scipy.sparse.csc_matrix((vals, (rowInds, colInds)), (m, n))
    #ProfileUtils.profile('B.dot(W)', globals(), locals())

    #Compare versus pdot
    ProfileUtils.profile('A.pdot(W)', globals(), locals())
def profileModelSelect(self):
    lmbdas = numpy.linspace(1.0, 0.01, 5)

    softImpute = IterativeSoftImpute(k=500)
    folds = 5
    cvInds = Sampling.randCrossValidation(folds, self.X.nnz)
    ProfileUtils.profile('softImpute.modelSelect(self.X, lmbdas, cvInds)', globals(), locals())
def profileObjective(self):
    k = 10
    U = numpy.random.rand(self.m, k)
    V = numpy.random.rand(self.n, k)

    indPtr, colInds = SparseUtils.getOmegaListPtr(self.X)
    colIndsProbabilities = numpy.ones(colInds.shape[0])

    for i in range(self.m):
        colIndsProbabilities[indPtr[i]:indPtr[i+1]] /= colIndsProbabilities[indPtr[i]:indPtr[i+1]].sum()
        colIndsProbabilities[indPtr[i]:indPtr[i+1]] = numpy.cumsum(colIndsProbabilities[indPtr[i]:indPtr[i+1]])

    r = numpy.zeros(self.m)
    lmbda = 0.001
    rho = 1.0
    numAucSamples = 100

    def run():
        numRuns = 10
        for i in range(numRuns):
            objectiveApprox(indPtr, colInds, indPtr, colInds, U, V, r, numAucSamples, lmbda, rho, False)

    ProfileUtils.profile('run()', globals(), locals())
def profileDerivativeUiApprox(self):
    k = 10
    U = numpy.random.rand(self.m, k)
    V = numpy.random.rand(self.n, k)

    indPtr, colInds = SparseUtils.getOmegaListPtr(self.X)

    gp = numpy.random.rand(self.n)
    gp /= gp.sum()
    gq = numpy.random.rand(self.n)
    gq /= gq.sum()

    j = 3
    numRowSamples = 100
    numAucSamples = 10

    permutedRowInds = numpy.array(numpy.random.permutation(self.m), numpy.uint32)
    permutedColInds = numpy.array(numpy.random.permutation(self.n), numpy.uint32)

    maxLocalAuc = MaxLocalAUC(k, w=0.9)
    normGp, normGq = maxLocalAuc.computeNormGpq(indPtr, colInds, gp, gq, self.m)

    lmbda = 0.001
    normalise = True

    learner = MaxLocalAUCCython()

    def run():
        numRuns = 10
        for j in range(numRuns):
            for i in range(self.m):
                learner.derivativeUiApprox(indPtr, colInds, U, V, gp, gq, permutedColInds, i)

    ProfileUtils.profile("run()", globals(), locals())
def profileLearnModel(self):
    #Profile full gradient descent
    X, U, V = DatasetUtils.syntheticDataset1(u=0.01, m=1000, n=2000)
    #X, U, V = DatasetUtils.syntheticDataset1()
    #X, U, V = DatasetUtils.syntheticDataset1(u=0.2, sd=0.2)
    #X = DatasetUtils.flixster()

    u = 0.2
    w = 1 - u
    eps = 10**-6
    alpha = 0.5
    maxLocalAuc = MaxLocalAUC(self.k, w, alpha=alpha, eps=eps, stochastic=True)
    maxLocalAuc.maxNormU = 10
    maxLocalAuc.maxNormV = 10
    maxLocalAuc.maxIterations = 100
    maxLocalAuc.initialAlg = "rand"
    maxLocalAuc.rate = "constant"
    maxLocalAuc.parallelSGD = True
    maxLocalAuc.numProcesses = 8
    maxLocalAuc.numAucSamples = 10
    maxLocalAuc.numRowSamples = 30
    maxLocalAuc.scaleAlpha = False
    maxLocalAuc.loss = "hinge"
    maxLocalAuc.validationUsers = 0.0

    print(maxLocalAuc)

    ProfileUtils.profile('maxLocalAuc.learnModel(X)', globals(), locals())
def profileRunExperiment(self):
    def run():
        dataArgs = argparse.Namespace()
        dataArgs.maxIter = 3
        #Set iterStartDate to None for all iterations
        #dataArgs.iterStartTimeStamp = None
        dataArgs.iterStartTimeStamp = time.mktime(datetime(2005, 1, 1).timetuple())
        generator = MovieLensDataset(maxIter=dataArgs.maxIter, iterStartTimeStamp=dataArgs.iterStartTimeStamp)

        defaultAlgoArgs = argparse.Namespace()
        defaultAlgoArgs.ks = numpy.array(2**numpy.arange(6, 7, 0.5), numpy.int)
        defaultAlgoArgs.svdAlgs = ["rsvd"]
        defaultAlgoArgs.runSoftImpute = True

        dataParser = argparse.ArgumentParser(description="", add_help=False)
        dataParser.add_argument("-h", "--help", action="store_true", help="show this help message and exit")
        devNull, remainingArgs = dataParser.parse_known_args(namespace=dataArgs)

        dataArgs.extendedDirName = ""
        dataArgs.extendedDirName += "MovieLensDataset"

        recommendExpHelper = RecommendExpHelper(generator.getTrainIteratorFunc, generator.getTestIteratorFunc, remainingArgs, defaultAlgoArgs, dataArgs.extendedDirName)
        recommendExpHelper.printAlgoArgs()

        # os.makedirs(resultsDir, exist_ok=True) # for python 3.2
        try:
            os.makedirs(recommendExpHelper.resultsDir)
        except OSError as err:
            if err.errno != errno.EEXIST:
                raise

        recommendExpHelper.runExperiment()

    ProfileUtils.profile('run()', globals(), locals())
def profileEigpsd(self):
    n = 1000
    p = 0.1
    L = scipy.sparse.rand(n, n, p)
    L = L.T.dot(L)

    cols = 500
    ProfileUtils.profile('Nystrom.eigpsd(L, cols)', globals(), locals())
def profilePutPySparse(self):
    def runPut():
        A = spmatrix.ll_mat(self.N, self.N)
        for i in range(self.k):
            A.put(self.val, self.rowInds, self.colInds)

    ProfileUtils.profile('runPut()', globals(), locals())
def profileSvd(self):
    n = 5000
    p = 0.1
    L = scipy.sparse.rand(n, n, p)
    L = L.T.dot(L)

    k = 50
    q = 2
    ProfileUtils.profile('RandomisedSVD.svd(L, k, q)', globals(), locals())
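def profileSvdsScipy(self):
    # Hedged comparison (not part of the original profilers): profile scipy's
    # built-in sparse SVD on the same kind of matrix as profileSvd above, as a
    # rough baseline for RandomisedSVD.svd. scipy.sparse.linalg.svds is standard
    # scipy; the function name and setup here mirror profileSvd and are assumptions.
    n = 5000
    p = 0.1
    L = scipy.sparse.rand(n, n, p)
    L = L.T.dot(L)

    k = 50
    ProfileUtils.profile('scipy.sparse.linalg.svds(L, k)', globals(), locals())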
def profileGenerateSparseBinaryMatrixPL(self):
    m = 500
    n = 200
    k = 10
    density = 0.2
    numpy.random.seed(21)
    #X = SparseUtils.generateSparseBinaryMatrixPL((m,n), k, density=density, csarray=True)
    ProfileUtils.profile('SparseUtilsCython.generateSparseBinaryMatrixPL((m,n), k, density=density, csarray=True)', globals(), locals())
def profileMC2(self):
    numVals = 10000
    list1 = numpy.random.permutation(numVals).tolist()
    list2 = numpy.random.permutation(numVals).tolist()

    lists = [list1, list2]
    itemList = numpy.arange(numVals).tolist()

    ProfileUtils.profile('RankAggregator.MC2(lists, itemList)', globals(), locals())
def profileModelSelection(self):
    dataset = ArnetMinerDataset(runLSI=False)
    dataset.overwrite = True
    dataset.overwriteVectoriser = True
    dataset.overwriteModel = True
    dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"

    ProfileUtils.profile('dataset.modelSelection()', globals(), locals())
def profilePartialReconstructValsPQ(self):
    shape = 5000, 10000
    r = 100
    U, s, V = SparseUtils.generateLowRank(shape, r)

    k = 1000000
    inds = numpy.unravel_index(numpy.random.randint(0, shape[0]*shape[1], k), dims=shape)

    ProfileUtils.profile('SparseUtilsCython.partialReconstructValsPQ(inds[0], inds[1], U, V)', globals(), locals())
def profilePut2(self):
    def runPut():
        for i in range(self.k):
            A = csarray((self.N, self.N))
            #A[(self.rowInds, self.colInds)] = self.val
            A.put(self.val, self.rowInds, self.colInds)

    ProfileUtils.profile('runPut()', globals(), locals())
def profileGreedyMethod2(self):
    n = 1000
    p = 0.1
    graph = igraph.Graph.Erdos_Renyi(n, p)
    print(graph.summary())

    k = 5
    numpy.random.seed(21)
    ProfileUtils.profile('MaxInfluence.greedyMethod2(graph, k, p=0.5, numRuns=1000)', globals(), locals())
def profileSliceSpa(self):
    A = csarray((self.N, self.N))
    A.put(self.val, self.rowInds, self.colInds)

    def runSlice():
        for i in range(10):
            sliceInds = numpy.array(numpy.random.randint(0, self.M, self.N), dtype=numpy.int)
            B = A[:, sliceInds]

    ProfileUtils.profile('runSlice()', globals(), locals())
def profileSumPys(self):
    A = spmatrix.ll_mat(self.N, self.N)
    A.put(self.val, self.rowInds, self.colInds)

    def runSum():
        for i in range(1000):
            i = PySparseUtils.sum(A)
        print(i)

    ProfileUtils.profile('runSum()', globals(), locals())
def profileRandomChoice(self):
    a = numpy.random.randint(0, 10, 100)
    b = numpy.random.rand(100)
    b /= b.sum()

    def run():
        for i in range(10000):
            numpy.random.choice(a, 1000, p=b)

    ProfileUtils.profile('run()', globals(), locals())
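def profileSearchsortedChoice(self):
    # Hedged alternative (not in the original code): weighted sampling via a
    # cumulative distribution and numpy.searchsorted, the same cumsum trick used
    # by profileObjective above. The setup mirrors profileRandomChoice; this
    # function name is illustrative, not an existing profiler.
    a = numpy.random.randint(0, 10, 100)
    b = numpy.random.rand(100)
    b /= b.sum()
    cumB = numpy.cumsum(b)

    def run():
        for i in range(10000):
            # Map uniform variates through the empirical CDF; the clip guards
            # against the last cumulative value falling just below 1 due to rounding.
            inds = numpy.searchsorted(cumB, numpy.random.rand(1000))
            a[numpy.clip(inds, 0, a.shape[0] - 1)]

    ProfileUtils.profile('run()', globals(), locals())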
def profileSumSpa(self):
    A = csarray((self.N, self.N))
    A.put(self.val, self.rowInds, self.colInds)

    def runSum():
        for i in range(1000):
            i = A.sum()
        print(i)

    ProfileUtils.profile('runSum()', globals(), locals())
def profileClusterFromIterator(self):
    iterator = IncreasingSubgraphListIterator(self.graph, self.subgraphIndicesList)
    dataDir = PathDefaults.getDataDir() + "cluster/"
    #iterator = getBemolGraphIterator(dataDir)

    def run():
        clusterList, timeList, boundList = self.clusterer.clusterFromIterator(iterator, verbose=True)
        print(timeList.cumsum(0))

    ProfileUtils.profile('run()', globals(), locals())
def profileRowSlice(self):
    numpy.random.seed(21)
    m = 100000
    n = 1000000
    #numInds = 10000000

    X = sppy.rand((m, n), density=0.001, storagetype="row")

    #ProfileUtils.profile('X[0:1000, :]', globals(), locals())
    ProfileUtils.profile('X.submatrix(0, 0, 1000, n)', globals(), locals())
def profileComputeLDA(self):
    field = "Boosting"
    dataset = ArnetMinerDataset(field)
    dataset.overwrite = True
    dataset.overwriteVectoriser = True
    dataset.overwriteModel = True
    dataset.maxRelevantAuthors = 100
    dataset.k = 200
    dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"

    ProfileUtils.profile('dataset.computeLDA()', globals(), locals())
def profileGetNonZerosSpa(self):
    A = csarray((self.N, self.N))
    A.put(self.val, self.rowInds, self.colInds)

    def runNonZeros():
        for i in range(1000):
            rows, cols = A.nonzero()
            vals = A[rows, cols]
        print(numpy.sum(vals))

    ProfileUtils.profile('runNonZeros()', globals(), locals())
def profileGetOmegaList(self):
    shape = (20000, 15000)
    r = 50
    k = 1000000

    X = SparseUtils.generateSparseLowRank(shape, r, k)

    import sppy
    X = sppy.csarray(X)

    ProfileUtils.profile('SparseUtils.getOmegaList(X)', globals(), locals())
def profileGetNonZerosPys(self):
    A = spmatrix.ll_mat(self.N, self.N)
    A.put(self.val, self.rowInds, self.colInds)

    def runNonZeros():
        for i in range(1000):
            (rows, cols) = PySparseUtils.nonzero(A)
            nzVals = numpy.zeros(len(rows))
            A.take(nzVals, rows, cols)

    ProfileUtils.profile('runNonZeros()', globals(), locals())
def profileTrainIterator(self):
    def run():
        dataset = NetflixDataset(maxIter=30)
        trainIterator = dataset.getTrainIteratorFunc()

        for trainX in trainIterator:
            print(trainX.shape)

    ProfileUtils.profile('run()', globals(), locals())
def profileSubmatrix(self):
    shape = (100000, 15000)
    r = 50
    k = 5000000

    X = SparseUtils.generateSparseLowRank(shape, r, k)
    print(X.nnz, type(X))

    inds = numpy.random.permutation(X.nnz)[0:1000000]

    ProfileUtils.profile("SparseUtils.submatrix(X, inds)", globals(), locals())
def profileSimulateCascades(self):
    n = 500
    p = 0.1
    graph = igraph.Graph.Erdos_Renyi(n, p)

    k = 50
    activeVertices = set(numpy.random.randint(0, n, 10))
    numRuns = 100

    ProfileUtils.profile('MaxInfluence.simulateCascades(graph, activeVertices, numRuns, p=0.5)', globals(), locals())
def profileRandom2Choice(self):
    n = 1000
    m = 1000
    V = numpy.random.rand(n, 2)

    def runRandom2Choice():
        reps = 100
        for i in range(reps):
            Util.randomChoice(V, m)

    ProfileUtils.profile('runRandom2Choice()', globals(), locals())
def profileInverseChoice(self):
    n = 100000
    v = numpy.array(numpy.random.choice(n, 100), numpy.int32)
    v = numpy.sort(v)

    def run():
        numRuns = 2000000
        for i in range(numRuns):
            inverseChoicePy(v, n)

    ProfileUtils.profile('run()', globals(), locals())
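def profileInverseChoiceNumpy(self):
    # Hedged baseline (not in the original code): assuming inverseChoicePy draws
    # an index from [0, n) that does not appear in the sorted array v, this pure
    # numpy version materialises the complement with setdiff1d and samples from
    # it. It is expected to be far slower than the Cython routine above, so the
    # run count is reduced; the function name is illustrative only.
    n = 100000
    v = numpy.array(numpy.random.choice(n, 100), numpy.int32)
    v = numpy.sort(v)

    def run():
        numRuns = 1000
        for i in range(numRuns):
            complement = numpy.setdiff1d(numpy.arange(n), v)
            numpy.random.choice(complement)

    ProfileUtils.profile('run()', globals(), locals())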
def profileLearnModel(self, useProfiler=True, eps=10**(-6)):
    k = 100
    lmbda = 0.001
    tmax = 10**7
    gamma = 1

    learner = SGDNorm2Reg(k, lmbda, eps, tmax)

    if useProfiler:
        ProfileUtils.profile('learner.learnModel(self.X, storeAll=False)', globals(), locals())
    else:
        learner.learnModel(self.X, storeAll=False)
def profileParallelSparseOp2(self):
    L = LinOperatorUtils.parallelSparseOp(self.X)

    p = 300
    W = numpy.random.rand(self.X.shape[1], p)

    def run():
        numRuns = 1
        for i in range(numRuns):
            L.matmat(W)

    ProfileUtils.profile('run()', globals(), locals())
def profileDecisionTreeRegressor(self):
    numExamples = 1000
    numFeatures = 20
    minSplit = 10
    maxDepth = 20

    generator = ExamplesGenerator()
    X, y = generator.generateBinaryExamples(numExamples, numFeatures)

    regressor = DecisionTreeRegressor(min_split=minSplit, max_depth=maxDepth, min_density=0.0)
    ProfileUtils.profile('regressor.fit(X, y)', globals(), locals())
def profileAsLinearOperator2(self):
    L = scipy.sparse.linalg.aslinearoperator(self.X)

    p = 300
    W = numpy.random.rand(self.X.shape[1], p)

    def run():
        numRuns = 1
        for i in range(numRuns):
            L.matmat(W)

    ProfileUtils.profile('run()', globals(), locals())
def profileRestrictOmega(self):
    X, U, V = DatasetUtils.syntheticDataset1(u=0.01, m=1000, n=2000)
    m, n = X.shape
    indPtr, colInds = SparseUtils.getOmegaListPtr(X)

    colIndsSubset = numpy.random.choice(n, 500, replace=False)

    def run():
        for i in range(100):
            newIndPtr, newColInds = restrictOmega(indPtr, colInds, colIndsSubset)

    ProfileUtils.profile('run()', globals(), locals())
def profileParallelSparseOp(self):
    L = LinOperatorUtils.parallelSparseOp(self.X)

    def run():
        numRuns = 10
        for i in range(numRuns):
            p = numpy.random.rand(self.X.shape[0])
            q = numpy.random.rand(self.X.shape[1])
            L.matvec(q)
            L.rmatvec(p)

    ProfileUtils.profile('run()', globals(), locals())
def profileAsLinearOperator(self):
    L = scipy.sparse.linalg.aslinearoperator(self.X)

    def run():
        numRuns = 10
        for i in range(numRuns):
            p = numpy.random.rand(self.X.shape[0])
            q = numpy.random.rand(self.X.shape[1])
            L.matvec(q)
            L.rmatvec(p)

    ProfileUtils.profile('run()', globals(), locals())
def profileRandomChoice(self):
    n = 10000
    m = 1000
    maxInt = 20
    v = numpy.random.randint(0, maxInt, n)

    def runRandomChoice():
        reps = 10000
        for i in range(reps):
            Util.randomChoice(v, m)

    ProfileUtils.profile('runRandomChoice()', globals(), locals())
def profileEigenRemove(self):
    k = 50
    n = 1000
    X = numpy.random.rand(n, n)
    m = 900

    XX = X.dot(X.T)
    self.omega, self.Q = numpy.linalg.eig(XX)

    def runEigenRemove():
        for i in range(10):
            EigenUpdater.eigenRemove(self.omega, self.Q, m, k)

    ProfileUtils.profile('runEigenRemove()', globals(), locals())
def profileEigenConcat(self):
    k = 10
    n = 1000
    m = 100
    X = numpy.random.rand(n, n)
    XX = X.dot(X.T)

    self.AA = XX[0:m, 0:m]
    self.AB = XX[0:m, m:]
    self.BB = XX[m:, m:]

    self.omega, self.Q = numpy.linalg.eig(self.AA)

    ProfileUtils.profile('EigenUpdater.eigenConcat(self.omega, self.Q, self.AB, self.BB, k)', globals(), locals())
def profileFitDiscretePowerLaw(self):
    #Test with a large vector x
    alpha = 2.5
    exponent = (1 / (alpha - 1))
    numPoints = 50000
    #Inverse transform sampling: uniform variates raised to -1/(alpha - 1) follow a power law with exponent alpha
    x = 10 * numpy.random.rand(numPoints)**-exponent
    x = numpy.array(numpy.round(x), numpy.int)
    x = x[x <= 500]
    x = x[x >= 1]

    xmins = numpy.arange(1, 20)

    ProfileUtils.profile('Util.fitDiscretePowerLaw(x, xmins)', globals(), locals())
def profileLearnModel(self):
    #Profile full gradient descent
    u = 0.2
    w = 1 - u
    eps = 10**-6
    alpha = 0.5

    learner = BprRecommender(self.k)
    learner.maxIterations = 10
    learner.recordStep = 10
    learner.numAucSamples = 5

    print(learner)
    print(self.X.nnz)

    ProfileUtils.profile('learner.learnModel(self.X)', globals(), locals())
def profileAltRandomChoice(self):
    n = 10000
    m = 1000
    maxInt = 20
    v = numpy.random.randint(0, maxInt, n)

    def runRandomChoice():
        #can just do non-zero entries
        w = Util.expandIntArray(v)
        reps = 10000
        for i in range(reps):
            w[numpy.random.randint(0, w.shape[0])]

    ProfileUtils.profile('runRandomChoice()', globals(), locals())
def profileLearnModel(self):
    numExamples = 1000
    numFeatures = 50
    minSplit = 10
    maxDepth = 20

    generator = ExamplesGenerator()
    X, y = generator.generateBinaryExamples(numExamples, numFeatures)
    y = numpy.array(y, numpy.float)

    learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth, pruneType="REP-CV")
    #learner.learnModel(X, y)
    #print("Done")
    ProfileUtils.profile('learner.learnModel(X, y)', globals(), locals())
    print(learner.getTree().getNumVertices())
def learnModel(self, X, Y):
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkArray(X)
    Parameter.checkArray(Y)

    if numpy.unique(Y).shape[0] < 2:
        raise ValueError("Vector of labels must be binary, currently numpy.unique(Y) = " + str(numpy.unique(Y)))

    #If Y is 1D make it 2D
    if Y.ndim == 1:
        Y = numpy.array([Y]).T

    XY = self._getDataFrame(X, Y)
    formula = robjects.Formula('class ~ .')
    self.learnModelDataFrame(formula, XY)

    gc.collect()
    robjects.r('gc(verbose=TRUE)')
    robjects.r('memory.profile()')
    gc.collect()

    if self.printMemStats:
        logging.debug(self.getLsos()())
        logging.debug(ProfileUtils.memDisplay(locals()))
def profileEigenAdd2(self):
    k = 10
    n = 1000
    m = 200
    X = numpy.random.rand(n, n)
    Y = numpy.random.rand(n, m)
    XX = X.dot(X.T)

    self.omega, self.Q = numpy.linalg.eig(XX)

    def runEigenAdd2():
        for i in range(10):
            EigenUpdater.eigenAdd2(self.omega, self.Q, Y, Y, k)

    ProfileUtils.profile('runEigenAdd2()', globals(), locals())
def profileModelSelect(self):
    learner = LibSVM()

    numExamples = 10000
    numFeatures = 10
    X = numpy.random.rand(numExamples, numFeatures)
    Y = numpy.array(numpy.random.rand(numExamples) < 0.1, numpy.int) * 2 - 1

    def run():
        for i in range(5):
            print("Iteration " + str(i))
            idx = Sampling.crossValidation(self.folds, numExamples)
            learner.parallelModelSelect(X, Y, idx, self.paramDict)

    ProfileUtils.profile('run()', globals(), locals())
def profilePredict(self):
    #Make the prediction function faster
    numExamples = 1000
    numFeatures = 20
    minSplit = 1
    maxDepth = 20

    generator = ExamplesGenerator()
    X, y = generator.generateBinaryExamples(numExamples, numFeatures)
    learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth)
    learner.learnModel(X, y)
    print(learner.getTree().getNumVertices())

    ProfileUtils.profile('learner.predict(X)', globals(), locals())

    print(learner.getTree().getNumVertices())
def profileShuffleSplitRows(self):
    m = 10000
    n = 5000
    k = 5
    u = 0.1
    w = 1 - u
    X, U, s, V = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)

    k2 = 10
    testSize = 2
    ProfileUtils.profile('Sampling.shuffleSplitRows(X, k2, testSize)', globals(), locals())