def __loadLeafRanks(self):
    """
    Source the R scripts Util.R and MSLeafRanks.R (in that order) into the
    R session exposed through robjects.
    """
    scriptSuffixes = ("/apgl/metabolomics/R/Util.R", "/apgl/metabolomics/R/MSLeafRanks.R")
    # Resolve both paths first, then source them, as the original did.
    scriptPaths = [PathDefaults.getSourceDir() + suffix for suffix in scriptSuffixes]

    for scriptPath in scriptPaths:
        robjects.r["source"](scriptPath)
def __init__(self, field):
    """
    Record the input/output file locations and processing constants for the
    given field, then clean the DBLP XML and match the experts.

    :param field: name of the sub-directory of "reputation/" (inside the data
        directory) in which the files for this field live.
    """
    numpy.random.seed(21)

    # Raw and cleaned DBLP dumps.
    dblpDir = PathDefaults.getDataDir() + "dblp/"
    self.xmlFileName = dblpDir + "dblp.xml"
    self.xmlCleanFilename = dblpDir + "dblpClean.xml"

    # Per-field files, all stored under reputation/<field>/.
    fieldDir = PathDefaults.getDataDir() + "reputation/" + field + "/"
    fieldFiles = (
        ("expertsFileName", "experts.txt"),
        ("expertMatchesFilename", "experts_matches.csv"),
        ("trainExpertMatchesFilename", "experts_train_matches.csv"),
        ("testExpertMatchesFilename", "experts_test_matches.csv"),
        ("coauthorsFilename", "coauthors.csv"),
        ("publicationsFilename", "publications.csv"),
    )
    for attributeName, baseName in fieldFiles:
        setattr(self, attributeName, fieldDir + baseName)

    self.stepSize = 100000
    self.numLines = 33532888
    self.publicationTypes = {
        "article", "inproceedings", "proceedings", "book",
        "incollection", "phdthesis", "mastersthesis", "www",
    }
    self.p = 0.5
    self.matchCutoff = 0.95

    # The constructor runs the preprocessing immediately.
    self.cleanXML()
    self.matchExperts()
    logging.warning("Now you must disambiguate the matched experts if not ready done")
def __init__(self, maxIter=None, iterStartTimeStamp=None):
    """
    Configure file locations for the "erasm" data, then process the ratings,
    split the dataset and load the processed data.

    :param maxIter: stored as self.maxIter; how it is used is decided by the
        other methods of the class (not visible here).
    :param iterStartTimeStamp: starting timestamp of the iterator. When None,
        the default value 1286229600 is used.
    """
    outputDir = PathDefaults.getOutputDir() + "recommend/erasm/"
    # makedirs also creates a missing parent ("recommend/"), where mkdir
    # would raise.
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    # iterStartTimeStamp is the starting date of the iterator. Identity
    # comparison avoids surprising results of "!= None" on array-like values.
    if iterStartTimeStamp is not None:
        self.iterStartTimeStamp = iterStartTimeStamp
    else:
        self.iterStartTimeStamp = 1286229600

    # Length of 30 days expressed in seconds.
    self.timeStep = timedelta(30).total_seconds()

    self.ratingFileName = outputDir + "data.npz"
    self.userDictFileName = outputDir + "userIdDict.pkl"
    self.groupDictFileName = outputDir + "groupIdDict.pkl"
    self.isTrainRatingsFileName = outputDir + "is_train.npz"

    self.dataDir = PathDefaults.getDataDir() + "erasm/"
    self.dataFileName = self.dataDir + "groupMembers-29-11-12"
    self.maxIter = maxIter
    self.trainSplit = 4.0/5

    self.processRatings()
    self.splitDataset()
    self.loadProcessedData()
def processSimpleDataset(name, numRealisations, split, ext=".csv", delimiter=",", usecols=None, skiprows=1, converters=None):
    """
    Load a delimited text dataset, treat the last column as the target and
    all other columns as features, then hand the data together with shuffle
    split indices to preprocessSave.

    :param name: base name of the data file; also names the output directory.
    :param numRealisations: passed to Sampling.shuffleSplit as first argument.
    :param split: passed to Sampling.shuffleSplit as third argument.
    :param ext: file extension appended to name.
    :param delimiter: forwarded to numpy.loadtxt.
    :param usecols: forwarded to numpy.loadtxt.
    :param skiprows: forwarded to numpy.loadtxt.
    :param converters: forwarded to numpy.loadtxt.
    """
    numpy.random.seed(21)

    regressionDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = regressionDir + name + ext
    print("Loading data from file " + fileName)
    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "/"

    loadOptions = dict(delimiter=delimiter, skiprows=skiprows, usecols=usecols, converters=converters)
    XY = numpy.loadtxt(fileName, **loadOptions)
    X, y = XY[:, :-1], XY[:, -1]

    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)
    preprocessSave(X, y, outputDir, idx)
def testGenerateRandomGraph(self):
    """
    Build a 1000-vertex graph with SmallWorldGenerator and pass it, with the
    ego/alter file names, to svmEgoSimulator.generateRandomGraph.
    No assertion is made; the test only checks the call completes.
    """
    egoFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoData.csv"
    alterFileName = PathDefaults.getDataDir() + "infoDiffusion/AlterData.csv"

    numVertices, infoProb = 1000, 0.1
    generatorP, generatorNeighbours = 0.1, 10

    smallWorld = SmallWorldGenerator(generatorP, generatorNeighbours)
    emptyGraph = SparseGraph(VertexList(numVertices, 0))
    graph = smallWorld.generate(emptyGraph)

    self.svmEgoSimulator.generateRandomGraph(egoFileName, alterFileName, infoProb, graph)
def __init__(self, YList, X, featuresName, ages, args):
    """
    Store the experiment data and fixed settings, and register the leaf rank
    generators to use.

    :param YList: the list of concentrations.
    :param X: stored as self.X.
    :param featuresName: stored as self.featuresName.
    :param ages: stored as self.ages.
    :param args: forwarded to the parent constructor and stored as self.args.
    """
    super(MetabolomicsExpRunner, self).__init__(args=args)

    self.X, self.YList = X, YList
    self.featuresName = featuresName
    self.args = args
    self.ages = ages

    self.maxDepth, self.numTrees, self.folds = 5, 10, 3
    self.resultsDir = PathDefaults.getOutputDir() + "metabolomics/"

    # Only LinearSVM-PCA is active; SvcGS ("SVC") and LinearSvmGS
    # ("LinearSVM") were disabled in the original.
    self.leafRankGenerators = [(LinearSvmPca.generate(), "LinearSVM-PCA")]

    # Only SVCF is active; LinearSvmFGs ("SVMF") and DecisionTreeF ("CARTF")
    # were disabled in the original. Note the generate function itself is
    # stored here, not its result.
    self.funcLeafRankGenerators = [(SvcFGs.generate, "SVCF")]

    # Store all the label vectors and their missing values
    igf1Inds, cortisolInds, testoInds = MetabolomicsUtils.createIndicatorLabels(YList)
    self.hormoneInds = [igf1Inds, cortisolInds, testoInds]
    self.hormoneNames = MetabolomicsUtils.getLabelNames()
def getLsos(self):
    """
    Source Util.R and return the R object named 'lsos', a function to
    display R memory usage.
    """
    utilScript = PathDefaults.getSourceDir() + "/apgl/metabolomics/R/Util.R"
    sourceFunction = robjects.r["source"]
    sourceFunction(utilScript)
    return robjects.r['lsos']
def testLoadParams(self):
    """
    Save two parameter values with their setter functions, load them back
    and check each loaded entry holds (class path, method name, value).

    An IOError (e.g. the temp directory is unavailable) is logged rather
    than failing the test, as in the original best-effort behaviour.
    """
    try:
        lmbda = 0.01
        alterRegressor = PrimalRidgeRegression(lmbda)
        egoRegressor = PrimalRidgeRegression(lmbda)
        predictor = EgoEdgeLabelPredictor(alterRegressor, egoRegressor)

        params = [0.1, 0.2]
        paramFuncs = [egoRegressor.setLambda, alterRegressor.setLambda]
        fileName = PathDefaults.getTempDir() + "tempParams.pkl"

        predictor.saveParams(params, paramFuncs, fileName)
        params2 = predictor.loadParams(fileName)

        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(params2[0][0], "apgl.predictors.PrimalRidgeRegression")
        self.assertEqual(params2[0][1], "setLambda")
        self.assertEqual(params2[0][2], 0.1)

        self.assertEqual(params2[1][0], "apgl.predictors.PrimalRidgeRegression")
        self.assertEqual(params2[1][1], "setLambda")
        self.assertEqual(params2[1][2], 0.2)
    except IOError as e:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning(e)
def __init__(self):
    """
    Set the label column names, the data directory and the per-hormone
    bound arrays.
    """
    self.labelNames = ["Cortisol.val", "Testosterone.val", "IGF1.val"]
    self.dataDir = PathDefaults.getDataDir() + "metabolomic/"

    # One array of four bound values per hormone name.
    self.boundsDict = {
        "Cortisol": numpy.array([0, 89, 225, 573]),
        "Testosterone": numpy.array([0, 3, 9, 13]),
        "IGF1": numpy.array([0, 200, 441, 782]),
    }
def testEstimate(self):
    """
    Run ABCSMC on a simple normal model and check that enough posterior
    samples are returned, each with two components, and that the mean of
    the first component is within 0.2 of theta[0].

    Relies on abcMetrics, theta and createNormalModel being defined outside
    this method (not visible in this block).
    """
    # Lets set up a simple model based on normal dist
    abcParams = ABCParameters()
    epsilonArray = numpy.array([0.5, 0.2, 0.1])
    posteriorSampleSize = 20

    # Lets get an empirical estimate of Sprime
    model = NormalModel(abcMetrics)
    model.setMu(theta[0])
    model.setSigma(theta[1])
    Sprime = abcMetrics.summary(model.simulate())
    logging.debug(("Real summary statistic: " + str(Sprime)))

    thetaDir = PathDefaults.getTempDir()
    abcSMC = ABCSMC(epsilonArray, createNormalModel, abcParams, thetaDir)
    abcSMC.maxRuns = 100000
    abcSMC.setPosteriorSampleSize(posteriorSampleSize)

    thetasArray = numpy.array(abcSMC.run())
    meanTheta = numpy.mean(thetasArray, 0)

    logging.debug((thetasArray.shape))
    logging.debug(thetasArray)
    logging.debug(meanTheta)
    print(thetasArray.shape[0], posteriorSampleSize)

    # Note only mean needs to be similar
    self.assertTrue(thetasArray.shape[0] >= posteriorSampleSize)
    # assertEquals was a deprecated alias, removed in Python 3.12.
    self.assertEqual(thetasArray.shape[1], 2)
    self.assertTrue(numpy.linalg.norm(theta[0] - meanTheta[0]) < 0.2)
def testLoadParams(self):
    """
    Round-trip two parameters through saveParams/loadParams and verify the
    loaded (class path, method name, value) triples.

    An IOError is logged instead of failing the test, preserving the
    original best-effort behaviour.
    """
    try:
        lmbda = 0.01
        alterRegressor = PrimalRidgeRegression(lmbda)
        egoRegressor = PrimalRidgeRegression(lmbda)
        predictor = EgoEdgeLabelPredictor(alterRegressor, egoRegressor)

        params = [0.1, 0.2]
        paramFuncs = [egoRegressor.setLambda, alterRegressor.setLambda]
        fileName = PathDefaults.getTempDir() + "tempParams.pkl"

        predictor.saveParams(params, paramFuncs, fileName)
        params2 = predictor.loadParams(fileName)

        # Each saved value should come back with the same class and setter.
        for index, expectedValue in enumerate(params):
            self.assertEqual(params2[index][0], "apgl.predictors.PrimalRidgeRegression")
            self.assertEqual(params2[index][1], "setLambda")
            self.assertEqual(params2[index][2], expectedValue)
    except IOError as e:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning(e)
def testEdgeFile(self):
    """
    Figure out the problem with the edge file: count the distinct edges and
    vertices in Cit-HepTh.txt and compare with the published numbers.
    """
    dataDir = PathDefaults.getDataDir() + "cluster/"
    edgesFilename = dataDir + "Cit-HepTh.txt"

    edges = set()
    vertices = set()

    # The context manager guarantees the file is closed even if parsing fails.
    with open(edgesFilename, 'r') as edgesFile:
        # The first four lines are skipped before the tab-separated edge list.
        for _ in range(4):
            edgesFile.readline()

        for line in edgesFile:
            vertex1, _sep, vertex2 = line.partition("\t")
            vertex1 = vertex1.strip()
            vertex2 = vertex2.strip()
            edges.add((vertex1, vertex2))
            vertices.add(vertex1)
            vertices.add(vertex2)

    # It says there are 352807 edges in paper and 27770 vertices
    self.assertEqual(len(edges), 352807)
    self.assertEqual(len(vertices), 27770)
def testPredict2(self):
    """
    Train TreeRank on the Gauss2D training set for several maximum depths
    and compare train/test AUC with stored reference values.
    """
    # Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]

    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    # range(3, 10) yields 7 depths, so only the first 7 of the 8 reference
    # values below are compared.
    maxDepths = range(3, 10)
    trainAucs = numpy.array([0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508])
    testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])

    # The results are approximately the same, but not exactly
    for i, maxDepth in enumerate(maxDepths):
        treeRank = TreeRank(self.leafRanklearner)
        treeRank.setMaxDepth(maxDepth)
        treeRank.learnModel(X, y)

        trainScores = treeRank.predict(X)
        testScores = treeRank.predict(testX)

        # assertAlmostEquals was a deprecated alias, removed in Python 3.12.
        self.assertAlmostEqual(Evaluator.auc(trainScores, y), trainAucs[i], 2)
        self.assertAlmostEqual(Evaluator.auc(testScores, testY), testAucs[i], 1)
def __init__(self, YList, X, featuresName, ages, args):
    """
    Keep references to the data, fix the experiment settings and select the
    leaf rank generators.

    :param YList: the list of concentrations.
    :param X: kept as self.X.
    :param featuresName: kept as self.featuresName.
    :param ages: kept as self.ages.
    :param args: passed to the parent constructor and kept as self.args.
    """
    super(MetabolomicsExpRunner, self).__init__(args=args)

    self.X = X
    self.YList = YList
    self.featuresName = featuresName
    self.args = args
    self.ages = ages

    # Fixed experiment settings.
    self.maxDepth = 5
    self.numTrees = 10
    self.folds = 3
    self.resultsDir = PathDefaults.getOutputDir() + "metabolomics/"

    # Generators built eagerly (generate() is called here). The "SVC" and
    # "LinearSVM" variants were commented out in the original.
    activeGenerators = [(LinearSvmPca.generate(), "LinearSVM-PCA")]
    self.leafRankGenerators = activeGenerators

    # Generators stored as callables (generate is not called here). The
    # "SVMF" and "CARTF" variants were commented out in the original.
    activeFuncGenerators = [(SvcFGs.generate, "SVCF")]
    self.funcLeafRankGenerators = activeFuncGenerators

    # Store all the label vectors and their missing values
    indsIgf1, indsCortisol, indsTesto = MetabolomicsUtils.createIndicatorLabels(YList)
    self.hormoneInds = [indsIgf1, indsCortisol, indsTesto]
    self.hormoneNames = MetabolomicsUtils.getLabelNames()
def flixster(minNnzRows=10, minNnzCols=2, quantile=90):
    """
    Read the Flixster ratings file and return a sparse user x movie matrix
    whose entries are 1 where the rating is greater than 3 and 0 otherwise,
    pruned with SparseUtils.pruneMatrixRowAndCols.

    :param minNnzRows: forwarded to SparseUtils.pruneMatrixRowAndCols.
    :param minNnzCols: forwarded to SparseUtils.pruneMatrixRowAndCols.
    :param quantile: not used by this function; kept for interface
        compatibility.
    :return: the pruned sppy.csarray.
    """
    matrixFileName = PathDefaults.getDataDir() + "flixster/Ratings.timed.txt"

    userIndexer = IdIndexer("i")
    movieIndexer = IdIndexer("i")
    ratings = array.array("f")
    logging.debug("Loading ratings from " + matrixFileName)

    # The context manager closes the file, which the original never did.
    with open(matrixFileName) as matrixFile:
        # Skip the first line of the file.
        matrixFile.readline()

        for i, line in enumerate(matrixFile):
            if i % 1000000 == 0:
                logging.debug("Iteration: " + str(i))
            vals = line.split()

            # Columns: user id, movie id, rating.
            userIndexer.append(vals[0])
            movieIndexer.append(vals[1])
            ratings.append(float(vals[2]))

    rowInds = userIndexer.getArray()
    colInds = movieIndexer.getArray()
    ratings = numpy.array(ratings)

    # numpy.int was only an alias for the builtin int and was removed in
    # NumPy 1.24, so the builtin is used directly.
    shape = (len(userIndexer.getIdDict()), len(movieIndexer.getIdDict()))
    X = sppy.csarray(shape, storagetype="row", dtype=int)
    X.put(numpy.array(ratings > 3, int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
    X.prune()

    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def epinions(minNnzRows=10, minNnzCols=3, quantile=90):
    """
    Load the Epinions "rating" matrix from a .mat file and return a sparse
    user x item matrix whose entries are 1 where the rating is greater than
    3 and 0 otherwise, pruned with SparseUtils.pruneMatrixRowAndCols.

    :param minNnzRows: forwarded to SparseUtils.pruneMatrixRowAndCols.
    :param minNnzCols: forwarded to SparseUtils.pruneMatrixRowAndCols.
    :param quantile: not used by this function; kept for interface
        compatibility.
    :return: the pruned sppy.csarray.
    """
    matrixFileName = PathDefaults.getDataDir() + "epinions/rating.mat"
    A = scipy.io.loadmat(matrixFileName)["rating"]

    userIndexer = IdIndexer("i")
    itemIndexer = IdIndexer("i")

    # Column 0 holds the user id and column 1 the item id of each record.
    for i in range(A.shape[0]):
        userIndexer.append(A[i, 0])
        itemIndexer.append(A[i, 1])

    rowInds = userIndexer.getArray()
    colInds = itemIndexer.getArray()
    ratings = A[:, 3]

    # numpy.int was only an alias for the builtin int and was removed in
    # NumPy 1.24, so the builtin is used directly.
    shape = (len(userIndexer.getIdDict()), len(itemIndexer.getIdDict()))
    X = sppy.csarray(shape, storagetype="row", dtype=int)
    X.put(numpy.array(ratings > 3, int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
    X.prune()

    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def testToyData(self):
    """
    Numerically integrate the stored densities of the toy dataset over the
    grid and check the totals are 1.

    Each grid cell contributes the average of its four corner values times
    the cell area.
    """
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    pxSum = 0
    pY1XSum = 0
    pYminus1XSum = 0
    px2Sum = 0

    # Cell area computed from the first two grid points.
    squareArea = (gridPoints[1] - gridPoints[0])**2

    def cellMean(pdf, i, j):
        # Average of the four corners of grid cell (i, j).
        return (pdf[i, j] + pdf[i+1, j] + pdf[i, j+1] + pdf[i+1, j+1])/4

    numCells = gridPoints.shape[0] - 1
    for i in range(numCells):
        for j in range(numCells):
            px = cellMean(pdfX, i, j)
            pxSum += px*squareArea

            pY1X = cellMean(pdfY1X, i, j)
            pY1XSum += pY1X*squareArea

            pYminus1X = cellMean(pdfYminus1X, i, j)
            pYminus1XSum += pYminus1X*squareArea

            px2Sum += px*pY1X*squareArea + px*pYminus1X*squareArea

    # assertAlmostEquals was a deprecated alias, removed in Python 3.12.
    self.assertAlmostEqual(pxSum, 1)
    print(pY1XSum)
    print(pYminus1XSum)

    self.assertAlmostEqual(px2Sum, 1)
def testComputeIdealPenalty(self):
    """
    Check that computeIdealPenalty (using the grid and stored densities) and
    computeIdealPenalty2 (using held-out data) agree to 2 decimal places on
    the toy dataset.
    """
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    sampleSize = 100
    trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
    testX, testY = X[sampleSize:, :], y[sampleSize:]

    # We form a test set from the grid points
    numGrid = gridPoints.shape[0]
    fullX = numpy.zeros((numGrid**2, 2))
    for m in range(numGrid):
        fullX[m*numGrid:(m+1)*numGrid, 0] = gridPoints
        fullX[m*numGrid:(m+1)*numGrid, 1] = gridPoints[m]

    C = 1.0
    gamma = 1.0
    args = (trainX, trainY, fullX, C, gamma, gridPoints, pdfX, pdfY1X, pdfYminus1X)
    penalty = computeIdealPenalty(args)

    # Now compute penalty using data
    args = (trainX, trainY, testX, testY, C, gamma)
    penalty2 = computeIdealPenalty2(args)

    # assertAlmostEquals was a deprecated alias, removed in Python 3.12.
    self.assertAlmostEqual(penalty2, penalty, 2)
def __init__(self, cmdLine=None, defaultAlgoArgs = None, dirName=""):
    """
    Build the algorithm arguments, make sure the results directory exists
    and apply command-line overrides.

    Priority for default args:
     - best priority: command-line value
     - middle priority: set-by-function value
     - lower priority: class value

    :param cmdLine: passed to self.readAlgoParams.
    :param defaultAlgoArgs: passed to RankingExpHelper.newAlgoParams.
    :param dirName: sub-directory of "ranking/" in the output directory.
    """
    # Parameters to choose which methods to run, obtained by merging the
    # class defaults with those from the user.
    self.algoArgs = RankingExpHelper.newAlgoParams(defaultAlgoArgs)

    self.ps = [1, 3, 5]

    # The max number of observations to use for model selection
    self.sampleSize = 5*10**6

    # Basic results directory.
    outputRoot = PathDefaults.getOutputDir()
    self.resultsDir = outputRoot + "ranking/" + dirName + "/"

    # Create the results dir if it does not exist. An "already exists" error
    # is tolerated; anything else is re-raised. (os.makedirs(..., exist_ok=True)
    # would need Python 3.2+.)
    try:
        os.makedirs(self.resultsDir)
    except OSError as err:
        alreadyExists = err.errno == errno.EEXIST
        if not alreadyExists:
            raise

    # Update algoParams from command line.
    self.readAlgoParams(cmdLine)

    # Sometimes there are problems with multiprocessing, so this fixes the issues
    tasksetCommand = 'taskset -p 0xffffffff %d' % os.getpid()
    os.system(tasksetCommand)
def main(argv=None):
    """
    Command-line entry point: parse the options and generate the data files
    with BemolData.generate_data_file.

    Options: -h/--help, -d/--dir DIR, -n/--nb_user N, -D/--debug.

    :param argv: argument list including the program name; defaults to
        sys.argv.
    :return: 0 after printing the help, 2 on a usage error, None otherwise.
    """
    if argv is None:
        argv = sys.argv
    try:
        # Read options. The long-option list was corrupted in the original
        # source; it is restored to match the handlers below.
        try:
            opts, args = getopt.getopt(argv[1:], "hd:n:D", ["help", "dir=", "nb_user=", "debug"])
        except getopt.error as msg:
            raise RGUsage(msg)

        # Apply options. dataDir replaces the local previously named "dir",
        # which shadowed the builtin.
        dataDir = None
        nb_user = None
        log_level = logging.INFO
        for o, a in opts:
            if o in ("-h", "--help"):
                print(__doc__)
                return 0
            elif o in ("-d", "--dir"):
                dataDir = a
            elif o in ("-n", "--nb_user"):
                nb_user = int(a)
            elif o in ("-D", "--debug"):
                log_level = logging.DEBUG
        logging.basicConfig(stream=sys.stdout, level=log_level, format='%(levelname)s (%(asctime)s):%(message)s')

        # Resolve the default directory only when no --dir was supplied.
        if dataDir is None:
            dataDir = PathDefaults.getDataDir() + "cluster/"

        # Process: generate data files.
        BemolData.generate_data_file(dataDir, nb_user)
    except RGUsage as err:
        logging.error(err.msg)
        logging.error("for help use --help")
        return 2
def getIterator():
    """
    Return a slice (iterations 300 up to, but excluding, 600) of the Bemol
    graph iterator built from the "cluster/" data directory.
    """
    dataDir = PathDefaults.getDataDir() + "cluster/"

    nbUser = 10000            # set to 'None' to have all users
    nbPurchasesPerIt = 500    # set to 'None' to take all the purchases per date
    graphIterator = BemolData.getGraphIterator(dataDir, nbUser, nbPurchasesPerIt)

    firstIteration = 300
    lastIteration = 600       # set to 'None' to have all iterations
    step = 1
    return itertools.islice(graphIterator, firstIteration, lastIteration, step)
def profileClusterFromIterator(self):
    """
    Profile self.clusterer.clusterFromIterator on an
    IncreasingSubgraphListIterator over self.graph and print the cumulative
    sum of the returned time list.
    """
    iterator = IncreasingSubgraphListIterator(self.graph, self.subgraphIndicesList)
    # Only needed by the alternative iterator getBemolGraphIterator(dataDir),
    # which was disabled in the original.
    dataDir = PathDefaults.getDataDir() + "cluster/"

    def run():
        results = self.clusterer.clusterFromIterator(iterator, verbose=True)
        _clusterList, timeList, _boundList = results
        print(timeList.cumsum(0))

    # 'run()' is resolved by name from the locals passed here, so the nested
    # function must stay called "run".
    ProfileUtils.profile('run()', globals(), locals())
def syntheticDataset2():
    """
    Load the stored synthetic dataset "syntheticRanking/dataset1.mtx" from
    the data directory as a row-storage sparse matrix.

    The original description states it is a simple synthetic dataset using a
    power law distribution on users and items; this function only reads the
    file.
    """
    matrixFileName = PathDefaults.getDataDir() + "syntheticRanking/" + "dataset1.mtx"
    return sppy.io.mmread(matrixFileName, storagetype="row")
def testGetTrainIteratorFunc(self):
    """
    Iterate the train and test iterators of a Static2IdValDataset in
    lockstep and check each train/test pair has the same shape.
    """
    dataFilename = PathDefaults.getDataDir() + "reference/author_document_count"
    dataset = Static2IdValDataset(dataFilename)

    # Each getter returns a function which is called to create the iterator.
    trainIterator = dataset.getTrainIteratorFunc()()
    testIterator = dataset.getTestIteratorFunc()()

    for trainX in trainIterator:
        # The builtin next() works on Python 2.6+ and 3; the .next() method
        # exists only on Python 2 iterators.
        testX = next(testIterator)

        print(trainX.shape, trainX.nnz, testX.nnz)
        # assertEquals was a deprecated alias, removed in Python 3.12.
        self.assertEqual(trainX.shape, testX.shape)
def mendeley2(minNnzRows=10, minNnzCols=2, quantile=90, dataset="Document"):
    """
    Read "reference/author<dataset>Matrix.mtx" from the data directory and
    return it after pruning with SparseUtils.pruneMatrixRowAndCols.

    :param minNnzRows: forwarded to SparseUtils.pruneMatrixRowAndCols.
    :param minNnzCols: forwarded to SparseUtils.pruneMatrixRowAndCols.
    :param quantile: not used by this function.
    :param dataset: middle part of the matrix file name.
    :return: the pruned sparse matrix.
    """
    def sizeSummary(matrix):
        # Shared tail of the two size log messages.
        return str(matrix.nnz) + " shape: " + str(matrix.shape)

    authorAuthorFileName = PathDefaults.getDataDir() + "reference/author" + dataset + "Matrix.mtx"

    logging.debug("Reading file: " + authorAuthorFileName)
    rawX = sppy.io.mmread(authorAuthorFileName, storagetype="row")
    logging.debug("Raw non-zero elements: " + sizeSummary(rawX))

    prunedX = SparseUtils.pruneMatrixRowAndCols(rawX, minNnzRows, minNnzCols)
    logging.debug("Read file: " + authorAuthorFileName)
    logging.debug("Non-zero elements: " + sizeSummary(prunedX))

    return prunedX
def processParkinsonsDataset(name, numRealisations):
    """
    Load the Parkinsons regression data and save two preprocessed datasets
    sharing the same features and split indices: one with column 5 as target
    (written to "<name>-motor/") and one with column 6 as target (written to
    "<name>-total/").

    :param name: base name of the ".data" file and of the output directories.
    :param numRealisations: passed to Sampling.shuffleSplit as first argument.
    """
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = dataDir + name + ".data"

    XY = numpy.loadtxt(fileName, delimiter=",", skiprows=1)

    # All columns except the two targets, in ascending order. The original
    # relied on set iteration order, which is not guaranteed to be sorted.
    targetColumns = (5, 6)
    inds = [col for col in range(XY.shape[1]) if col not in targetColumns]
    X = XY[:, inds]

    y1 = XY[:, 5]
    y2 = XY[:, 6]

    # We don't keep whole collections of patients
    split = 0.5
    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)

    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-motor/"
    preprocessSave(X, y1, outputDir, idx)

    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-total/"
    preprocessSave(X, y2, outputDir, idx)
def profileSvd2(self):
    """
    Load the "contacts_train" matrix as an int8 CSC matrix, run
    RandomisedSVD.svd with k=500 and print the second returned value.
    """
    contactsDir = PathDefaults.getDataDir() + "erasm/contacts/"
    trainFilename = contactsDir + "contacts_train"

    rawTrainX = scipy.io.mmread(trainFilename)
    trainX = scipy.sparse.csc_matrix(rawTrainX, dtype=numpy.int8)

    numComponents = 500
    U, s, V = RandomisedSVD.svd(trainX, numComponents)
    print(s)

    print("All done")
def profileClusterFromIterator(self):
    """
    Run self.clusterer.clusterFromIterator under the profiler, using an
    IncreasingSubgraphListIterator over self.graph, and print the cumulative
    sum of the time list it returns.
    """
    iterator = IncreasingSubgraphListIterator(self.graph, self.subgraphIndicesList)
    # Kept for the disabled alternative: iterator = getBemolGraphIterator(dataDir)
    dataDir = PathDefaults.getDataDir() + "cluster/"

    def run():
        # clusterFromIterator returns (clusterList, timeList, boundList).
        _clusters, timeList, _bounds = self.clusterer.clusterFromIterator(iterator, verbose=True)
        cumulativeTimes = timeList.cumsum(0)
        print(cumulativeTimes)

    # The command string refers to the nested function by its name "run",
    # looked up in the locals passed below.
    ProfileUtils.profile('run()', globals(), locals())
def testBayesError(self):
    """
    Grid-search C and gamma for a Gaussian-kernel LibSVM on the toy data,
    selecting the pair with the lowest ModelSelectUtils.bayesError on the
    grid, then plot the decision values of the best model and the data.
    """
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    sampleSize = 100
    trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
    testX, testY = X[sampleSize:, :], y[sampleSize:]

    # We form a test set from the grid points
    numGrid = gridPoints.shape[0]
    gridX = numpy.zeros((numGrid**2, 2))
    for m in range(numGrid):
        gridX[m*numGrid:(m+1)*numGrid, 0] = gridPoints
        gridX[m*numGrid:(m+1)*numGrid, 1] = gridPoints[m]

    # numpy.float was only an alias for the builtin float and was removed in
    # NumPy 1.24.
    Cs = 2**numpy.arange(-5, 5, dtype=float)
    gammas = 2**numpy.arange(-5, 5, dtype=float)

    bestError = 1
    # Fall back to the first candidates if no model achieves error < 1.
    bestC, bestGamma = Cs[0], gammas[0]

    for C in Cs:
        for gamma in gammas:
            svm = LibSVM(kernel="gaussian", C=C, kernelParam=gamma)
            svm.learnModel(trainX, trainY)
            predY, decisionsY = svm.predict(gridX, True)
            decisionGrid = numpy.reshape(decisionsY, (numGrid, numGrid), order="F")
            error = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)

            predY, decisionsY = svm.predict(testX, True)
            error2 = Evaluator.binaryError(testY, predY)
            print(error, error2)

            if error < bestError:
                # The original assigned "error = bestError", so the best
                # error was never updated and the last pair with error < 1
                # was selected instead of the minimum.
                bestError = error
                bestC = C
                bestGamma = gamma

    svm = LibSVM(kernel="gaussian", C=bestC, kernelParam=bestGamma)
    svm.learnModel(trainX, trainY)
    predY, decisionsY = svm.predict(gridX, True)
    # Recompute the grid from the best model; the original plotted the grid
    # left over from the last loop iteration.
    decisionGrid = numpy.reshape(decisionsY, (numGrid, numGrid), order="F")

    plt.figure(0)
    plt.contourf(gridPoints, gridPoints, decisionGrid, 100)
    plt.colorbar()

    plt.figure(1)
    # Legend labels now match the plotted class (they were swapped).
    plt.scatter(X[y==1, 0], X[y==1, 1], c='r', label="+1")
    plt.scatter(X[y==-1, 0], X[y==-1, 1], c='b', label="-1")
    plt.legend()
    plt.show()
def profilePropackSvd(self):
    """
    Load the "contacts_train" matrix as an int8 CSC matrix, run
    SparseUtils.svdPropack with k=500 and kmax=5*k, and print the second
    returned value.
    """
    contactsDir = PathDefaults.getDataDir() + "erasm/contacts/"
    trainFilename = contactsDir + "contacts_train"

    rawTrainX = scipy.io.mmread(trainFilename)
    trainX = scipy.sparse.csc_matrix(rawTrainX, dtype=numpy.int8)

    k = 500
    kmax = k * 5
    U, s, V = SparseUtils.svdPropack(trainX, k, kmax=kmax)
    print(s)

    # Memory consumption is dependent on kmax
    print("All done")
def loadParams(ind):
    """
    Collect the directories and simulation/ABC parameters for either the
    real or the toy experiment, depending on the module-level flag
    processReal.

    :param ind: index used in the real results directory name and passed to
        the HIVModelUtils real-data helpers; unused in the toy case.
    :return: the tuple (N, resultsDir, outputDir, recordStep, startDate,
        endDate, prefix, targetGraph, breakSize, numEpsilons, M, matchAlpha,
        matchAlg, numInds).
    """
    outputRoot = PathDefaults.getOutputDir()

    if processReal:
        resultsDir = outputRoot + "viroscopy/real/theta" + str(ind) + "/"
        outputDir = resultsDir + "stats/"

        (N, matchAlpha, breakScale, numEpsilons, _epsilon, _minEpsilon,
         matchAlg, _abcMaxRuns, _batchSize, _pertScale) = HIVModelUtils.realABCParams(True)
        (startDate, endDate, recordStep, M, targetGraph,
         _numIndsFromParams) = HIVModelUtils.realSimulationParams(test=True, ind=ind)
        _realTheta, _sigmaTheta, _pertTheta = HIVModelUtils.estimatedRealTheta(ind)

        # The value returned by realSimulationParams is overridden.
        numInds = 2
        prefix = "Real"
    else:
        resultsDir = outputRoot + "viroscopy/toy/theta/"
        outputDir = resultsDir + "stats/"

        (N, matchAlpha, breakScale, numEpsilons, _epsilon, _minEpsilon,
         matchAlg, _abcMaxRuns, _batchSize, _pertScale) = HIVModelUtils.toyABCParams()
        startDate, endDate, recordStep, M, targetGraph = HIVModelUtils.toySimulationParams(test=True)
        _realTheta, _sigmaTheta, _pertTheta = HIVModelUtils.toyTheta()

        prefix = "Toy"
        numInds = 1

    # Difference in size between the subgraphs of removed vertices at the
    # end and start dates, scaled by breakScale.
    sizeAtEnd = targetGraph.subgraph(targetGraph.removedIndsAt(endDate)).size
    sizeAtStart = targetGraph.subgraph(targetGraph.removedIndsAt(startDate)).size
    breakSize = (sizeAtEnd - sizeAtStart) * breakScale

    return N, resultsDir, outputDir, recordStep, startDate, endDate, prefix, targetGraph, breakSize, numEpsilons, M, matchAlpha, matchAlg, numInds
def profilePropackSvd(self):
    """
    Run SparseUtils.svdPropack (k=500, kmax=5*k) on the "contacts_train"
    matrix, converted to an int8 CSC matrix, and print the second returned
    value.
    """
    trainFilename = PathDefaults.getDataDir() + "erasm/contacts/" + "contacts_train"
    trainX = scipy.sparse.csc_matrix(scipy.io.mmread(trainFilename), dtype=numpy.int8)

    k = 500
    svdResult = SparseUtils.svdPropack(trainX, k, kmax=k * 5)
    U, s, V = svdResult
    print(s)

    # Memory consumption is dependent on kmax
    print("All done")
def testPredict2(self):
    """
    Train TreeRank with DecisionTree leaf rankers on the Gauss2D data for
    maximum depths 3..9 and compute train/test scores. The AUC assertions
    against the reference values were disabled in the original, so this
    test only checks that training and prediction run.
    """
    # Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()
    loadOptions = dict(skiprows=1, usecols=(1, 2, 3), delimiter=",")

    XY = numpy.loadtxt(dataDir + "Gauss2D_learn.csv", **loadOptions)
    X = XY[:, 0:2]
    # Labels are mapped with y*2 - 1.
    y = XY[:, 2] * 2 - 1

    testXY = numpy.loadtxt(dataDir + "Gauss2D_test.csv", **loadOptions)
    testX = testXY[:, 0:2]
    testY = testXY[:, 2] * 2 - 1

    # Standardisation of X and testX was disabled in the original.

    # Reference AUCs for the disabled assertions.
    trainAucs = numpy.array([0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508])
    testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])

    # The results are approximately the same, but not exactly
    for i, maxDepth in enumerate(range(3, 10)):
        treeRank = TreeRank(DecisionTree)
        treeRank.setMaxDepth(maxDepth)
        treeRank.learnModel(X, y)

        trainScores = treeRank.predict(X)
        testScores = treeRank.predict(testX)

    # Compare tree to that of R version
    tree = treeRank.getTree()
def testSaveParams(self):
    """
    Check that saveParams can write two parameter values and their setter
    functions to a file in the temp directory.

    An IOError is logged rather than failing the test, preserving the
    original best-effort behaviour.
    """
    try:
        lmbda = 0.01
        alterRegressor = PrimalRidgeRegression(lmbda)
        egoRegressor = PrimalRidgeRegression(lmbda)
        predictor = EgoEdgeLabelPredictor(alterRegressor, egoRegressor)

        params = [0.1, 0.2]
        paramFuncs = [egoRegressor.setLambda, alterRegressor.setLambda]
        fileName = PathDefaults.getTempDir() + "tempParams.pkl"

        predictor.saveParams(params, paramFuncs, fileName)
    except IOError as e:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning(e)
def profileArpackSvd(self):
    """
    Load the "contacts_train" matrix as a float32 CSC matrix, run
    SparseUtils.svdArpack with k=500 and kmax=5*k, and print the second
    returned value.
    """
    contactsDir = PathDefaults.getDataDir() + "erasm/contacts/"
    trainFilename = contactsDir + "contacts_train"

    rawTrainX = scipy.io.mmread(trainFilename)
    trainX = scipy.sparse.csc_matrix(rawTrainX, dtype=numpy.float32)
    matrixDtype = trainX.dtype
    print(matrixDtype.char, matrixDtype)

    k = 500
    kmax = k * 5
    U, s, V = SparseUtils.svdArpack(trainX, k, kmax=kmax)
    print(s)

    # Memory consumption is dependent on kmax and less than PROPACK
    print("All done")
def profileArpackSvd(self):
    """
    Run SparseUtils.svdArpack (k=500, kmax=5*k) on the "contacts_train"
    matrix, converted to a float32 CSC matrix, printing the dtype and the
    second returned value.
    """
    trainFilename = PathDefaults.getDataDir() + "erasm/contacts/" + "contacts_train"
    trainX = scipy.sparse.csc_matrix(scipy.io.mmread(trainFilename), dtype=numpy.float32)
    print(trainX.dtype.char, trainX.dtype)

    k = 500
    svdResult = SparseUtils.svdArpack(trainX, k, kmax=k * 5)
    U, s, V = svdResult
    print(s)

    # Memory consumption is dependent on kmax and less than PROPACK
    print("All done")
def movieLens(minNnzRows=10, minNnzCols=2, quantile=90):
    """
    Read the MovieLens 100k "u.data" file and return a sparse matrix whose
    entries are 1 where the rating (third column) is greater than 3 and 0
    otherwise, pruned with SparseUtils.pruneMatrixRowAndCols.

    Rows are indexed by the first column minus 1 and columns by the second
    column minus 1.

    :param minNnzRows: forwarded to SparseUtils.pruneMatrixRowAndCols.
    :param minNnzCols: forwarded to SparseUtils.pruneMatrixRowAndCols.
    :param quantile: not used by this function; kept for interface
        compatibility.
    :return: the pruned sppy.csarray.
    """
    matrixFileName = PathDefaults.getDataDir() + "movielens/ml-100k/u.data"
    data = numpy.loadtxt(matrixFileName)

    # loadtxt yields floats, so the maxima are cast to give an integer shape.
    shape = (int(numpy.max(data[:, 0])), int(numpy.max(data[:, 1])))

    # numpy.int was only an alias for the builtin int and was removed in
    # NumPy 1.24, so the builtin is used directly.
    X = sppy.csarray(shape, storagetype="row", dtype=int)
    X.put(numpy.array(data[:, 2] > 3, int), numpy.array(data[:, 0] - 1, numpy.int32), numpy.array(data[:, 1] - 1, numpy.int32), init=True)
    X.prune()

    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def testPredict2(self):
    """
    Train TreeRankForest on the standardised Gauss2D data for several
    maximum depths and compare train/test AUC with stored reference values
    to 1 decimal place.
    """
    # Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    X = XY[:, 0:2]
    # Labels are mapped with y*2 - 1.
    y = XY[:, 2]*2 - 1

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]*2 - 1

    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    numTrees = 5
    minSplit = 50
    # range(3, 10) yields 7 depths, so only the first 7 of the 8 reference
    # values below are compared.
    maxDepths = range(3, 10)
    trainAucs = numpy.array([0.7252582, 0.7323278, 0.7350289, 0.7372529, 0.7399985, 0.7382176, 0.7395104, 0.7386347])
    testAucs = numpy.array([0.6806122, 0.6851614, 0.6886183, 0.6904147, 0.6897266, 0.6874600, 0.6875980, 0.6878801])

    # The results are approximately the same, but not exactly
    for i, maxDepth in enumerate(maxDepths):
        treeRankForest = TreeRankForest(self.leafRanklearner)
        treeRankForest.setMaxDepth(maxDepth)
        treeRankForest.setMinSplit(minSplit)
        treeRankForest.setNumTrees(numTrees)
        treeRankForest.learnModel(X, y)

        trainScores = treeRankForest.predict(X)
        testScores = treeRankForest.predict(testX)

        trainAuc = Evaluator.auc(trainScores, y)
        testAuc = Evaluator.auc(testScores, testY)
        print(trainAuc, testAuc)

        # assertAlmostEquals was a deprecated alias, removed in Python 3.12.
        self.assertAlmostEqual(trainAuc, trainAucs[i], 1)
        self.assertAlmostEqual(testAuc, testAucs[i], 1)
def profile(command, globalVars, localVars, numStats=30):
    """
    Profile the given command with cProfile, using the supplied global and
    local variables, and print the statistics sorted first by cumulative
    time and then by function time.

    :param command: the statement string to execute under the profiler.
    :param globalVars: globals dictionary for the command.
    :param localVars: locals dictionary for the command.
    :param numStats: number of entries to print for each ordering.
    :raises ImportError: if pstats or cProfile cannot be imported.
    """
    try:
        import pstats
        import cProfile
    except ImportError:
        raise ImportError("profile() requires pstats and cProfile")

    # The raw profile is written to the temp directory and read back.
    profileFileName = PathDefaults.getTempDir() + "profile.cprof"

    logging.info("Starting to profile ...")
    cProfile.runctx(command, globalVars, localVars, profileFileName)
    logging.info("Done")

    stats = pstats.Stats(profileFileName)
    for sortKey in ("cumulative", "time"):
        stats.strip_dirs().sort_stats(sortKey).print_stats(numStats)
def testPredict2(self):
    """
    Cross-validate a DecisionTree on the IGF1_0-Haar data with AUC and log
    the mean and variance. No assertion is made.
    """
    # We play around with parameters to maximise AUC on the IGF1_0-Haar data
    dataDir = PathDefaults.getDataDir()
    fileName = dataDir + "IGF1_0-Haar.npy"

    XY = numpy.load(fileName)
    # The last column is the label; all other columns are features.
    X = XY[:, 0:XY.shape[1]-1]
    y = XY[:, XY.shape[1]-1].ravel()

    # Fraction of examples whose integer label is 0. numpy.int was only an
    # alias for the builtin int and was removed in NumPy 1.24.
    weight = numpy.bincount(numpy.array(y, int))[0]/float(y.shape[0])

    folds = 3
    decisionTree = DecisionTree()
    decisionTree.setWeight(weight)
    decisionTree.setMaxDepth(50)

    mean, var = decisionTree.evaluateCv(X, y, folds, Evaluator.auc)
    logging.debug("AUC = " + str(mean))
    logging.debug("Var = " + str(var))
def main():
    """
    Load the MovieLens 100k ratings, build a sparse matrix with 1 where the
    rating is greater than 3, split off a validation set and train a CLiMF
    model on the training part.
    """
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    matrixFileName = PathDefaults.getDataDir() + "movielens/ml-100k/u.data"
    data = numpy.loadtxt(matrixFileName)

    # loadtxt yields floats, so the maxima are cast to give an integer shape.
    shape = (int(numpy.max(data[:, 0])), int(numpy.max(data[:, 1])))
    X = sppy.csarray(shape, storagetype="row")

    # numpy.int was only an alias for the builtin int and was removed in
    # NumPy 1.24, so the builtin is used directly.
    X[data[:, 0] - 1, data[:, 1] - 1] = numpy.array(data[:, 2] > 3, int)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Shape of data: " + str(X.shape))
    logging.debug("Number of non zeros " + str(X.nnz))

    validationSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, 1, validationSize)
    # testX is produced by the split but not used below.
    trainX, testX = trainTestXs[0]
    trainX = trainX.toScipyCsr()

    learner = CLiMF(k=20, lmbda=0.001, gamma=0.0001)
    learner.learnModel(trainX)
logging.debug('process id:' + str(os.getpid())) self.saveResults(self.leafRankGenerators, True) def run2(self): logging.debug('module name:' + __name__) logging.debug('parent process:' + str(os.getppid())) logging.debug('process id:' + str(os.getpid())) self.saveResults(self.funcLeafRankGenerators, False) logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) logging.debug("Running from machine " + str(gethostname())) numpy.random.seed(21) dataDir = PathDefaults.getDataDir() + "metabolomic/" X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData() waveletStr = 'db4' mode = "cpd" level = 10 XwDb4 = MetabolomicsUtils.getWaveletFeatures(X, 'db4', level, mode) XwDb8 = MetabolomicsUtils.getWaveletFeatures(X, 'db8', level, mode) XwHaar = MetabolomicsUtils.getWaveletFeatures(X, 'haar', level, mode) dataList = [] dataList.extend([(XwDb4, "db4")]) lock = multiprocessing.Lock() numpy.random.seed(datetime.datetime.now().microsecond)