def saveRatingMatrix():
    """
    Take the coauthor graph above and make vertices indexed from 0 then save
    as matrix market format.

    The comma separated edge list "erasm/edges2.txt" is read from the output
    directory, vertex ids are relabelled 0..n-1 in order of first appearance
    and a symmetric matrix of edge counts is accumulated. Only the vertices
    whose row has at least minCoauthors nonzero entries are kept, and the
    resulting submatrix is written to "erasm/R" with scipy.io.mmwrite.
    """
    edgeFileName = PathDefaults.getOutputDir() + "erasm/edges2.txt"

    logging.debug("Reading edge list")
    #numpy.int has been removed from NumPy, the builtin int is the same dtype.
    #ndmin=2 keeps a file holding a single edge as one row of two columns.
    edges = numpy.loadtxt(edgeFileName, delimiter=",", dtype=int, ndmin=2)
    logging.debug("Total number of edges: " + str(edges.shape[0]))

    #Map every original vertex id to a contiguous index, in order of first appearance
    vertexIdDict = {}
    for edge in edges:
        for vertexId in (edge[0], edge[1]):
            if vertexId not in vertexIdDict:
                vertexIdDict[vertexId] = len(vertexIdDict)

    n = len(vertexIdDict)
    R = scipy.sparse.lil_matrix((n, n))
    logging.debug("Creating sparse matrix")
    for edge in edges:
        rowInd = vertexIdDict[edge[0]]
        colInd = vertexIdDict[edge[1]]
        #Each edge is counted in both directions so that R stays symmetric
        R[rowInd, colInd] += 1
        R[colInd, rowInd] += 1

    logging.debug("Created matrix " + str(R.shape) + " with " + str(R.getnnz()) + " non zeros")

    R = R.tocsr()

    minCoauthors = 20
    logging.debug("Removing vertices with <" + str(minCoauthors) + " coauthors")
    #Count the nonzero entries of each row. The mask must have exactly one
    #entry per row: the previous code indexed an arange over the number of
    #nonzeros, whose length does not match the bincount output.
    rowCounts = numpy.bincount(R.nonzero()[0], minlength=R.shape[0])
    inds = numpy.arange(R.shape[0])[rowCounts >= minCoauthors]
    R = R[inds, :][:, inds]
    logging.debug("Matrix has shape " + str(R.shape) + " with " + str(R.getnnz()) + " non zeros")

    matrixFileName = PathDefaults.getOutputDir() + "erasm/R"
    scipy.io.mmwrite(matrixFileName, R)
    logging.debug("Wrote matrix to file " + matrixFileName)
def __init__(self, maxIter=None, iterStartTimeStamp=None):
    """
    Set up the file names and parameters of the dataset stored under
    "recommend/erasm/" and then process, split and load the ratings.

    :param maxIter: stored as self.maxIter; not interpreted here.

    :param iterStartTimeStamp: starting time stamp of the iterator. When None
        the default 1286229600 is used (presumably seconds since the epoch -
        TODO confirm).
    """
    outputDir = PathDefaults.getOutputDir() + "recommend/erasm/"

    #makedirs also creates the intermediate "recommend/" directory, whereas
    #mkdir fails when the parent is missing
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    #iterStartTimeStamp is the starting date of the iterator
    if iterStartTimeStamp is not None:
        self.iterStartTimeStamp = iterStartTimeStamp
    else:
        self.iterStartTimeStamp = 1286229600

    #Length of one step in seconds (30 days)
    self.timeStep = timedelta(30).total_seconds()

    self.ratingFileName = outputDir + "data.npz"
    self.userDictFileName = outputDir + "userIdDict.pkl"
    self.groupDictFileName = outputDir + "groupIdDict.pkl"
    self.isTrainRatingsFileName = outputDir + "is_train.npz"

    self.dataDir = PathDefaults.getDataDir() + "erasm/"
    self.dataFileName = self.dataDir + "groupMembers-29-11-12"

    self.maxIter = maxIter
    self.trainSplit = 4.0/5

    #These methods rely on the attributes assigned above
    self.processRatings()
    self.splitDataset()
    self.loadProcessedData()
def __init__(self, maxIter=None, iterStartTimeStamp=None):
    """
    Set up a training and test set based on the time each rating was made,
    using files stored under "recommend/erasm/" in the output directory.
    NOTE(review): the previous docstring referred to movielens although the
    paths used here are the erasm ones - confirm which dataset is meant.

    :param maxIter: stored as self.maxIter and logged when it is not None.

    :param iterStartTimeStamp: starting time stamp of the iterator. When None
        the default 789652009 is used (presumably seconds since the epoch -
        TODO confirm).
    """
    #Length of one step in seconds (30 days)
    self.timeStep = timedelta(30).total_seconds()

    #iterStartTimeStamp is the starting date of the iterator
    if iterStartTimeStamp is not None:
        self.iterStartTimeStamp = iterStartTimeStamp
    else:
        self.iterStartTimeStamp = 789652009

    outputDir = PathDefaults.getOutputDir() + "recommend/erasm/"

    self.numRatings = 402872
    self.minContacts = 10

    #makedirs also creates the intermediate "recommend/" directory, whereas
    #mkdir fails when the parent is missing
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    self.ratingFileName = outputDir + "data.npz"
    self.userDictFileName = outputDir + "userIdDict.pkl"
    self.isTrainRatingsFileName = outputDir + "is_train.npz"

    self.maxIter = maxIter
    self.trainSplit = 4.0 / 5

    #These methods rely on the attributes assigned above
    self.processRatings()
    self.splitDataset()
    self.loadProcessedData()

    if self.maxIter is not None:
        logging.debug("Maximum number of iterations: " + str(self.maxIter))
def testWriteToFile3(self):
    """
    Generate an Erdos-Renyi graph and then a small world graph over the same
    20 featureless vertices, writing each one to a Pajek file in the "test/"
    output directory.
    """
    vertexCount = 20
    featureCount = 0
    randomGraph = SparseGraph(VertexList(vertexCount, featureCount))

    #Erdos-Renyi graph with edge probability 0.1
    edgeProb = 0.1
    randomGraph = ErdosRenyiGenerator(edgeProb).generate(randomGraph)

    writer = PajekWriter()
    outDir = PathDefaults.getOutputDir() + "test/"
    writer.writeToFile(outDir + "erdosRenyi20", randomGraph)

    #Now write a small world graph, reusing the vertices of the first graph
    rewireProb = 0.2
    numNeighbours = 3
    randomGraph.removeAllEdges()
    smallWorld = SmallWorldGenerator(rewireProb, numNeighbours)
    randomGraph = smallWorld.generate(randomGraph)
    writer.writeToFile(outDir + "smallWorld20", randomGraph)
def __init__(self, trainXIteratorFunc, testXIteratorFunc, cmdLine=None, defaultAlgoArgs = None, dirName=""):
    """
    Store the iterator functions and default settings of the experiment.

    Priority of the argument values, from highest to lowest:
     - the command-line value
     - the set-by-function value
     - the class value
    """
    #Which methods to run: class defaults merged with those given by the user
    mergedArgs = RecommendExpHelper.newAlgoParams(defaultAlgoArgs)
    self.algoArgs = mergedArgs

    #Functions returning iterators over the training and test matrices
    self.trainXIteratorFunc = trainXIteratorFunc
    self.testXIteratorFunc = testXIteratorFunc

    #Print output once every logStep steps
    self.logStep = 10

    #Upper limit on the number of observations used for model selection
    self.sampleSize = 5 * 10 ** 6

    #Base directory of the results
    baseDir = PathDefaults.getOutputDir()
    self.resultsDir = baseDir + "recommend/" + dirName + "/"

    #Finally let the command line override the parameters
    self.readAlgoParams(cmdLine)
def __init__(self, YList, X, featuresName, ages, args):
    """
    Store the data of the experiment and build the lists of leaf rank
    generators, each paired with the name it is reported under.
    """
    super(MetabolomicsExpRunner, self).__init__(args=args)
    self.X = X
    #The list of concentrations
    self.YList = YList
    self.featuresName = featuresName
    self.args = args
    self.ages = ages

    #Forest and cross validation settings
    self.maxDepth = 10
    self.numTrees = 10
    self.sampleSize = 1.0
    self.sampleReplace = True
    self.folds = 5
    self.resultsDir = PathDefaults.getOutputDir() + "metabolomics/"

    #These generators are created here by calling generate()
    self.leafRankGenerators = [
        (LinearSvmGS.generate(), "SVM"),
        (SvcGS.generate(), "RBF-SVM"),
        (DecisionTree.generate(), "CART"),
    ]
    self.pcaLeafRankGenerators = [(LinearSvmPca.generate(), "LinearSVM-PCA")]

    #For these ones the generate function itself is stored, uncalled
    self.funcLeafRankGenerators = [
        (LinearSvmFGs.generate, "SVMF"),
        (SvcFGs.generate, "RBF-SVMF"),
        (DecisionTreeF.generate, "CARTF"),
    ]

    #Store all the label vectors and their missing values
    igf1Inds, cortisolInds, testoInds = MetabolomicsUtils.createIndicatorLabels(YList)
    self.hormoneInds = [igf1Inds, cortisolInds, testoInds]
    self.hormoneNames = MetabolomicsUtils.getLabelNames()
def testWriteToFile(self):
    """
    Write the undirected and directed dictionary graphs of the fixture to the
    "test/" output directory. The files have to be checked by hand.
    """
    writer = SimpleGraphWriter()
    outDir = PathDefaults.getOutputDir() + "test/"

    #Have to check the files
    targets = [("dictTestUndirected", "dctGraph1"), ("dictTestDirected", "dctGraph2")]
    for baseName, graphAttr in targets:
        outName = outDir + baseName
        #The graph is looked up only when it is about to be written
        writer.writeToFile(outName, getattr(self, graphAttr))
def getOutputFileName(graphType, p, k, infoProb):
    """
    Build the name of the SvmEgo output file for a graph type and its
    parameters. The k parameter is only part of the name for "SmallWorld"
    graphs.

    :raises ValueError: if graphType is neither "SmallWorld" nor "ErdosRenyi".
    """
    outputDirectory = PathDefaults.getOutputDir()

    if graphType not in ("SmallWorld", "ErdosRenyi"):
        raise ValueError("Invalid graph type: " + graphType)

    pieces = [outputDirectory, "SvmEgoOutput_type=", graphType, "_p=", str(p)]
    if graphType == "SmallWorld":
        pieces.extend(["_k=", str(k)])
    pieces.extend(["_q=", str(infoProb)])

    return "".join(pieces)
def __init__(self, df, X, featuresName, ages, args):
    """
    Store the data of the regression experiment, then derive the label names,
    the list of labels and the bounds using MetabolomicsUtils.
    """
    super(MetabolomicsRegExpRunner, self).__init__(args=args)

    #Inputs, kept as given
    self.df = df
    self.X = X
    self.featuresName = featuresName
    self.args = args
    self.ages = ages

    #Labels are extracted from df using the names given by the utility class
    labelNames = MetabolomicsUtils.getLabelNames()
    self.labelNames = labelNames
    self.YList = MetabolomicsUtils.createLabelList(df, labelNames)
    self.boundsList = MetabolomicsUtils.getBounds()

    self.resultsDir = PathDefaults.getOutputDir() + "metabolomics/"
def loadParams(ind):
    """
    Collect the ABC and simulation parameters for the theta with index ind.
    The real or the toy settings are used depending on processReal, which is
    read from the enclosing scope.

    :return: the tuple (N, resultsDir, outputDir, recordStep, startDate,
        endDate, prefix, targetGraph, breakSize, numEpsilons, M, matchAlpha,
        matchAlg, numInds)
    """
    if processReal:
        prefix = "Real"
        resultsDir = PathDefaults.getOutputDir() + "viroscopy/real/theta" + str(ind) + "/"
        N, matchAlpha, breakScale, numEpsilons, epsilon, minEpsilon, matchAlg, abcMaxRuns, batchSize, pertScale = HIVModelUtils.realABCParams(True)
        startDate, endDate, recordStep, M, targetGraph, numInds = HIVModelUtils.realSimulationParams(test=True, ind=ind)
        #The estimated theta is still requested although its values are not used here
        realTheta, sigmaTheta, pertTheta = HIVModelUtils.estimatedRealTheta(ind)
        #Replaces the numInds returned by realSimulationParams
        numInds = 2
    else:
        prefix = "Toy"
        resultsDir = PathDefaults.getOutputDir() + "viroscopy/toy/theta/"
        N, matchAlpha, breakScale, numEpsilons, epsilon, minEpsilon, matchAlg, abcMaxRuns, batchSize, pertScale = HIVModelUtils.toyABCParams()
        startDate, endDate, recordStep, M, targetGraph = HIVModelUtils.toySimulationParams(test=True)
        realTheta, sigmaTheta, pertTheta = HIVModelUtils.toyTheta()
        numInds = 1

    #The statistics always live in a sub-directory of the results
    outputDir = resultsDir + "stats/"

    #Difference in size between the removed subgraphs at the end and start dates, scaled by breakScale
    sizeAtEnd = targetGraph.subgraph(targetGraph.removedIndsAt(endDate)).size
    sizeAtStart = targetGraph.subgraph(targetGraph.removedIndsAt(startDate)).size
    breakSize = (sizeAtEnd - sizeAtStart) * breakScale

    return N, resultsDir, outputDir, recordStep, startDate, endDate, prefix, targetGraph, breakSize, numEpsilons, M, matchAlpha, matchAlg, numInds
def testWriteToFile(self):
    """
    Fill a DictGraph with 5 vertices carrying random features of length 3,
    write its vertices to "test/vertices" in the output directory and log the
    feature matrix.
    """
    vertexCount = 5
    featureCount = 3
    features = numpy.random.rand(vertexCount, featureCount)

    dictGraph = DictGraph()
    for vertexInd, featureRow in enumerate(features):
        dictGraph.setVertex(vertexInd, featureRow)

    outName = PathDefaults.getOutputDir() + "test/vertices"
    CsvVertexWriter().writeToFile(outName, dictGraph)

    logging.debug(features)
def __init__(self, iteratorFunc, cmdLine=None, defaultAlgoArgs = None, dirName=""):
    """
    Store the iterator function and default settings of the clustering
    experiment, then update the parameters from the command line.
    """
    #Which methods to run: class defaults merged with those given by the user
    mergedArgs = ClusterExpHelper.newAlgoParams(defaultAlgoArgs)
    self.algoArgs = mergedArgs

    #The dataset is accessed through this function
    self.getIteratorFunc = iteratorFunc

    #Print output once every logStep steps
    self.logStep = 10

    #Base directory of the results
    baseDir = PathDefaults.getOutputDir()
    self.resultsDir = baseDir + "cluster/" + dirName + "/"

    #Finally let the command line override the parameters
    self.readAlgoParams(cmdLine)
def __init__(self, maxIter=None, iterStartTimeStamp=None):
    """
    Return a training and test set for netflix based on the time each
    rating was made. There are 62 iterations.

    :param maxIter: stored as self.maxIter and logged when it is not None.

    :param iterStartTimeStamp: starting time stamp of the iterator. When None
        the local time stamp of the 1st of January 2001 is used.
    """
    #Length of one step in seconds (30 days)
    self.timeStep = timedelta(30).total_seconds()

    #startDate is used to convert dates into ints
    #self.startDate = datetime(1998,1,1)
    #self.endDate = datetime(2005,12,31)

    #iterStartTimeStamp is the starting date of the iterator
    if iterStartTimeStamp is not None:
        self.iterStartTimeStamp = iterStartTimeStamp
    else:
        self.iterStartTimeStamp = time.mktime(datetime(2001,1,1).timetuple())

    #Fixed sizes of the dataset
    self.startMovieID = 1
    self.endMovieID = 17770

    self.numMovies = 17770
    self.numRatings = 100480507
    self.numProbeMovies = 16938
    self.numProbeRatings = 1408395
    self.numCustomers = 480189

    outputDir = PathDefaults.getOutputDir() + "recommend/netflix/"

    #makedirs also creates the intermediate "recommend/" directory, whereas
    #mkdir fails when the parent is missing
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    self.ratingFileName = outputDir + "data.npz"
    self.custDictFileName = outputDir + "custIdDict.pkl"
    self.probeFileName = PathDefaults.getDataDir() + "netflix/probe.txt"
    self.testRatingsFileName = outputDir + "test_data.npz"
    self.isTrainRatingsFileName = outputDir + "is_train.npz"

    self.maxIter = maxIter
    self.trainSplit = 4.0/5

    #These methods rely on the attributes assigned above
    self.processRatings()
    #self.processProbe()
    self.splitDataset()
    self.loadProcessedData()

    if self.maxIter is not None:
        logging.debug("Maximum number of iterations: " + str(self.maxIter))
def recommend(learner):
    """
    Take a list of coauthors and read in the complete graph into a sparse
    matrix X such that X_ij = k means author i has worked with j, k times. Then
    do matrix factorisation on the resulting methods.

    The matrix is read from "erasm/Toy" in the output directory and only its
    first 50 rows are used. The nonzero entries are split into 5 cross
    validation folds, computeTestError is evaluated on every fold in a pool of
    worker processes and the mean test error is saved as
    "erasm/results_<learner name>".

    :param learner: the learner passed on to computeTestError; its name()
        method is used to name the results file.
    """
    outputDir = PathDefaults.getOutputDir() + "erasm/"
    matrixFileName = outputDir + "Toy"

    numExamples = 50
    numFolds = 5

    #The conversion already yields a CSR matrix so no further tocsr() is needed
    X = scipy.sparse.csr_matrix(scipy.io.mmread(matrixFileName))
    logging.debug("Loaded matrix " + str(X.shape) + " with " + str(X.getnnz()) + " non zeros")
    X = X[0:numExamples, :]
    X, maxS = preprocess(X)

    #Take out some ratings to form a training set
    rowInds, colInds = X.nonzero()
    randInds = numpy.random.permutation(rowInds.shape[0])
    indexList = Sampling.crossValidation(numFolds, rowInds.shape[0])

    paramList = []
    for trnIdx, tstIdx in indexList:
        trainInds = randInds[trnIdx]
        testInds = randInds[tstIdx]

        trainX = SparseUtils.selectMatrix(X, rowInds[trainInds], colInds[trainInds]).tocsr()
        testX = SparseUtils.selectMatrix(X, rowInds[testInds], colInds[testInds]).tocsr()

        paramList.append((trainX, testX, learner))

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    try:
        results = pool.map(computeTestError, paramList)
        #results = map(computeTestError, paramList)
    finally:
        #Always release the worker processes, the pool was previously left open
        pool.close()
        pool.join()

    testErrors = numpy.array(results)
    meanTestErrors = testErrors.mean()
    logging.debug("Test errors = " + str(meanTestErrors))

    errorFileName = outputDir + "results_" + learner.name()
    numpy.savez(errorFileName, meanTestErrors)
    logging.debug("Saved results as " + errorFileName)
def computeLearningRates(datasetNames, numProcesses, fileNameSuffix, learnerName, sampleSizes, foldsSet):
    """
    Compute a grid of learning rates for every dataset, realisation and sample
    size, and save one .npz file per dataset in the "modelPenalisation/" output
    directory. Datasets whose results file exists or is locked are skipped.

    :param datasetNames: a sequence of entries whose first item is the dataset
        name and whose second item is the number of realisations.

    :param numProcesses: passed on to getSetup.

    :param fileNameSuffix: appended to the dataset name to form the results
        file name.

    :param learnerName: passed on to getSetup to select the learner.

    :param sampleSizes: array of the numbers of training examples to sample.

    :param foldsSet: passed on to the learningRate method of the learner.
    """
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    #getSetup may return updated data and output directories
    learner, loadMethod, dataDir, outputDir, paramDict = getSetup(learnerName, dataDir, outputDir, numProcesses)

    for dataset in datasetNames:
        datasetName = dataset[0]
        logging.debug("Learning using dataset " + datasetName)
        outfileName = outputDir + datasetName + fileNameSuffix

        fileLock = FileLock(outfileName + ".npz")
        if fileLock.isLocked() or fileLock.fileExists():
            continue

        fileLock.lock()
        try:
            numRealisations = dataset[1]
            gridShape = [numRealisations, sampleSizes.shape[0]]
            gridShape.extend(list(learner.gridShape(paramDict)))
            gridShape = tuple(gridShape)

            betaGrids = numpy.zeros(gridShape)

            for k in range(sampleSizes.shape[0]):
                sampleSize = sampleSizes[k]

                logging.debug("Using sample size " + str(sampleSize))
                for j in range(numRealisations):
                    Util.printIteration(j, 1, numRealisations, "Realisation: ")
                    trainX, trainY, testX, testY = loadMethod(dataDir, datasetName, j)

                    #The seed is reset before every permutation so that the subsample is reproducible
                    numpy.random.seed(21)
                    trainInds = numpy.random.permutation(trainX.shape[0])[0:sampleSize]
                    validX = trainX[trainInds,:]
                    validY = trainY[trainInds]

                    betaGrids[j, k, :] = learner.learningRate(validX, validY, foldsSet, paramDict)

            numpy.savez(outfileName, betaGrids)
            logging.debug("Saved results as file " + outfileName + ".npz")
        finally:
            #Release the lock even if the computation fails, otherwise the
            #dataset stays locked and is skipped by every later run
            fileLock.unlock()
def testWriteToFile(self):
    """
    Write the dense, sparse and dictionary graphs of the fixture, undirected
    and directed, as Pajek files in the "test/" output directory. The files
    have to be checked by hand.
    """
    writer = PajekWriter()
    outDir = PathDefaults.getOutputDir() + "test/"

    #Have to check the files
    targets = [
        ("denseTestUndirected", "dGraph1"),
        ("denseTestDirected", "dGraph2"),
        ("sparseTestUndirected", "sGraph1"),
        ("sparseTestDirected", "sGraph2"),
        ("dictTestUndirected", "dctGraph1"),
        ("dictTestDirected", "dctGraph2"),
    ]
    for baseName, graphAttr in targets:
        #The graph is looked up only when it is about to be written
        writer.writeToFile(outDir + baseName, getattr(self, graphAttr))
def testWriteToFile2(self):
    """
    Write self.dGraph1 to four Pajek files, each time with exactly one custom
    function (vertex colour, vertex size, edge colour, edge size) installed
    on the writer. Every function is removed again after its file is written.
    """
    pw = PajekWriter()
    directory = PathDefaults.getOutputDir() + "test/"

    def setVertexColour(vertexIndex, graph):
        colours = ["grey05", "grey10", "grey15", "grey20", "grey25"]
        return colours[vertexIndex]

    def setVertexSize(vertexIndex, graph):
        return vertexIndex

    def setEdgeColour(vertexIndex1, vertexIndex2, graph):
        colours = ["grey05", "grey10", "grey15", "grey20", "grey25"]
        return colours[vertexIndex1]

    def setEdgeSize(vertexIndex1, vertexIndex2, graph):
        return vertexIndex1 + vertexIndex2

    pw.setVertexColourFunction(setVertexColour)
    fileName1 = directory + "vertexColourTest"
    pw.writeToFile(fileName1, self.dGraph1)
    pw.setVertexColourFunction(None)

    pw.setVertexSizeFunction(setVertexSize)
    fileName1 = directory + "vertexSizeTest"
    pw.writeToFile(fileName1, self.dGraph1)
    pw.setVertexSizeFunction(None)

    pw.setEdgeColourFunction(setEdgeColour)
    fileName1 = directory + "edgeColourTest"
    pw.writeToFile(fileName1, self.dGraph1)
    pw.setEdgeColourFunction(None)

    pw.setEdgeSizeFunction(setEdgeSize)
    fileName1 = directory + "edgeSizeTest"
    pw.writeToFile(fileName1, self.dGraph1)
    #Remove the edge size function; the edge colour function used to be
    #reset here a second time, leaving the size function installed
    pw.setEdgeSizeFunction(None)
def testWriteToFile2(self):
    """
    Exercise the custom vertex and edge functions of PajekWriter one at a
    time: install a function, write self.dGraph1 to its own file in the
    "test/" output directory, then remove the function again.
    """
    pw = PajekWriter()
    directory = PathDefaults.getOutputDir() + "test/"

    def setVertexColour(vertexIndex, graph):
        colours = ["grey05", "grey10", "grey15", "grey20", "grey25"]
        return colours[vertexIndex]

    def setVertexSize(vertexIndex, graph):
        return vertexIndex

    def setEdgeColour(vertexIndex1, vertexIndex2, graph):
        colours = ["grey05", "grey10", "grey15", "grey20", "grey25"]
        return colours[vertexIndex1]

    def setEdgeSize(vertexIndex1, vertexIndex2, graph):
        return vertexIndex1+vertexIndex2

    pw.setVertexColourFunction(setVertexColour)
    fileName1 = directory + "vertexColourTest"
    pw.writeToFile(fileName1, self.dGraph1)
    pw.setVertexColourFunction(None)

    pw.setVertexSizeFunction(setVertexSize)
    fileName1 = directory + "vertexSizeTest"
    pw.writeToFile(fileName1, self.dGraph1)
    pw.setVertexSizeFunction(None)

    pw.setEdgeColourFunction(setEdgeColour)
    fileName1 = directory + "edgeColourTest"
    pw.writeToFile(fileName1, self.dGraph1)
    pw.setEdgeColourFunction(None)

    pw.setEdgeSizeFunction(setEdgeSize)
    fileName1 = directory + "edgeSizeTest"
    pw.writeToFile(fileName1, self.dGraph1)
    #Fixed copy-paste slip: it is the edge size function which must be
    #removed here, not the edge colour function
    pw.setEdgeSizeFunction(None)
def __init__(self, maxIter=None, iterStartTimeStamp=None):
    """
    Return a training and test set for itemlens based on the time each
    rating was made. The processed files are kept under "recommend/Flixster/"
    in the output directory.

    :param maxIter: stored as self.maxIter and logged when it is not None.

    :param iterStartTimeStamp: starting time stamp of the iterator. When None
        the local time stamp of the 1st of January 2009 is used.
    """
    #Length of one step in seconds (30 days)
    self.timeStep = timedelta(30).total_seconds()

    #iterStartTimeStamp is the starting date of the iterator
    if iterStartTimeStamp is not None:
        self.iterStartTimeStamp = iterStartTimeStamp
    else:
        self.iterStartTimeStamp = time.mktime(datetime(2009,1,1).timetuple())

    self.numItems = 1560144
    #It says 13668319 on the site but that seems to be wrong
    self.numRatings = 8196072
    self.numCustomers = 71567

    outputDir = PathDefaults.getOutputDir() + "recommend/Flixster/"

    #makedirs also creates the intermediate "recommend/" directory, whereas
    #mkdir fails when the parent is missing
    if not os.path.exists(outputDir):
        os.makedirs(outputDir)

    self.ratingFileName = outputDir + "data.npz"
    self.custDictFileName = outputDir + "custIdDict.pkl"
    self.itemDictFileName = outputDir + "itemIdDict.pkl"
    self.isTrainRatingsFileName = outputDir + "is_train.npz"

    self.maxIter = maxIter
    self.trainSplit = 4.0/5

    #These methods rely on the attributes assigned above
    self.processRatings()
    self.splitDataset()
    self.loadProcessedData()

    if self.maxIter is not None:
        logging.debug("Maximum number of iterations: " + str(self.maxIter))
plt.xlabel("log(t)") plt.ylabel('Error') plt.legend(loc="lower left") plt.show() showCART = True showSVR = False from itertools import cycle lines = ["k-","k--","k-.","k:","k-x", "k-+"] linecycler = cycle(lines) if showSVR: outputDir = PathDefaults.getOutputDir() + "modelPenalisation/regression/SVR/" sampleSizes = numpy.array([50, 100, 200]) sampleMethods = ["CV"] cvScalings = numpy.arange(0.6, 1.61, 0.2) foldsSet = numpy.arange(2, 13, 2) datasetNames = ModelSelectUtils.getRegressionDatasets() fileNameSuffix = 'Results' summary(datasetNames, sampleSizes, foldsSet, cvScalings, sampleMethods, fileNameSuffix) plotDatasetNames = [datasetNames[7]] plotAlphas(plotDatasetNames, sampleSizes, foldsSet, cvScalings, sampleMethods, fileNameSuffix) sampleSizes = numpy.array([25, 50, 100]) sampleMethods = ["CV"] cvScalings = numpy.arange(0.6, 1.61, 0.2)
else: numProcesses = multiprocessing.cpu_count() if len(sys.argv) > 2: i = int(sys.argv[2]) else: i = 0 FORMAT = "%(levelname)s:root:%(process)d:%(message)s" logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format=FORMAT) logging.debug("Number of processes: " + str(numProcesses)) logging.debug("Epidemic period index " + str(i)) numpy.set_printoptions(suppress=True, precision=4, linewidth=150) numpy.seterr(invalid='raise') resultsDir = PathDefaults.getOutputDir() + "viroscopy/real/" startDate, endDate, recordStep, M, targetGraph, numInds = HIVModelUtils.realSimulationParams(ind=i) N, matchAlpha, breakScale, numEpsilons, epsilon, minEpsilon, matchAlg, abcMaxRuns, batchSize, pertScale = HIVModelUtils.realABCParams(i) logging.debug("Posterior sample size " + str(N)) logging.debug("Matching algorithm " + str(matchAlg)) logging.debug("="*10 + "Starting new simulation batch with index " + str(i) + "="*10) logging.debug("Total time of simulation is " + str(endDate-startDate)) breakSize = (targetGraph.subgraph(targetGraph.removedIndsAt(endDate)).size - targetGraph.subgraph(targetGraph.removedIndsAt(startDate)).size) * breakScale logging.debug("Largest acceptable graph is " + str(breakSize)) def createModel(t): """ The parameter t is the particle index.
k = 4 numGraphs = 100 #numGraphs = 20 nystromNs = [900] randSVDVecs = [100, 900] IASCL = [k, 300] # more than k is mostly useless (except l=graphSize): a priori, all the remaining directions are equivalent for the noise. So to catch changes implied by noise we have to keep all the directions. numClusterVertices = 250 numMethods = len(nystromNs) + len(randSVDVecs) + len(IASCL) + 3 errors = numpy.zeros((numGraphs, numMethods)) numRepetitions = 20 #numRepetitions = 1 saveResults = False resultsDir = PathDefaults.getOutputDir() + "cluster/" fileName = resultsDir + "ErrorBoundNystrom.npy" if saveResults: for r in range(numRepetitions): i = 0 iterator = BoundGraphIterator(changeEdges=50, numGraphs=numGraphs, numClusterVertices=numClusterVertices, numClusters=k, p=0.1) for W in iterator: print("i="+str(i)) L = GraphUtils.shiftLaplacian(W) if i == 0: initialL = L initialOmega, initialQ = numpy.linalg.eigh(L.todense()) inds = numpy.flipud(numpy.argsort(initialOmega))
from matplotlib import rc rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']}) rc('text', usetex=True) from apgl.util.PathDefaults import PathDefaults logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) #For now just print some results for a particular dataset #dataset = "MovieLensDataset" dataset = "NetflixDataset" #dataset = "FlixsterDataset" #dataset = "SyntheticDataset1" #dataset = "EpinionsDataset" outputDir = PathDefaults.getOutputDir() + "recommend/" + dataset + "/" plotStyles = ['k-', 'k--', 'k-.', 'r--', 'r-', 'g-', 'b-', 'b--', 'b-.', 'g--', 'g--', 'g-.', 'r-', 'r--', 'r-.'] methods = ["propack", "arpack", "rsvd", "rsvdUpdate2"] updateAlgs = ["initial", "zero"] #pq = [(10, 2), (50, 2), (10, 5)] pq = [(10, 3), (50, 2), (50, 3)] #fileNames = [outputDir + "ResultsSgdMf.npz"] #labels = ["SgdMf"] fileNames = [] labels = [] consise = True for method in methods:
from apgl.viroscopy.HIVGraphReader import HIVGraphReader

"""
This script computes some basic statistics on the growing infection graph.
"""

#Log everything from debug level upwards to standard output
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
numpy.set_printoptions(suppress=True, linewidth=100, precision=3)

#Read the HIV graph with undirected=False and without indicator features
undirected = False
hivReader = HIVGraphReader()
graph = hivReader.readHIVGraph(undirected, indicators=False)
fInds = hivReader.getNonIndicatorFeatureIndices()

#Directories for the figures and the results
figureDir = PathDefaults.getOutputDir() + "viroscopy/figures/infect/"
resultsDir = PathDefaults.getOutputDir() + "viroscopy/"

#The set of edges indexed by zeros is the contact graph
#The ones indexed by 1 is the infection graph
edgeTypeIndex1 = 0
edgeTypeIndex2 = 1
sGraphContact = graph.getSparseGraph(edgeTypeIndex1)
sGraphInfect = graph.getSparseGraph(edgeTypeIndex2)
#The statistics below are computed on the infection graph
sGraph = sGraphInfect
#sGraph = sGraph.subgraph(range(0, 500))

graphStats = GraphStatistics()
statsArray = graphStats.scalarStatistics(sGraph, False)
#Flag set here; how it is used is not visible in this part of the script
slowStats = True
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) plotHIV = False plotCitation = False plotBemol = True saveResults = False findEigs = False if plotHIV: def getIterator(): generator = HIVIterGenerator() return generator.getIterator() resultsDir = PathDefaults.getOutputDir() + "cluster/HIV/Stats/" if plotCitation: def getIterator(): maxGraphSize = None generator = CitationIterGenerator(maxGraphSize=maxGraphSize) return generator.getIterator() resultsDir = PathDefaults.getOutputDir() + "cluster/Citation/Stats/" if plotBemol: def getIterator(): dataDir = PathDefaults.getDataDir() + "cluster/" nbUser = 10000 # set to 'None' to have all users nbPurchasesPerIt = 500 # set to 'None' to take all the purchases per date
def saveAuthors():
    """
    Read a file of article metadata holding one JSON object per line, give
    every distinct author name an integer id in order of first appearance and
    write the coauthor edges to "edges.txt" in the output directory.

    Each output line is "id1, id2" for a pair of authors of the same article.
    Duplicate edges are written when two authors share several articles.
    The metadata is read from a hard coded local dump if it exists, and
    otherwise from the "erasm/" data directory.
    """
    path = "/local/dhanjalc/dataDump-28-11-12/"
    fileName = path + "articleMetadata500000"

    if not os.path.exists(fileName):
        path = PathDefaults.getDataDir() + "erasm/"
        fileName = path + "articleMetadata1000000"

    logging.debug("Loading article metadata from " + fileName)

    #Maps an author name to its vertex id; also used to test for known authors
    vertexIdDict = {}
    emptyAuthors = 0

    edgeFileName = PathDefaults.getOutputDir() + "edges.txt"

    #Both files are closed even if a line cannot be parsed. The edges are
    #written as they are found and the file object does the buffering; the
    #manual buffer used before was only flushed inside the loop, so the
    #last edges were never written.
    with open(fileName, 'r') as fileObj, open(edgeFileName, "w") as edgesFile:
        for lineInd, line in enumerate(fileObj):
            if lineInd % 1000 == 0:
                print("Line " + str(lineInd) + " Author " + str(len(vertexIdDict)) + " empty author strings " + str(emptyAuthors))

            articleMetaData = json.loads(line)

            if "authors" not in articleMetaData:
                continue

            coauthorList = []
            for author in articleMetaData["authors"]:
                authorString = "".join([author["forename"], " ", author["surname"]])
                authorString = authorString.strip()

                if len(authorString) == 0:
                    emptyAuthors += 1
                    continue

                if authorString not in vertexIdDict:
                    vertexIdDict[authorString] = len(vertexIdDict)
                coauthorList.append(authorString)

            #Note that we will have duplicate edges
            edgesFile.writelines(
                str(vertexIdDict[vId1]) + ", " + str(vertexIdDict[vId2]) + "\n"
                for vId1, vId2 in itertools.combinations(coauthorList, 2)
            )

    #Shallow size of the author dictionary, printed for information only
    print(sys.getsizeof(vertexIdDict))

    logging.debug("Saved edges as " + edgeFileName)
featureInds = numpy.arange(featureInds.shape[0])[featureInds] matcher = GraphMatch("PATH", alpha=0.5, featureInds=featureInds, useWeightM=False) graphMetrics = HIVGraphMetrics2(targetGraph, 1.0, matcher, float(endDate)) times, infectedIndices, removedIndices, graph = HIVModelUtils.simulate(thetaArray[i], startDate, endDate, recordStep, M, graphMetrics) times, vertexArray, removedGraphStats = HIVModelUtils.generateStatistics(graph, startDate, endDate, recordStep) stats = times, vertexArray, removedGraphStats, graphMetrics.dists, graphMetrics.graphDists, graphMetrics.labelDists Util.savePickle(stats, resultsFileName) if saveResults: for j, endDate in enumerate(endDates): resultsDir = PathDefaults.getOutputDir() + "viroscopy/real/theta" + str(j) + "/" outputDir = resultsDir + "stats/" logging.debug(resultsDir) newNumRecordSteps = numRecordSteps + 5 endDate += HIVModelUtils.realTestPeriods[j] recordStep = (endDate-startDate)/float(newNumRecordSteps) for i in range(maxT): thetaArray, distArray = loadThetaArray(N, resultsDir, i) if thetaArray.shape[0] == N: t = i thetaArray = loadThetaArray(N, resultsDir, t)[0] logging.debug(thetaArray)
def testGetOutputDir(self):
    """
    Print the output directory given by PathDefaults so that it can be
    checked by hand.
    """
    outputDir = PathDefaults.getOutputDir()
    print(outputDir)
#Generate a random 500 x 500 matrix X = (U*s).dot(V.T) from 50 columns of
#non-negative vectors and save it in matrix market format as "erasm/Toy"
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

numExamples = 500
rank = 50

#A is symmetric as it is a random matrix times its transpose
A = numpy.random.rand(numExamples, numExamples)
A = A.dot(A.T)
s, U = numpy.linalg.eig(A)
#NOTE(review): numpy.linalg.eig does not return the eigenvalues in a
#particular order, so these are the first rank columns as returned
U = U[:, 0:rank]
#Make sure result is non-negative by taking the absolute value of singular vectors
U = numpy.abs(U)

#The same construction is repeated for the right hand side factor
B = numpy.random.rand(numExamples, numExamples)
B = B.dot(B.T)
s, V = numpy.linalg.eig(B)
V = V[:, 0:rank]
V = numpy.abs(V)

#The eigenvalues are discarded, the columns are scaled by random values in [0, 1)
s = numpy.random.rand(rank)
X = (U*s).dot(V.T)

#Save matrix
outputDir = PathDefaults.getOutputDir() + "erasm/"
fileName = outputDir + "Toy"
scipy.io.mmwrite(fileName, X)
logging.debug("Saved to file " + fileName + ".mtx")
def __init__(self, dataDict, YCortisol, YTesto, YIgf1, ages, numProcesses=1, runCortisol=True, runTestosterone=True, runIGF1=True):
    """
    Create a new object to run the metabolomics experiments. All the
    learners and their model selection parameter grids are created here, but
    every run* flag of the learners is initially False.

    :param dataDict: stored as self.dataDict; not interpreted here.

    :param YCortisol: the labels used for the "Cortisol" entry of hormoneDict.

    :param YTesto: the labels used for the "Testosterone" entry of hormoneDict.

    :param YIgf1: the labels used for the "IGF1" entry of hormoneDict.

    :param ages: stored as self.ages; not interpreted here.

    :param numProcesses: the number of processes given to each learner.

    :param runCortisol: whether the Cortisol labels are put in hormoneDict.

    :param runTestosterone: whether the Testosterone labels are put in hormoneDict.

    :param runIGF1: whether the IGF1 labels are put in hormoneDict.
    """
    self.dataDict = dataDict

    #Flags stating which learners to run
    self.runCartTreeRank = False
    self.runRbfSvmTreeRank = False
    self.runL1SvmTreeRank = False
    self.runCartTreeRankForest = False
    self.runRbfSvmTreeRankForest = False
    self.runL1SvmTreeRankForest = False
    self.runRankBoost = False
    self.runRankSVM = False

    self.runCortisol = runCortisol
    self.runTestosterone = runTestosterone
    self.runIGF1 = runIGF1

    self.YCortisol = YCortisol
    self.YTesto = YTesto
    self.YIgf1 = YIgf1
    self.ages = ages

    self.outerFolds = 3
    self.innerFolds = 5
    self.leafRankFolds = 3
    self.resultsDir = PathDefaults.getOutputDir() + "metabolomics/"
    self.numProcesses = numProcesses

    #General params
    #numpy.float has been removed from NumPy, the builtin float is the same dtype
    Cs = 2.0**numpy.arange(-5, 7, 2, dtype=float)
    gammas = 2.0**numpy.arange(-5, 3, 2, dtype=float)
    depths = numpy.array([2, 4, 8])
    numTrees = 20
    sampleSize = 1.0
    maxDepth = 10
    featureSize = 0.5

    #CART TreeRank
    leafRankParamDict = {}
    leafRankParamDict["setMaxDepth"] = depths
    leafRankLearner = DecisionTree(leafRankParamDict, self.leafRankFolds)
    self.cartTreeRank = TreeRank(leafRankLearner, numProcesses=numProcesses)
    self.cartTreeRankParams = {}
    self.cartTreeRankParams["setMaxDepth"] = depths

    #RBF SVM TreeRank
    leafRankParamDict = {}
    leafRankParamDict["setC"] = Cs
    leafRankParamDict["setGamma"] = gammas
    leafRankLearner = SVMLeafRank(leafRankParamDict, self.leafRankFolds)
    leafRankLearner.setKernel("rbf")
    leafRankLearner.processes = 1
    self.rbfSvmTreeRank = TreeRank(leafRankLearner, numProcesses=numProcesses)
    self.rbfSvmTreeRankParams = {}
    self.rbfSvmTreeRankParams["setMaxDepth"] = depths

    #Linear L1 SVM TreeRank
    leafRankParamDict = {}
    leafRankParamDict["setC"] = Cs
    leafRankLearner = SVMLeafRank(leafRankParamDict, self.leafRankFolds)
    leafRankLearner.setKernel("linear")
    leafRankLearner.setPenalty("l1")
    leafRankLearner.processes = 1
    self.l1SvmTreeRank = TreeRank(leafRankLearner, numProcesses=numProcesses)
    self.l1SvmTreeRankParams = {}
    self.l1SvmTreeRankParams["setMaxDepth"] = depths

    #CART TreeRankForest
    leafRankParamDict = {}
    leafRankParamDict["setMaxDepth"] = depths
    leafRankLearner = DecisionTree(leafRankParamDict, self.leafRankFolds)
    leafRankLearner.processes = 1
    self.cartTreeRankForest = TreeRankForest(leafRankLearner, numProcesses=numProcesses)
    self.cartTreeRankForest.setNumTrees(numTrees)
    self.cartTreeRankForest.setSampleSize(sampleSize)
    self.cartTreeRankForest.setFeatureSize(featureSize)
    self.cartTreeRankForestParams = {}
    self.cartTreeRankForestParams["setMaxDepth"] = numpy.array([maxDepth])
    self.cartTreeRankForestParams["setSampleSize"] = numpy.array([0.5, 0.75, 1.0])
    self.cartTreeRankForestParams["setFeatureSize"] = numpy.array([0.5, 0.75, 1.0])

    #RBF SVM TreeRankForest
    leafRankParamDict = {}
    leafRankParamDict["setC"] = Cs
    leafRankParamDict["setGamma"] = gammas
    leafRankLearner = SVMLeafRank(leafRankParamDict, self.leafRankFolds)
    leafRankLearner.setKernel("rbf")
    leafRankLearner.processes = 1
    self.rbfSvmTreeRankForest = TreeRankForest(leafRankLearner, numProcesses=numProcesses)
    self.rbfSvmTreeRankForest.setNumTrees(numTrees)
    self.rbfSvmTreeRankForest.setSampleSize(sampleSize)
    self.rbfSvmTreeRankForest.setFeatureSize(featureSize)
    self.rbfSvmTreeRankForestParams = {}
    self.rbfSvmTreeRankForestParams["setMaxDepth"] = numpy.array([maxDepth])
    self.rbfSvmTreeRankForestParams["setSampleSize"] = numpy.array([0.5, 0.75, 1.0])
    self.rbfSvmTreeRankForestParams["setFeatureSize"] = numpy.array([0.5, 0.75, 1.0])

    #L1 SVM TreeRankForest
    leafRankParamDict = {}
    leafRankParamDict["setC"] = Cs
    leafRankLearner = SVMLeafRank(leafRankParamDict, self.leafRankFolds)
    leafRankLearner.setKernel("linear")
    leafRankLearner.setPenalty("l1")
    leafRankLearner.processes = 1
    self.l1SvmTreeRankForest = TreeRankForest(leafRankLearner, numProcesses=numProcesses)
    self.l1SvmTreeRankForest.setNumTrees(numTrees)
    self.l1SvmTreeRankForest.setSampleSize(sampleSize)
    self.l1SvmTreeRankForest.setFeatureSize(featureSize)
    self.l1SvmTreeRankForestParams = {}
    self.l1SvmTreeRankForestParams["setMaxDepth"] = numpy.array([maxDepth])
    self.l1SvmTreeRankForestParams["setSampleSize"] = numpy.array([0.5, 0.75, 1.0])
    self.l1SvmTreeRankForestParams["setFeatureSize"] = numpy.array([0.5, 0.75, 1.0])

    #RankBoost
    self.rankBoost = RankBoost(numProcesses=numProcesses)
    self.rankBoostParams = {}
    self.rankBoostParams["setIterations"] = numpy.array([10, 50, 100])
    self.rankBoostParams["setLearners"] = numpy.array([5, 10, 20])

    #RankSVM
    self.rankSVM = RankSVM(numProcesses=numProcesses)
    self.rankSVM.setKernel("rbf")
    self.rankSVMParams = {}
    self.rankSVMParams["setC"] = 2.0**numpy.arange(0, 3, dtype=float)
    self.rankSVMParams["setGamma"] = 2.0**numpy.arange(-3, 0, dtype=float)

    #Store all the label vectors and their missing values
    self.hormoneDict = {}
    if self.runCortisol:
        self.hormoneDict["Cortisol"] = YCortisol
    if self.runTestosterone:
        self.hormoneDict["Testosterone"] = YTesto
    if self.runIGF1:
        self.hormoneDict["IGF1"] = YIgf1
"""
Plot the ROC curves for the metabolomics experiment.
"""
import sys
import numpy
import logging
import matplotlib.pyplot as plt
from apgl.util.Util import Util
from apgl.util.PathDefaults import PathDefaults
from apgl.util.Latex import Latex

#Only warnings and errors are logged to standard output
logging.basicConfig(stream=sys.stdout, level=logging.WARN)

#The figures are kept in a sub-directory of the results
resultsDir = PathDefaults.getOutputDir() + "metabolomics/"
figureDir = resultsDir + "figures/"

#Three indexed label names for each of the three hormones
labelNames = ["Testosterone.val_0", "Testosterone.val_1", "Testosterone.val_2"]
labelNames.extend(["Cortisol.val_0", "Cortisol.val_1", "Cortisol.val_2"])
labelNames.extend(["IGF1.val_0", "IGF1.val_1", "IGF1.val_2"])
#The same hormones without the index suffix
labelNames2 = ["Testosterone.val", "Cortisol.val", "IGF1.val"]

#The algorithms and leaf rank methods whose results are considered
algorithmNames = ["TreeRank"]
#algorithmNames = ["TreeRankForest"]
leafRankNames = ["CART", "SVM", "RBF-SVM", "LinearSVM-PCA"]
#leafRankNames = ["CARTF", "SVMF", "RBF-SVMF"]

#The data types, with the two "Db" entries appended at the end
dataTypes = ["raw_std", "log", "opls"]
#dataTypes = []
Ns = [10, 25, 50, 75, 100]

dataTypes.append("Db4")
dataTypes.append("Db8")
def _toyGridErrorAndParams(bestSVM, testX, gridPoints, pdfX, pdfY1X, pdfYminus1X):
    """
    Score a fitted SVM on the test grid.

    Returns a pair (error, paramArray) where error is the value of
    ModelSelectUtils.bayesError for the decision grid of bestSVM and
    paramArray is numpy.array([bestSVM.getC(), bestSVM.getKernelParams()]).
    """
    predY, decisionsY = bestSVM.predict(testX, True)
    gridSize = gridPoints.shape[0]
    # testX is built with its first coordinate varying fastest, hence the
    # column-major reshape back to a (gridSize, gridSize) grid.
    decisionGrid = numpy.reshape(decisionsY, (gridSize, gridSize), order="F")
    error = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)
    return error, numpy.array([bestSVM.getC(), bestSVM.getKernelParams()])


def runToyExp(datasetNames, sampleSizes, foldsSet, cvScalings, sampleMethods, numProcesses, fileNameSuffix):
    """
    Run the toy model penalisation experiment with an RBF SVM and save one
    results file per (dataset, sampling method) pair. A pair is skipped when
    its results file already exists or is locked by another process.

    :param datasetNames: sequence of items whose element 0 is the dataset name
        (the data is read from <dataDir>/<name>.npz) and element 1 the number
        of realisations to run.
    :param sampleSizes: 1D array of training sample sizes.
    :param foldsSet: 1D array of fold counts.
    :param cvScalings: array of penalisation scalings; each is multiplied by
        (folds - 1) and a BIC-style constant is prepended.
    :param sampleMethods: sequence of items whose element 0 is a string used
        in the output file name and element 1 a callable invoked as
        sampleMethod(folds, numExamples) to produce the fold indices.
    :param numProcesses: passed to LibSVM as the number of processes.
    :param fileNameSuffix: string appended to the output file name.

    The saved .npz contains, in order: errors, params, mean and std of the
    error grids, mean and std of the ideal grids, mean and std of the
    approximation grids (means/stds are taken over realisations).
    """
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    # This instance is only used to size the parameter grids
    svm = LibSVM()
    numCs = svm.getCs().shape[0]
    numGammas = svm.getGammas().shape[0]
    # Method 0 is V-fold CV, method 1 the BIC penalty, then one per scaling
    numMethods = 1 + (1 + cvScalings.shape[0])
    numParams = 2

    runIdeal = True
    runCv = True
    runVfpen = True

    for datasetInfo in datasetNames:
        datasetName = datasetInfo[0]
        numRealisations = datasetInfo[1]
        logging.debug("Learning using dataset " + datasetName)

        for sampleInfo in sampleMethods:
            sampleMethod = sampleInfo[1]
            outfileName = outputDir + datasetName + sampleInfo[0] + fileNameSuffix
            fileLock = FileLock(outfileName + ".npz")

            if fileLock.isLocked() or fileLock.fileExists():
                logging.debug("Results already computed")
                continue

            fileLock.lock()
            # Always release the lock, even if the experiment fails part way,
            # so that a later run is not blocked by a stale lock.
            try:
                baseShape = (numRealisations, len(sampleSizes), foldsSet.shape[0])
                errors = numpy.zeros(baseShape + (numMethods,))
                params = numpy.zeros(baseShape + (numMethods, numParams))
                errorGrids = numpy.zeros(baseShape + (numMethods, numCs, numGammas))
                approxGrids = numpy.zeros(baseShape + (numMethods, numCs, numGammas))
                idealGrids = numpy.zeros(baseShape + (numCs, numGammas))

                # Close the archive as soon as the arrays have been read
                with numpy.load(dataDir + datasetName + ".npz") as data:
                    gridPoints, trainX, trainY, pdfX, pdfY1X, pdfYminus1X = (
                        data["arr_0"],
                        data["arr_1"],
                        data["arr_2"],
                        data["arr_3"],
                        data["arr_4"],
                        data["arr_5"],
                    )

                # We form a test set from the grid points
                gridSize = gridPoints.shape[0]
                testX = numpy.zeros((gridSize ** 2, 2))
                for g in range(gridSize):
                    testX[g * gridSize:(g + 1) * gridSize, 0] = gridPoints
                    testX[g * gridSize:(g + 1) * gridSize, 1] = gridPoints[g]

                for j in range(numRealisations):
                    Util.printIteration(j, 1, numRealisations, "Realisation: ")

                    for k, sampleSize in enumerate(sampleSizes):
                        for m, folds in enumerate(foldsSet):
                            logging.debug("Using sample size " + str(sampleSize) + " and " + str(folds) + " folds")
                            # Draw a fresh random subsample of the training set
                            perm = numpy.random.permutation(trainX.shape[0])
                            trainInds = perm[0:sampleSize]
                            validX = trainX[trainInds, :]
                            validY = trainY[trainInds]

                            svm = LibSVM(processes=numProcesses)

                            # Find ideal penalties
                            if runIdeal:
                                logging.debug("Finding ideal grid of penalties")
                                idealGrids[j, k, m, :, :] = parallelPenaltyGridRbf(
                                    svm, validX, validY, testX, gridPoints, pdfX, pdfY1X, pdfYminus1X
                                )

                            # Cross validation
                            if runCv:
                                logging.debug("Running V-fold cross validation")
                                methodInd = 0
                                idx = sampleMethod(folds, validY.shape[0])
                                bootstrap = sampleMethod == Sampling.bootstrap

                                bestSVM, cvGrid = svm.parallelVfcvRbf(validX, validY, idx, True, bootstrap)
                                errors[j, k, m, methodInd], params[j, k, m, methodInd, :] = _toyGridErrorAndParams(
                                    bestSVM, testX, gridPoints, pdfX, pdfY1X, pdfYminus1X
                                )
                                errorGrids[j, k, m, methodInd, :, :] = cvGrid

                            # v fold penalisation
                            if runVfpen:
                                logging.debug("Running penalisation")
                                # BIC penalisation
                                Cv = float((folds - 1) * numpy.log(validX.shape[0]) / 2)
                                tempCvScalings = cvScalings * (folds - 1)
                                tempCvScalings = numpy.insert(tempCvScalings, 0, Cv)

                                # Use cross validation
                                idx = sampleMethod(folds, validY.shape[0])
                                svmGridResults = svm.parallelVfPenRbf(validX, validY, idx, tempCvScalings)

                                for n in range(len(tempCvScalings)):
                                    bestSVM, trainErrors, approxGrid = svmGridResults[n]
                                    # Index 0 is reserved for plain cross validation
                                    methodInd = n + 1
                                    errors[j, k, m, methodInd], params[j, k, m, methodInd, :] = _toyGridErrorAndParams(
                                        bestSVM, testX, gridPoints, pdfX, pdfY1X, pdfYminus1X
                                    )
                                    errorGrids[j, k, m, methodInd, :, :] = trainErrors + approxGrid
                                    approxGrids[j, k, m, methodInd, :, :] = approxGrid

                meanErrors = numpy.mean(errors, 0)
                print(meanErrors)

                meanParams = numpy.mean(params, 0)
                print(meanParams)

                meanErrorGrids = numpy.mean(errorGrids, 0)
                stdErrorGrids = numpy.std(errorGrids, 0)

                meanIdealGrids = numpy.mean(idealGrids, 0)
                stdIdealGrids = numpy.std(idealGrids, 0)

                meanApproxGrids = numpy.mean(approxGrids, 0)
                stdApproxGrids = numpy.std(approxGrids, 0)

                numpy.savez(
                    outfileName,
                    errors,
                    params,
                    meanErrorGrids,
                    stdErrorGrids,
                    meanIdealGrids,
                    stdIdealGrids,
                    meanApproxGrids,
                    stdApproxGrids,
                )
                logging.debug("Saved results as file " + outfileName + ".npz")
            finally:
                fileLock.unlock()

    logging.debug("All done!")