def load(cls, filename): """ Load the graph object from the corresponding file. Data is loaded in a zip format as created using save(). :param filename: The name of the file to load. :type filename: :class:`str` :returns: A graph corresponding to the one saved in filename. """ Parameter.checkClass(filename, str) import zipfile (path, filename) = os.path.split(filename) if path == "": path = "./" tempPath = tempfile.mkdtemp() originalPath = os.getcwd() try: os.chdir(path) myzip = zipfile.ZipFile(filename + '.zip', 'r') myzip.extractall(tempPath) myzip.close() os.chdir(tempPath) #Deal with legacy files try: W = cls.loadMatrix(cls._wFilename) metaDict = Util.loadPickle(cls._metaFilename) vList = globals()[metaDict["vListType"]].load(cls._verticesFilename) undirected = metaDict["undirected"] except IOError: W = cls.loadMatrix(filename + cls._matExt) vList = VertexList.load(filename) undirected = Util.loadPickle(filename + cls._boolExt) graph = cls(vList, undirected) graph.W = W for tempFile in myzip.namelist(): os.remove(tempFile) finally: os.chdir(originalPath) os.rmdir(tempPath) return graph
def plotOtherStats(): #Let's look at geodesic distances in subgraphs and communities logging.info("Computing other stats") resultsFileName = resultsDir + "ContactGrowthOtherStats.pkl" hivGraphStats = HIVGraphStatistics(fInds) if saveResults: statsArray = hivGraphStats.sequenceScalarStats(sGraph, subgraphIndicesList) #statsArray["dayList"] = absDayList Util.savePickle(statsArray, resultsFileName, True) else: statsArray = Util.loadPickle(resultsFileName) #Just load the harmonic geodesic distances of the full graph resultsFileName = resultsDir + "ContactGrowthScalarStats.pkl" statsArray2 = Util.loadPickle(resultsFileName) global plotInd msmGeodesic = statsArray[:, hivGraphStats.msmGeodesicIndex] msmGeodesic[msmGeodesic < 0] = 0 msmGeodesic[msmGeodesic == float('inf')] = 0 #Output all the results into plots plt.figure(plotInd) plt.plot(absDayList, msmGeodesic, 'k-', absDayList, statsArray[:, hivGraphStats.mostConnectedGeodesicIndex], 'k--') plt.xticks(locs, labels) #plt.ylim([0, 0.1]) plt.xlabel("Year") plt.ylabel("Mean harmonic geodesic distance") plt.legend(("MSM individuals", "Top 10% degree"), loc="upper right") plt.savefig(figureDir + "MSM10Geodesic" + ".eps") plotInd += 1 plt.figure(plotInd) plt.plot(absDayList, statsArray2[:, graphStats.harmonicGeoDistanceIndex], 'k-', absDayList, statsArray[:, hivGraphStats.menSubgraphGeodesicIndex], 'k--') plt.xticks(locs, labels) plt.ylim([0, 200.0]) plt.xlabel("Year") plt.ylabel("Mean harmonic geodesic distance") plt.legend(("All individuals", "Men subgraph"), loc="upper right") plt.savefig(figureDir + "MenSubgraphGeodesic" + ".eps") plotInd += 1
def load(filename): """ Load this object from filename.pkl. :param filename: The name of the file to load. :type filename: :class:`str` """ V = Util.loadPickle(filename + GeneralVertexList.ext) vList = GeneralVertexList(len(V)) vList.V = V return vList
def loadResultsFile(fileName): resultsDict = Util.loadPickle(fileName) allMetrics = resultsDict["allMetrics"] bestAUCs = allMetrics[2] trainROCs = allMetrics[1] testROCs = allMetrics[3] logging.debug("Mean AUC is " + str(numpy.mean(bestAUCs))) logging.debug("Best parameters are " + str(resultsDict["bestParams"])) meanAUC = numpy.mean(bestAUCs) stdAUC = numpy.std(bestAUCs) return meanAUC, stdAUC, trainROCs, testROCs
def coauthorsGraph(self, field, relevantExperts): """ Using the relevant authors we find all coauthors. """ if not os.path.exists(self.getCoauthorsFilename(field)) or self.overwriteGraph: logging.debug("Finding coauthors of relevant experts") graph, authorIndexer = self.coauthorsGraphFromAuthors(set(relevantExperts)) logging.debug(graph.summary()) Util.savePickle([graph, authorIndexer], self.getCoauthorsFilename(field), debug=True) else: logging.debug("Files already generated: " + self.getCoauthorsFilename(field)) graph, authorIndexer = Util.loadPickle(self.getCoauthorsFilename(field)) return graph, authorIndexer
def findSimilarDocumentsLSI(self, field): """ We use LSI from gensim in this case """ self.computeLSI() self.loadVectoriser() lsi, index = Util.loadPickle(self.modelFilename) newX = self.vectoriser.transform([field]) newX = [(i, newX[0, i])for i in newX.nonzero()[1]] result = lsi[newX] similarities = index[result] expertsByDocSimilarity, expertsByCitations = self.expertsFromDocSimilarities(similarities, len(self.trainExpertDict[field]), field) return expertsByDocSimilarity, expertsByCitations
def findSimilarDocumentsLDA(self, field): """ We use LDA in this case """ self.computeLDA() self.loadVectoriser() lda, index = Util.loadPickle(self.modelFilename) newX = self.vectoriser.transform([field]) newX = [(i, newX[0, i])for i in newX.nonzero()[1]] result = lda[newX] #Cosine similarity similarities = index[result] expertsByDocSimilarity, expertsByCitations = self.expertsFromDocSimilarities(similarities, len(self.trainExpertDict[field]), field) logging.debug("Number of relevant authors : " + str(len(expertsByDocSimilarity))) return expertsByDocSimilarity, expertsByCitations
def plotVectorStats(): #Finally, compute some vector stats at various points in the graph logging.info("Computing vector stats") global plotInd resultsFileName = resultsDir + "InfectGrowthVectorStats.pkl" if saveResults: statsDictList = graphStats.sequenceVectorStats(sGraph, subgraphIndicesList2, True) Util.savePickle(statsDictList, resultsFileName, True) else: statsDictList = Util.loadPickle(resultsFileName) treeSizesDistArray = numpy.zeros((len(dayList2), 3000)) treeDepthsDistArray = numpy.zeros((len(dayList2), 100)) numVerticesEdgesArray = numpy.zeros((len(dayList2), 2), numpy.int) numVerticesEdgesArray[:, 0] = [len(sgl) for sgl in subgraphIndicesList2] numVerticesEdgesArray[:, 1] = [sGraph.subgraph(sgl).getNumEdges() for sgl in subgraphIndicesList2] for j in range(len(dayList2)): dateStr = (str(DateUtils.getDateStrFromDay(dayList2[j], startYear))) logging.info(dateStr) statsDict = statsDictList[j] degreeDist = statsDict["outDegreeDist"] degreeDist = degreeDist/float(numpy.sum(degreeDist)) maxEigVector = statsDict["maxEigVector"] maxEigVector = numpy.flipud(numpy.sort(numpy.abs(maxEigVector))) maxEigVector = numpy.log(maxEigVector[maxEigVector>0]) treeSizesDist = statsDict["treeSizesDist"] treeSizesDist = numpy.array(treeSizesDist, numpy.float64)/numpy.sum(treeSizesDist) treeSizesDistArray[j, 0:treeSizesDist.shape[0]] = treeSizesDist treeDepthsDist = statsDict["treeDepthsDist"] #treeDepthsDist = numpy.array(treeDepthsDist, numpy.float64)/numpy.sum(treeDepthsDist) treeDepthsDist = numpy.array(treeDepthsDist, numpy.float64) treeDepthsDistArray[j, 0:treeDepthsDist.shape[0]] = treeDepthsDist plotInd2 = plotInd plt.figure(plotInd2) plt.plot(numpy.arange(degreeDist.shape[0]), degreeDist, label=dateStr) plt.xlabel("Degree") plt.ylabel("Probability") plt.ylim((0, 0.8)) plt.legend() plt.savefig(figureDir + "DegreeDist" + ".eps") plotInd2 += 1 plt.figure(plotInd2) plt.scatter(numpy.arange(treeSizesDist.shape[0])[treeSizesDist!=0], numpy.log(treeSizesDist[treeSizesDist!=0]), s=30, c=plotStyles2[j][0], label=dateStr) plt.xlabel("Size") plt.ylabel("log(probability)") plt.xlim((0, 125)) plt.legend() plt.savefig(figureDir + "TreeSizeDist" + ".eps") plotInd2 += 1 plt.figure(plotInd2) plt.scatter(numpy.arange(treeDepthsDist.shape[0])[treeDepthsDist!=0], numpy.log(treeDepthsDist[treeDepthsDist!=0]), s=30, c=plotStyles2[j][0], label=dateStr) plt.xlabel("Depth") plt.ylabel("log(probability)") plt.xlim((0, 15)) plt.legend() plt.savefig(figureDir + "TreeDepthDist" + ".eps") plotInd2 += 1 dateStrList = [DateUtils.getDateStrFromDay(day, startYear) for day in dayList2] precision = 4 treeSizesDistArray = treeSizesDistArray[:, 0:treeSizesDist.shape[0]] nonZeroCols = numpy.sum(treeSizesDistArray, 0)!=0 print((Latex.array1DToRow(numpy.arange(treeSizesDistArray.shape[1])[nonZeroCols]))) print((Latex.array2DToRows(treeSizesDistArray[:, nonZeroCols]))) print("Tree depths") treeDepthsDistArray = treeDepthsDistArray[:, 0:treeDepthsDist.shape[0]] nonZeroCols = numpy.sum(treeDepthsDistArray, 0)!=0 print((Latex.array1DToRow(numpy.arange(treeDepthsDistArray.shape[1])[nonZeroCols]))) print((Latex.array2DToRows(treeDepthsDistArray[:, nonZeroCols]))) print(numpy.sum(treeDepthsDistArray[:, 0:3], 1)) print("Edges and verticies") print(Latex.listToRow(dateStrList)) print(Latex.array2DToRows(numVerticesEdgesArray.T, precision))
def plotScalarStats(): logging.info("Computing scalar stats") resultsFileName = resultsDir + "InfectGrowthScalarStats.pkl" if saveResults: statsArray = graphStats.sequenceScalarStats(sGraph, subgraphIndicesList, treeStats=True) Util.savePickle(statsArray, resultsFileName, True) else: statsArray = Util.loadPickle(resultsFileName) global plotInd #Output all the results into plots #Take the mean of the results over the configuration model graphs resultsFileNameBase = resultsDir + "ConfigInfectGraphScalarStats" numGraphs = len(subgraphIndicesList) configStatsArrays = numpy.zeros((numGraphs, graphStats.getNumStats(), numConfigGraphs)) for j in range(numConfigGraphs): resultsFileName = resultsFileNameBase + str(j) configStatsArrays[:, :, j] = Util.loadPickle(resultsFileName) configStatsArray = numpy.mean(configStatsArrays, 2) configStatsStd = numpy.std(configStatsArrays, 2) #Make sure we don't include 0 in the array vertexIndex = numpy.argmax(statsArray[:, graphStats.numVerticesIndex] > 0) edgeIndex = numpy.argmax(statsArray[:, graphStats.numEdgesIndex] > 0) minIndex = numpy.maximum(vertexIndex, edgeIndex) def plotRealConfigError(index, styleReal, styleConfig, realLabel, configLabel): plt.hold(True) plt.plot(absDayList, statsArray[:, index], styleReal, label=realLabel) #errors = numpy.c_[configStatsArray[:, index]-configStatsMinArray[:, index] , configStatsMaxArray[:, index]-configStatsArray[:, index]].T errors = numpy.c_[configStatsStd[:, index], configStatsStd[:, index]].T plt.plot(absDayList, configStatsArray[:, index], styleConfig, label=configLabel) plt.errorbar(absDayList, configStatsArray[:, index], errors, linewidth=0, elinewidth=0, label="_nolegend_", ecolor=styleConfig[0]) xmin, xmax = plt.xlim() plt.xlim((0, xmax)) ymin, ymax = plt.ylim() plt.ylim((0, ymax)) plt.figure(plotInd) plt.plot(numpy.log(statsArray[minIndex:, graphStats.numVerticesIndex]), numpy.log(statsArray[minIndex:, graphStats.numEdgesIndex])) plt.xlabel("log(|V|)") plt.ylabel("log(|E|)") plt.savefig(figureDir + "LogVerticesEdgesGrowth.eps") plotInd += 1 plt.figure(plotInd) #plt.plot(absDayList, statsArray[:, graphStats.numTreesIndex], plotStyles3[0], label="Trees Size >= 1") #plt.plot(absDayList, statsArray[:, graphStats.numNonSingletonTreesIndex], plotStyles3[1], label="Trees Size >= 2") plotRealConfigError(graphStats.numTreesIndex, plotStyles3[0], plotStyles5[0], "Trees size >= 1", "CM trees size >= 1") plotRealConfigError(graphStats.numNonSingletonTreesIndex, plotStyles3[0], plotStyles5[0], "Trees size >= 2", "CM trees size >= 2") plt.xticks(locs, labels) plt.xlabel("Year") plt.ylabel("No. trees") plt.legend(loc="upper left") plt.savefig(figureDir + "NumTreesGrowth.eps") plotInd += 1 for k in range(len(dayList)): day = dayList[k] print(str(DateUtils.getDateStrFromDay(day, startYear)) + ": " + str(statsArray[k, graphStats.numTreesIndex])) print(str(DateUtils.getDateStrFromDay(day, startYear)) + ": " + str(configStatsArray[k, graphStats.numTreesIndex])) #Load stats from a file to get the max tree from its root resultsFilename = resultsDir + "treeSizesDepths.npz" file = open(resultsFilename, 'r') arrayDict = numpy.load(file) statsArray[:, graphStats.maxTreeDepthIndex] = arrayDict["arr_0"] statsArray[:, graphStats.maxTreeSizeIndex] = arrayDict["arr_1"] statsArray[:, graphStats.secondTreeDepthIndex] = arrayDict["arr_2"] statsArray[:, graphStats.secondTreeSizeIndex] = arrayDict["arr_3"] plt.figure(plotInd) plotRealConfigError(graphStats.maxTreeSizeIndex, plotStyles3[0], plotStyles5[0], "Max tree", "CM max tree") plotRealConfigError(graphStats.secondTreeSizeIndex, plotStyles3[1], plotStyles5[1], "2nd tree", "CM 2nd tree") plt.xticks(locs, labels) plt.xlabel("Year") plt.ylabel("Size") plt.legend(loc="upper left") plt.savefig(figureDir + "MaxTreeGrowth.eps") plotInd += 1 plt.figure(plotInd) plotRealConfigError(graphStats.maxTreeDepthIndex, plotStyles3[0], plotStyles5[0], "Max tree", "CM max tree") plotRealConfigError(graphStats.secondTreeDepthIndex, plotStyles3[1], plotStyles5[1], "2nd tree", "CM 2nd tree") #plt.plot(absDayList, statsArray[:, graphStats.maxTreeDepthIndex], plotStyles3[0], absDayList, statsArray[:, graphStats.secondTreeDepthIndex], plotStyles3[1] ) #plt.plot(absDayList, configStatsArray[:, graphStats.maxTreeDepthIndex], plotStyles4[0], absDayList, configStatsArray[:, graphStats.secondTreeDepthIndex], plotStyles4[1]) plt.xticks(locs, labels) plt.xlabel("Year") plt.ylabel("Depth") plt.legend(loc="lower right") plt.savefig(figureDir + "MaxTreeDepthGrowth.eps") plotInd += 1
def plotTreeStats(): logging.info("Computing tree stats") resultsFileName = resultsDir + "InfectGrowthTreeStats.pkl" if saveResults: statsDictList = [] for j in range(len(subgraphIndicesList2)): Util.printIteration(j, 1, len(subgraphIndicesList2)) subgraphIndices = subgraphIndicesList2[j] subgraph = sGraph.subgraph(subgraphIndices) logging.info("Finding trees") trees = subgraph.findTrees() logging.info("Computing tree statistics") statsDict = {} locationEntropy = [] orientEntropy = [] detectionRanges = [] for i in range(len(trees)): if len(trees[i]) > 1: treeGraph = subgraph.subgraph(trees[i]) vertexArray = treeGraph.getVertexList().getVertices(list(range(treeGraph.getNumVertices()))) locationEntropy.append(Util.entropy(vertexArray[:, locationIndex])) orientEntropy.append(Util.entropy(vertexArray[:, orientationIndex])) detections = vertexArray[:, detectionIndex] detectionRanges.append(numpy.max(detections) - numpy.min(detections)) statsDict["locationEnt"] = numpy.array(locationEntropy) statsDict["orientEnt"] = numpy.array(orientEntropy) statsDict["detectRanges"] = numpy.array(detectionRanges) statsDictList.append(statsDict) Util.savePickle(statsDictList, resultsFileName, True) else: statsDictList = Util.loadPickle(resultsFileName) locBins = numpy.arange(0, 2.4, 0.2) detectBins = numpy.arange(0, 6500, 500) locationEntDists = [] orientEntDists = [] detectionDists = [] for j in range(0, len(dayList2)): dateStr = (str(DateUtils.getDateStrFromDay(dayList2[j], startYear))) logging.info(dateStr) statsDict = statsDictList[j] plotInd2 = plotInd locationEntDists.append(statsDict["locationEnt"]) orientEntDists.append(statsDict["orientEnt"]) detectionDists.append(statsDict["detectRanges"]) #for j in range(len(orientEntDists)): # print(numpy.sum(numpy.histogram(orientEntDists[j])[0])) # print(numpy.histogram(orientEntDists[j])[0]/float(orientEntDists[j].shape[0])) dateStrs = [DateUtils.getDateStrFromDay(dayList2[i], startYear) for i in range(1, len(dayList2))] plt.figure(plotInd2) histOut = plt.hist(locationEntDists, locBins, normed=True) plt.xlabel("Location Entropy") plt.ylabel("Probability Density") plt.savefig(figureDir + "LocationEnt" + ".eps") #plt.legend() plotInd2 += 1 plt.figure(plotInd2) histOut = plt.hist(orientEntDists, normed=True) plt.xlabel("Orientation Entropy") plt.ylabel("Probability Density") plt.savefig(figureDir + "OrientEnt" + ".eps") #plt.legend() plotInd2 += 1 plt.figure(plotInd2) histOut = plt.hist(detectionDists, detectBins, normed=True) plt.xlabel("Detection Range (days)") plt.ylabel("Probability Density") plt.savefig(figureDir + "DetectionRanges" + ".eps") #plt.legend() plotInd2 += 1
def plotVectorStats(): #Finally, compute some vector stats at various points in the graph logging.info("Computing vector stats") global plotInd resultsFileName = resultsDir + "ContactGrowthVectorStats.pkl" if saveResults: statsDictList = graphStats.sequenceVectorStats(sGraph, subgraphIndicesList2) Util.savePickle(statsDictList, resultsFileName, False) else: statsDictList = Util.loadPickle(resultsFileName) #Load up configuration model results configStatsDictList = [] resultsFileNameBase = resultsDir + "ConfigGraphVectorStats" for j in range(numConfigGraphs): resultsFileName = resultsFileNameBase + str(j) configStatsDictList.append(Util.loadPickle(resultsFileName)) #Now need to take mean of 1st element of list meanConfigStatsDictList = configStatsDictList[0] for i in range(len(configStatsDictList[0])): for k in range(1, numConfigGraphs): for key in configStatsDictList[k][i].keys(): if configStatsDictList[k][i][key].shape[0] > meanConfigStatsDictList[i][key].shape[0]: meanConfigStatsDictList[i][key] = numpy.r_[meanConfigStatsDictList[i][key], numpy.zeros(configStatsDictList[k][i][key].shape[0] - meanConfigStatsDictList[i][key].shape[0])] elif configStatsDictList[k][i][key].shape[0] < meanConfigStatsDictList[i][key].shape[0]: configStatsDictList[k][i][key] = numpy.r_[configStatsDictList[k][i][key], numpy.zeros(meanConfigStatsDictList[i][key].shape[0] - configStatsDictList[k][i][key].shape[0])] meanConfigStatsDictList[i][key] += configStatsDictList[k][i][key] for key in configStatsDictList[0][i].keys(): meanConfigStatsDictList[i][key] = meanConfigStatsDictList[i][key]/numConfigGraphs triangleDistArray = numpy.zeros((len(dayList2), 100)) configTriangleDistArray = numpy.zeros((len(dayList2), 100)) hopPlotArray = numpy.zeros((len(dayList2), 27)) configHopPlotArray = numpy.zeros((len(dayList2), 30)) componentsDistArray = numpy.zeros((len(dayList2), 3000)) configComponentsDistArray = numpy.zeros((len(dayList2), 3000)) numVerticesEdgesArray = numpy.zeros((len(dayList2), 2), numpy.int) numVerticesEdgesArray[:, 0] = [len(sgl) for sgl in subgraphIndicesList2] numVerticesEdgesArray[:, 1] = [sGraph.subgraph(sgl).getNumEdges() for sgl in subgraphIndicesList2] binWidths = numpy.arange(0, 0.50, 0.05) eigVectorDists = numpy.zeros((len(dayList2), binWidths.shape[0]-1), numpy.int) femaleSums = numpy.zeros(len(dayList2)) maleSums = numpy.zeros(len(dayList2)) heteroSums = numpy.zeros(len(dayList2)) biSums = numpy.zeros(len(dayList2)) contactSums = numpy.zeros(len(dayList2)) nonContactSums = numpy.zeros(len(dayList2)) donorSums = numpy.zeros(len(dayList2)) randomTestSums = numpy.zeros(len(dayList2)) stdSums = numpy.zeros(len(dayList2)) prisonerSums = numpy.zeros(len(dayList2)) recommendSums = numpy.zeros(len(dayList2)) meanAges = numpy.zeros(len(dayList2)) degrees = numpy.zeros((len(dayList2), 20)) provinces = numpy.zeros((len(dayList2), 15)) havanaSums = numpy.zeros(len(dayList2)) villaClaraSums = numpy.zeros(len(dayList2)) pinarSums = numpy.zeros(len(dayList2)) holguinSums = numpy.zeros(len(dayList2)) habanaSums = numpy.zeros(len(dayList2)) sanctiSums = numpy.zeros(len(dayList2)) meanDegrees = numpy.zeros(len(dayList2)) stdDegrees = numpy.zeros(len(dayList2)) #Note that death has a lot of missing values for j in range(len(dayList2)): dateStr = (str(DateUtils.getDateStrFromDay(dayList2[j], startYear))) logging.info(dateStr) statsDict = statsDictList[j] configStatsDict = meanConfigStatsDictList[j] degreeDist = statsDict["outDegreeDist"] degreeDist = degreeDist/float(numpy.sum(degreeDist)) #Note that degree distribution for configuration graph will be identical eigenDist = statsDict["eigenDist"] eigenDist = numpy.log(eigenDist[eigenDist>=10**-1]) #configEigenDist = configStatsDict["eigenDist"] #configEigenDist = numpy.log(configEigenDist[configEigenDist>=10**-1]) hopCount = statsDict["hopCount"] hopCount = numpy.log10(hopCount) hopPlotArray[j, 0:hopCount.shape[0]] = hopCount configHopCount = configStatsDict["hopCount"] configHopCount = numpy.log10(configHopCount) #configHopPlotArray[j, 0:configHopCount.shape[0]] = configHopCount triangleDist = statsDict["triangleDist"] #triangleDist = numpy.array(triangleDist, numpy.float64)/numpy.sum(triangleDist) triangleDist = numpy.array(triangleDist, numpy.float64) triangleDistArray[j, 0:triangleDist.shape[0]] = triangleDist configTriangleDist = configStatsDict["triangleDist"] configTriangleDist = numpy.array(configTriangleDist, numpy.float64)/numpy.sum(configTriangleDist) configTriangleDistArray[j, 0:configTriangleDist.shape[0]] = configTriangleDist maxEigVector = statsDict["maxEigVector"] eigenvectorInds = numpy.flipud(numpy.argsort(numpy.abs(maxEigVector))) top10eigenvectorInds = eigenvectorInds[0:numpy.round(eigenvectorInds.shape[0]/10.0)] maxEigVector = numpy.abs(maxEigVector[eigenvectorInds]) #print(maxEigVector) eigVectorDists[j, :] = numpy.histogram(maxEigVector, binWidths)[0] componentsDist = statsDict["componentsDist"] componentsDist = numpy.array(componentsDist, numpy.float64)/numpy.sum(componentsDist) componentsDistArray[j, 0:componentsDist.shape[0]] = componentsDist configComponentsDist = configStatsDict["componentsDist"] configComponentsDist = numpy.array(configComponentsDist, numpy.float64)/numpy.sum(configComponentsDist) configComponentsDistArray[j, 0:configComponentsDist.shape[0]] = configComponentsDist plotInd2 = plotInd plt.figure(plotInd2) plt.plot(numpy.arange(degreeDist.shape[0]), degreeDist, plotStyles2[j], label=dateStr) plt.xlabel("Degree") plt.ylabel("Probability") plt.ylim((0, 0.5)) plt.savefig(figureDir + "DegreeDist" + ".eps") plt.legend() plotInd2 += 1 """ plt.figure(plotInd2) plt.plot(numpy.arange(eigenDist.shape[0]), eigenDist, label=dateStr) plt.xlabel("Eigenvalue rank") plt.ylabel("log(Eigenvalue)") plt.savefig(figureDir + "EigenDist" + ".eps") plt.legend() plotInd2 += 1 """ #How does kleinberg do the hop plots plt.figure(plotInd2) plt.plot(numpy.arange(hopCount.shape[0]), hopCount, plotStyles[j], label=dateStr) plt.xlabel("k") plt.ylabel("log10(pairs)") plt.ylim( (2.5, 7) ) plt.legend(loc="lower right") plt.savefig(figureDir + "HopCount" + ".eps") plotInd2 += 1 plt.figure(plotInd2) plt.plot(numpy.arange(maxEigVector.shape[0]), maxEigVector, plotStyles2[j], label=dateStr) plt.xlabel("Rank") plt.ylabel("log(eigenvector coefficient)") plt.savefig(figureDir + "MaxEigVector" + ".eps") plt.legend() plotInd2 += 1 #Compute some information the 10% most central vertices subgraphIndices = numpy.nonzero(detections <= dayList2[j])[0] subgraph = sGraph.subgraph(subgraphIndices) subgraphVertexArray = subgraph.getVertexList().getVertices() femaleSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, genderIndex]==1) maleSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, genderIndex]==0) heteroSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, orientationIndex]==0) biSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, orientationIndex]==1) contactSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, contactIndex]) donorSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, donorIndex]) randomTestSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, randomTestIndex]) stdSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, stdIndex]) prisonerSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, prisonerIndex]) recommendSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, doctorIndex]) meanAges[j] = numpy.mean(subgraphVertexArray[top10eigenvectorInds, detectionIndex] - subgraphVertexArray[top10eigenvectorInds, dobIndex])/daysInYear havanaSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, havanaIndex]) villaClaraSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, villaClaraIndex]) pinarSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, pinarIndex]) holguinSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, holguinIndex]) habanaSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, habanaIndex]) sanctiSums[j] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, sanctiIndex]) provinces[j, :] = numpy.sum(subgraphVertexArray[top10eigenvectorInds, 22:37], 0) ddist = numpy.bincount(subgraph.outDegreeSequence()[top10eigenvectorInds]) degrees[j, 0:ddist.shape[0]] = numpy.array(ddist, numpy.float)/numpy.sum(ddist) meanDegrees[j] = numpy.mean(subgraph.outDegreeSequence()[top10eigenvectorInds]) stdDegrees[j] = numpy.std(subgraph.outDegreeSequence()[top10eigenvectorInds]) plt.figure(plotInd2) plt.plot(numpy.arange(degrees[j, :].shape[0]), degrees[j, :], plotStyles2[j], label=dateStr) plt.xlabel("Degree") plt.ylabel("Probability") #plt.ylim((0, 0.5)) plt.savefig(figureDir + "DegreeDistCentral" + ".eps") plt.legend() plotInd2 += 1 precision = 4 dateStrList = [DateUtils.getDateStrFromDay(day, startYear) for day in dayList2] print("Hop counts") print(Latex.listToRow(dateStrList)) print(Latex.array2DToRows(hopPlotArray.T)) print("\nHop counts for configuration graphs") print(Latex.listToRow(dateStrList)) print(Latex.array2DToRows(configHopPlotArray.T)) print("\n\nEdges and vertices") print((Latex.listToRow(dateStrList))) print((Latex.array2DToRows(numVerticesEdgesArray.T, precision))) print("\n\nEigenvector distribution") print((Latex.array1DToRow(binWidths[1:]) + "\\\\")) print((Latex.array2DToRows(eigVectorDists))) print("\n\nDistribution of component sizes") componentsDistArray = componentsDistArray[:, 0:componentsDist.shape[0]] nonZeroCols = numpy.sum(componentsDistArray, 0)!=0 componentsDistArray = numpy.r_[numpy.array([numpy.arange(componentsDistArray.shape[1])[nonZeroCols]]), componentsDistArray[:, nonZeroCols]] print((Latex.listToRow(dateStrList))) print((Latex.array2DToRows(componentsDistArray.T, precision))) print("\n\nDistribution of component sizes in configuration graphs") configComponentsDistArray = configComponentsDistArray[:, 0:configComponentsDist.shape[0]] nonZeroCols = numpy.sum(configComponentsDistArray, 0)!=0 configComponentsDistArray = numpy.r_[numpy.array([numpy.arange(configComponentsDistArray.shape[1])[nonZeroCols]]), configComponentsDistArray[:, nonZeroCols]] print((Latex.listToRow(dateStrList))) print((Latex.array2DToRows(configComponentsDistArray.T, precision))) print("\n\nDistribution of triangle participations") triangleDistArray = triangleDistArray[:, 0:triangleDist.shape[0]] nonZeroCols = numpy.sum(triangleDistArray, 0)!=0 triangleDistArray = numpy.r_[numpy.array([numpy.arange(triangleDistArray.shape[1])[nonZeroCols]])/2, triangleDistArray[:, nonZeroCols]] print((Latex.listToRow(dateStrList))) print((Latex.array2DToRows(triangleDistArray.T, precision))) configTriangleDistArray = configTriangleDistArray[:, 0:configTriangleDist.shape[0]] nonZeroCols = numpy.sum(configTriangleDistArray, 0)!=0 configTriangleDistArray = numpy.r_[numpy.array([numpy.arange(configTriangleDistArray.shape[1])[nonZeroCols]])/2, configTriangleDistArray[:, nonZeroCols]] configTriangleDistArray = numpy.c_[configTriangleDistArray, numpy.zeros((configTriangleDistArray.shape[0], triangleDistArray.shape[1]-configTriangleDistArray.shape[1]))] print("\n\nDistribution of central vertices") print((Latex.listToRow(dateStrList))) subgraphSizes = numpy.array(maleSums + femaleSums, numpy.float) print("Female & " + Latex.array1DToRow(femaleSums*100/subgraphSizes, 1) + "\\\\") print("Male & " + Latex.array1DToRow(maleSums*100/subgraphSizes, 1) + "\\\\") print("\hline") print("Heterosexual & " + Latex.array1DToRow(heteroSums*100/subgraphSizes, 1) + "\\\\") print("Bisexual & " + Latex.array1DToRow(biSums*100/subgraphSizes, 1) + "\\\\") print("\hline") print("Contact traced & " + Latex.array1DToRow(contactSums*100/subgraphSizes, 1) + "\\\\") print("Blood donor & " + Latex.array1DToRow(donorSums*100/subgraphSizes, 1) + "\\\\") print("RandomTest & " + Latex.array1DToRow(randomTestSums*100/subgraphSizes, 1) + "\\\\") print("STD & " + Latex.array1DToRow(stdSums*100/subgraphSizes, 1) + "\\\\") print("Prisoner & " + Latex.array1DToRow(prisonerSums*100/subgraphSizes, 1) + "\\\\") print("Doctor recommendation & " + Latex.array1DToRow(recommendSums*100/subgraphSizes, 1) + "\\\\") print("\hline") print("Mean ages (years) & " + Latex.array1DToRow(meanAges, 2) + "\\\\") print("\hline") print("Holguin & " + Latex.array1DToRow(holguinSums*100/subgraphSizes, 1) + "\\\\") print("La Habana & " + Latex.array1DToRow(habanaSums*100/subgraphSizes, 1) + "\\\\") print("Havana City & " + Latex.array1DToRow(havanaSums*100/subgraphSizes, 1) + "\\\\") print("Pinar del Rio & " + Latex.array1DToRow(pinarSums*100/subgraphSizes, 1) + "\\\\") print("Sancti Spiritus & " + Latex.array1DToRow(sanctiSums*100/subgraphSizes, 1) + "\\\\") print("Villa Clara & " + Latex.array1DToRow(villaClaraSums*100/subgraphSizes, 1) + "\\\\") print("\hline") print("Mean degrees & " + Latex.array1DToRow(meanDegrees, 2) + "\\\\") print("Std degrees & " + Latex.array1DToRow(stdDegrees, 2) + "\\\\") print("\n\nProvinces") print(Latex.array2DToRows(provinces)) print("\n\nDegree distribution") print(Latex.array2DToRows(degrees))
def plotScalarStats(): logging.info("Computing scalar stats") resultsFileName = resultsDir + "ContactGrowthScalarStats.pkl" if saveResults: statsArray = graphStats.sequenceScalarStats(sGraph, subgraphIndicesList, slowStats) Util.savePickle(statsArray, resultsFileName, True) #Now compute statistics on the configuration graphs else: statsArray = Util.loadPickle(resultsFileName) #Take the mean of the results over the configuration model graphs resultsFileNameBase = resultsDir + "ConfigGraphScalarStats" numGraphs = len(subgraphIndicesList) #configStatsArrays = numpy.zeros((numGraphs, graphStats.getNumStats(), numConfigGraphs)) configStatsArrays = numpy.zeros((numGraphs, graphStats.getNumStats()-2, numConfigGraphs)) for j in range(numConfigGraphs): resultsFileName = resultsFileNameBase + str(j) configStatsArrays[:, :, j] = Util.loadPickle(resultsFileName) configStatsArray = numpy.mean(configStatsArrays, 2) configStatsStd = numpy.std(configStatsArrays, 2) global plotInd def plotRealConfigError(index, styleReal, styleConfig, realLabel, configLabel): plt.hold(True) plt.plot(absDayList, statsArray[:, index], styleReal, label=realLabel) #errors = numpy.c_[configStatsArray[:, index]-configStatsMinArray[:, index] , configStatsMaxArray[:, index]-configStatsArray[:, index]].T errors = numpy.c_[configStatsStd[:, index], configStatsStd[:, index]].T plt.plot(absDayList, configStatsArray[:, index], styleConfig, label=configLabel) plt.errorbar(absDayList, configStatsArray[:, index], errors, linewidth=0, elinewidth=1, label="_nolegend_", ecolor="red") xmin, xmax = plt.xlim() plt.xlim((0, xmax)) ymin, ymax = plt.ylim() plt.ylim((0, ymax)) #Output all the results into plots plt.figure(plotInd) plt.hold(True) plotRealConfigError(graphStats.maxComponentSizeIndex, plotStyleBW[0], plotStyles4[0], "Max comp. vertices", "CM max comp. vertices") plotRealConfigError(graphStats.maxComponentEdgesIndex, plotStyleBW[1], plotStyles4[1], "Max comp. edges", "CM max comp. edges") plt.xticks(locs, labels) plt.xlabel("Year") plt.ylabel("No. vertices/edges") plt.legend(loc="upper left") plt.savefig(figureDir + "MaxComponentSizeGrowth.eps") plotInd += 1 for k in range(len(dayList)): day = dayList[k] print(str(DateUtils.getDateStrFromDay(day, startYear)) + ": " + str(statsArray[k, graphStats.maxComponentEdgesIndex])) #print(str(DateUtils.getDateStrFromDay(day, startYear)) + ": " + str(configStatsArray[k, graphStats.numComponentsIndex])) plt.figure(plotInd) plotRealConfigError(graphStats.numComponentsIndex, plotStyleBW[0], plotStyles4[0], "Size >= 1", "CM size >= 1") plotRealConfigError(graphStats.numNonSingletonComponentsIndex, plotStyleBW[1], plotStyles4[1], "Size >= 2", "CM size >= 2") plotRealConfigError(graphStats.numTriOrMoreComponentsIndex, plotStyleBW[2], plotStyles4[2], "Size >= 3", "CM size >= 3") plt.xticks(locs, labels) plt.xlabel("Year") plt.ylabel("No. components") plt.legend(loc="upper left") plt.savefig(figureDir + "NumComponentsGrowth.eps") plotInd += 1 plt.figure(plotInd) plotRealConfigError(graphStats.meanComponentSizeIndex, plotStyleBW[0], plotStyles4[0], "Real graph", "CM") plt.xticks(locs, labels) plt.xlabel("Year") plt.ylabel("Mean component size") plt.legend(loc="lower right") plt.savefig(figureDir + "MeanComponentSizeGrowth.eps") plotInd += 1 plt.figure(plotInd) plotRealConfigError(graphStats.diameterIndex, plotStyleBW[0], plotStyles4[0], "Real graph", "CM") plt.xticks(locs, labels) plt.xlabel("Year") plt.ylabel("Max component diameter") plt.legend(loc="lower right") plt.savefig(figureDir + "MaxComponentDiameterGrowth.eps") plotInd += 1 plt.figure(plotInd) plotRealConfigError(graphStats.effectiveDiameterIndex, plotStyleBW[0], plotStyles4[0], "Real graph", "CM") plt.xticks(locs, labels) plt.xlabel("Year") plt.ylabel("Effective diameter") plt.legend(loc="lower right") plt.savefig(figureDir + "MaxComponentEffDiameterGrowth.eps") plotInd += 1 plt.figure(plotInd) plotRealConfigError(graphStats.meanDegreeIndex, plotStyleBW[0], plotStyles4[0], "All vertices", "CM all vertices") plotRealConfigError(graphStats.maxCompMeanDegreeIndex, plotStyleBW[1], plotStyles4[1], "Max component", "CM max component") #plt.plot(absDayList, statsArray[:, graphStats.meanDegreeIndex], plotStyleBW[0], absDayList, statsArray[:, graphStats.maxCompMeanDegreeIndex], plotStyleBW[1], absDayList, configStatsArray[:, graphStats.meanDegreeIndex], plotStyles4[0], absDayList, configStatsArray[:, graphStats.maxCompMeanDegreeIndex], plotStyles4[1]) plt.xticks(locs, labels) plt.xlabel("Year") plt.ylabel("Mean degree") plt.legend(loc="lower right") plt.savefig(figureDir + "MeanDegrees.eps") plotInd += 1 plt.figure(plotInd) plotRealConfigError(graphStats.densityIndex, plotStyleBW[0], plotStyles4[0], "Real Graph", "Config Model") #plt.plot(absDayList, statsArray[:, graphStats.densityIndex], plotStyleBW[0], absDayList, configStatsArray[:, graphStats.densityIndex], plotStyles4[0]) plt.xticks(locs, labels) plt.xlabel("Year") plt.ylabel("Density") plt.legend() plt.savefig(figureDir + "DensityGrowth.eps") plotInd += 1 plt.figure(plotInd) plt.plot(absDayList, statsArray[:, graphStats.powerLawIndex], plotStyleBW[0]) plt.xticks(locs, labels) plt.xlabel("Year") plt.ylabel("Alpha") plt.savefig(figureDir + "PowerLawGrowth.eps") plotInd += 1 plt.figure(plotInd) plotRealConfigError(graphStats.geodesicDistanceIndex, plotStyleBW[0], plotStyles4[0], "Real Graph", "Config Model") #plt.plot(absDayList, statsArray[:, graphStats.geodesicDistanceIndex], plotStyleBW[0], absDayList, configStatsArray[:, graphStats.geodesicDistanceIndex], plotStyles4[0]) plt.xticks(locs, labels) plt.xlabel("Year") plt.ylabel("Geodesic distance") plt.legend(loc="lower right") plt.savefig(figureDir + "GeodesicGrowth.eps") plotInd += 1 plt.figure(plotInd) plotRealConfigError(graphStats.harmonicGeoDistanceIndex, plotStyleBW[0], plotStyles4[0], "Real Graph", "Config Model") #plt.plot(absDayList, statsArray[:, graphStats.harmonicGeoDistanceIndex], plotStyleBW[0], absDayList, configStatsArray[:, graphStats.harmonicGeoDistanceIndex], plotStyles4[0]) plt.xticks(locs, labels) plt.xlabel("Year") plt.ylabel("Mean harmonic geodesic distance") plt.legend(loc="upper right") plt.savefig(figureDir + "HarmonicGeodesicGrowth.eps") plotInd += 1 #print(statsArray[:, graphStats.harmonicGeoDistanceIndex]) plt.figure(plotInd) plotRealConfigError(graphStats.geodesicDistMaxCompIndex, plotStyleBW[0], plotStyles4[0], "Real graph", "Config model") #plt.plot(absDayList, statsArray[:, graphStats.geodesicDistMaxCompIndex], plotStyleBW[0], absDayList, configStatsArray[:, graphStats.geodesicDistMaxCompIndex], plotStyles4[0]) plt.xticks(locs, labels) plt.xlabel("Year") plt.ylabel("Max component mean geodesic distance") plt.legend(loc="lower right") plt.savefig(figureDir + "MaxCompGeodesicGrowth.eps") plotInd += 1 #Find the number of edges in the infection graph resultsFileName = resultsDir + "InfectGrowthScalarStats.pkl" infectStatsArray = Util.loadPickle(resultsFileName) #Make sure we don't include 0 in the array vertexIndex = numpy.argmax(statsArray[:, graphStats.numVerticesIndex] > 0) edgeIndex = numpy.argmax(infectStatsArray[:, graphStats.numEdgesIndex] > 0) minIndex = numpy.maximum(vertexIndex, edgeIndex) plt.figure(plotInd) plt.plot(numpy.log(statsArray[minIndex:, graphStats.numVerticesIndex]), numpy.log(statsArray[minIndex:, graphStats.numEdgesIndex]), plotStyleBW[0]) plt.plot(numpy.log(infectStatsArray[minIndex:, graphStats.numVerticesIndex]), numpy.log(infectStatsArray[minIndex:, graphStats.numEdgesIndex]), plotStyleBW[1]) plt.plot(numpy.log(statsArray[minIndex:, graphStats.maxComponentSizeIndex]), numpy.log(statsArray[minIndex:, graphStats.maxComponentEdgesIndex]), plotStyleBW[2]) plt.xlabel("log(|V|)") plt.ylabel("log(|E|)/log(|D|)") plt.legend(("Contact graph", "Infection graph", "Max component"), loc="upper left") plt.savefig(figureDir + "LogVerticesEdgesGrowth.eps") plotInd += 1 results = statsArray[:, graphStats.effectiveDiameterIndex] results = numpy.c_[results, configStatsArray[:, graphStats.effectiveDiameterIndex]] results = numpy.c_[results, statsArray[:, graphStats.geodesicDistMaxCompIndex]] results = numpy.c_[results, configStatsArray[:, graphStats.geodesicDistMaxCompIndex]] configStatsArray print("\n\n") print(Latex.listToRow(["Diameter", "CM Diameter", "Mean Geodesic", "CM Mean Geodesic"])) print("\\hline") for i in range(0, len(dayList), 4): day = dayList[i] print(str(DateUtils.getDateStrFromDay(day, startYear)) + " & " + Latex.array1DToRow(results[i, :]) + "\\\\")
#Find the max number t for which we have a complete set of particles t = 0 for i in range(numEpsilons): thetaArray, objArray = loadThetaArray(N, resultsDir, i) if thetaArray.shape[0] == N: t = i logging.debug("Using particle number " + str(t)) times = numpy.arange(startDate, endDate+1, recordStep) realTheta, sigmaTheta, purtTheta = HIVModelUtils.toyTheta() thetaArray, objArray = loadThetaArray(N, resultsDir, t) thetas.append(thetaArray) print(thetaArray) resultsFileName = outputDir + "IdealStats.pkl" stats = Util.loadPickle(resultsFileName) vertexArrayIdeal, idealInfectedIndices, idealRemovedIndices, idealContactGraphStats, idealRemovedGraphStats, idealFinalRemovedDegrees = stats graphStats = GraphStatistics() idealMeasures[ind, numDetectsInd, :] = vertexArrayIdeal[:, numDetectsInd] idealMeasures[ind, maleInd, :] = vertexArrayIdeal[:, maleInd] idealMeasures[ind, femaleInd, :] = vertexArrayIdeal[:, femaleInd] idealMeasures[ind, heteroInd, :] = vertexArrayIdeal[:, heteroInd] idealMeasures[ind, biInd, :] = vertexArrayIdeal[:, biInd] idealMeasures[ind, randDetectInd, :] = vertexArrayIdeal[:, randDetectInd] idealMeasures[ind, contactDetectInd, :] = vertexArrayIdeal[:, contactDetectInd] idealMeasures[ind, numCompsInd, :] = idealRemovedGraphStats[:, graphStats.numComponentsIndex] idealMeasures[ind, maxCompSizeInd, :] = idealRemovedGraphStats[:, graphStats.maxComponentSizeIndex] idealMeasures[ind, numEdgesInd, :] = idealRemovedGraphStats[:, graphStats.numEdgesIndex] maxDegrees = min(idealFinalRemovedDegrees.shape[0], numDegrees)
meanAUCs[rowInd, i], stdAUCs[rowInd, i], fileNamesArray[rowInd][i] = meanAUC, stdAUC, fileName results += 1 except IOError as e: missingResults += 1 print("Found " + str(results) + ", missing " + str(missingResults) + " results\n") #Plot the best ROC curves for i in range(len(labelNames)): labelName = labelNames[i] ind = numpy.argmax(meanAUCs[:, i]) fileName = fileNamesArray[ind][i] print(fileName + " AUC= " + str(meanAUCs[ind, i])) fullFileName = resultsDir + fileName resultsDict = Util.loadPickle(fullFileName) allMetrics = resultsDict["allMetrics"] trainROCs = allMetrics[1] testROCs = allMetrics[3] for m in range(len(trainROCs)): plt.figure(plotInd) plt.title(fileName) plt.plot(trainROCs[m][0], trainROCs[m][1], "k") plt.plot(testROCs[m][0], testROCs[m][1], "r") plt.xlabel("False Positive Rate") plt.ylabel("True Positive Rate") plt.savefig(figureDir + labelName.replace(".", "_") + "-ROC.eps") plotInd += 1
def loadVectoriser(self): self.vectoriser = Util.loadPickle(self.vectoriserFilename) self.vectoriser.tokenizer = PorterTokeniser() self.authorList = Util.loadPickle(self.authorListFilename) self.citationList = Util.loadPickle(self.citationListFilename) logging.debug("Loaded vectoriser, citation and author list")