def next(self):
    """
    Learn and return the factorisation for the next matrix of the sequence.

    The next matrix X is pulled from ``self.XIterator``. On the first call
    (``self.ZListSGD`` is None) the base learner is run from scratch.
    On later calls the previous factors (P, Q) are resized to the shape of
    X and passed to the learner as the starting point ``Z``. If that warm
    start raises FloatingPointError, ValueError or
    SGDNorm2Reg.ArithmeticError, a warning is logged and the learner is run
    again from scratch.

    :return: the value returned by ``self.baseLearner.learnModel``, which is
        also stored in ``self.ZListSGD``; its first element is unpacked
        as the pair (P, Q) on the following call.
    """
    X = self.XIterator.next()

    if self.ZListSGD is None:
        # assumption : training matrix centered by row and column
        self.ZListSGD = self.baseLearner.learnModel(X, storeAll=False)
        return self.ZListSGD

    # In the case the matrix size changes, we alter P and Q to fit the new data.
    # Rows of P follow X.shape[0], rows of Q follow X.shape[1].
    P, Q = self.ZListSGD[0]

    if X.shape[0] > P.shape[0]:
        # presumably Util.extendArray pads P up to the requested shape -- confirm
        P = Util.extendArray(P, (X.shape[0], P.shape[1]))
    elif X.shape[0] < P.shape[0]:
        P = P[0:X.shape[0], :]

    if X.shape[1] > Q.shape[0]:
        Q = Util.extendArray(Q, (X.shape[1], Q.shape[1]))
    elif X.shape[1] < Q.shape[0]:
        Q = Q[0:X.shape[1], :]

    self.ZListSGD = [(P, Q)]

    # The handlers only record which failure occurred; the recovery itself
    # (warning + cold restart) is shared below instead of being copy-pasted.
    try:
        self.ZListSGD = self.baseLearner.learnModel(X, Z=self.ZListSGD, storeAll=False)
    except FloatingPointError:
        failure = "FloatingPointError"
    except ValueError:
        failure = "ValueError"
    except SGDNorm2Reg.ArithmeticError:
        failure = "ArithmeticError"
    else:
        return self.ZListSGD

    logging.warning(failure + " encountered, reinitialise the matrix decomposition")
    self.ZListSGD = self.baseLearner.learnModel(X, storeAll=False)
    return self.ZListSGD
def createModel(targetGraph, startDate, endDate, recordStep, M, matchAlpha, breakSize, matchAlg, theta=None):
    """
    Build an HIVEpidemicModel whose initial graph is derived from targetGraph.

    The starting graph is the subgraph of targetGraph given by
    ``targetGraph.removedIndsAt(startDate)``, with ``M - graph.size`` vertices
    added. The NumPy random seed is fixed to 21 before the hidden degree
    sequence is drawn.

    :param targetGraph: graph the simulation is compared against.
    :param startDate: start time; passed to the model as ``T0`` (as a float).
    :param endDate: end time; passed to the model as ``T`` (as a float).
    :param recordStep: value passed to ``model.setRecordStep``.
    :param M: target number of vertices of the simulated graph.
    :param matchAlpha: ``alpha`` argument of GraphMatch.
    :param breakSize: passed through to HIVGraphMetrics2.
    :param matchAlg: matching algorithm passed to GraphMatch.
    :param theta: optional model parameters; applied with ``setParams`` only
        when it is not None.
    :return: the configured HIVEpidemicModel (not yet simulated).
    """
    alpha = 2
    zeroVal = 0.9
    numpy.random.seed(21)

    graph = targetGraph.subgraph(targetGraph.removedIndsAt(startDate))
    graph.addVertices(M-graph.size)
    logging.debug("Created graph: " + str(graph))

    p = Util.powerLawProbs(alpha, zeroVal)
    hiddenDegSeq = Util.randomChoice(p, graph.getNumVertices())

    # Boolean mask over the vertex features; the four excluded columns are
    # not used for matching. It is then converted to an index array.
    # (numpy.bool was only an alias of the builtin and has been removed.)
    featureInds = numpy.ones(graph.vlist.getNumFeatures(), bool)
    featureInds[HIVVertices.dobIndex] = False
    featureInds[HIVVertices.infectionTimeIndex] = False
    featureInds[HIVVertices.hiddenDegreeIndex] = False
    featureInds[HIVVertices.stateIndex] = False
    featureInds = numpy.arange(featureInds.shape[0])[featureInds]

    matcher = GraphMatch(matchAlg, alpha=matchAlpha, featureInds=featureInds, useWeightM=False)
    graphMetrics = HIVGraphMetrics2(targetGraph, breakSize, matcher, startDate)

    rates = HIVRates(graph, hiddenDegSeq)
    model = HIVEpidemicModel(graph, rates, T=float(endDate), T0=float(startDate), metrics=graphMetrics)
    model.setRecordStep(recordStep)

    # Identity test: "theta != None" is evaluated element-wise when theta is
    # a NumPy array, which makes the condition ambiguous.
    if theta is not None:
        model.setParams(theta)

    return model
def recordResults(self, clusterList, timeList, fileName):
    """
    Save results for a particular clustering.

    For each clustering in clusterList the next graph W is taken from
    ``self.getIterator()`` and three measures are stored: modularity, k-way
    normalised cut and the number of distinct cluster labels. The size of
    each graph (``W.shape[0]``) is stored alongside. The measures, timeList
    and the graph sizes are written with ``numpy.savez`` to fileName.
    """
    graphIterator = self.getIterator()
    numClusterings = len(clusterList)
    measureRows = []
    graphInfoRows = []

    logging.debug("Computing cluster measures")

    for ind, clustering in enumerate(clusterList):
        Util.printIteration(ind, self.logStep, numClusterings)
        W = next(graphIterator)

        # Store modularity, k-way normalised cut, and cluster size
        modularity = GraphUtils.modularity(W, clustering)
        normalisedCut = GraphUtils.kwayNormalisedCut(W, clustering)
        numClusters = len(numpy.unique(clustering))
        measureRows.append([modularity, normalisedCut, numClusters])

        # graph size
        graphInfoRows.append([W.shape[0]])

    measures = numpy.array(measureRows)
    graphInfo = numpy.array(graphInfoRows)

    numpy.savez(fileName, measures, timeList, graphInfo)
    logging.debug("Saved file as " + fileName)
def next(self):
    """
    Learn and return the factorisation for the next matrix of the sequence.

    The next matrix X is pulled from ``self.XIterator``. On the first call
    (``self.ZListSGD`` is None) the base learner is run from scratch.
    On later calls the previous factors (P, Q) are resized to the shape of
    X and passed to the learner as the starting point ``Z``. If that warm
    start raises FloatingPointError, ValueError or
    SGDNorm2Reg.ArithmeticError, a warning is logged and the learner is run
    again from scratch.

    :return: the value returned by ``self.baseLearner.learnModel``, which is
        also stored in ``self.ZListSGD``; its first element is unpacked
        as the pair (P, Q) on the following call.
    """
    X = self.XIterator.next()

    if self.ZListSGD is None:
        # assumption : training matrix centered by row and column
        self.ZListSGD = self.baseLearner.learnModel(X, storeAll=False)
        return self.ZListSGD

    # In the case the matrix size changes, we alter P and Q to fit the new data.
    # Rows of P follow X.shape[0], rows of Q follow X.shape[1].
    P, Q = self.ZListSGD[0]
    numRows, numCols = X.shape[0], X.shape[1]

    if numRows > P.shape[0]:
        # presumably Util.extendArray pads P up to the requested shape -- confirm
        P = Util.extendArray(P, (numRows, P.shape[1]))
    elif numRows < P.shape[0]:
        P = P[0:numRows, :]

    if numCols > Q.shape[0]:
        Q = Util.extendArray(Q, (numCols, Q.shape[1]))
    elif numCols < Q.shape[0]:
        Q = Q[0:numCols, :]

    self.ZListSGD = [(P, Q)]

    # The handlers only record which failure occurred; the recovery itself
    # (warning + cold restart) is shared below instead of being copy-pasted.
    try:
        self.ZListSGD = self.baseLearner.learnModel(
            X, Z=self.ZListSGD, storeAll=False)
    except FloatingPointError:
        failure = "FloatingPointError"
    except ValueError:
        failure = "ValueError"
    except SGDNorm2Reg.ArithmeticError:
        failure = "ArithmeticError"
    else:
        return self.ZListSGD

    logging.warning(
        failure + " encountered, reinitialise the matrix decomposition")
    self.ZListSGD = self.baseLearner.learnModel(X, storeAll=False)
    return self.ZListSGD
def simulate(theta, startDate, endDate, recordStep, M, graphMetrics=None):
    """
    Run one epidemic simulation on a fresh undirected HIVGraph of M vertices.

    The hidden degree sequence is drawn with Util.randomChoice from the
    probabilities returned by Util.powerLawProbs(2, 0.9). The model is
    configured with recordStep and theta, and the result of
    ``model.simulate(True)`` is returned unchanged.
    """
    graph = HIVGraph(M, True)
    logging.debug("Created graph: " + str(graph))

    powerLawExponent, zeroProbability = 2, 0.9
    degreeProbs = Util.powerLawProbs(powerLawExponent, zeroProbability)
    hiddenDegSeq = Util.randomChoice(degreeProbs, graph.getNumVertices())

    epidemicRates = HIVRates(graph, hiddenDegSeq)
    epidemicModel = HIVEpidemicModel(graph, epidemicRates, endDate, startDate, metrics=graphMetrics)
    epidemicModel.setRecordStep(recordStep)
    epidemicModel.setParams(theta)

    logging.debug("Theta = " + str(theta))

    return epidemicModel.simulate(True)
def runModel(theta, endDate=100.0, M=1000):
    """
    Simulate the epidemic model with parameters theta under a fixed seed.

    The NumPy seed is set to 21, an undirected HIVGraph of M vertices is
    created, and the model is run from time 0 to endDate with a record step
    of 10.

    :return: the four values unpacked from ``model.simulate(True)`` (times,
        infected indices, removed indices, final graph) followed by the
        model itself.
    """
    numpy.random.seed(21)

    # The order of the calls below is kept as-is, since they may consume
    # the seeded random stream.
    degreeProbs = Util.powerLawProbs(2, 0.9)
    simGraph = HIVGraph(M, True)
    hiddenDegSeq = Util.randomChoice(degreeProbs, simGraph.getNumVertices())

    logging.debug("MeanTheta=" + str(theta))

    epidemicModel = HIVEpidemicModel(simGraph, HIVRates(simGraph, hiddenDegSeq), endDate, 0)
    epidemicModel.setRecordStep(10)
    epidemicModel.setParams(theta)

    eventTimes, infected, removed, finalGraph = epidemicModel.simulate(True)

    return eventTimes, infected, removed, finalGraph, epidemicModel
def testSimulate2(self):
    """
    Check that two simulations started from the same random seed end with
    the same number of vertices and the same number of edges.
    """
    alpha = 2
    zeroVal = 0.9
    startDate = 0.0
    endDate = 200.0
    M = 1000
    undirected = True

    theta, sigmaTheta, pertTheta = HIVModelUtils.toyTheta()

    def runSimulation():
        # Reseed and rebuild everything so both runs start from the same state.
        numpy.random.seed(21)
        graph = HIVGraph(M, undirected)
        p = Util.powerLawProbs(alpha, zeroVal)
        hiddenDegSeq = Util.randomChoice(p, graph.getNumVertices())

        rates = HIVRates(graph, hiddenDegSeq)
        model = HIVEpidemicModel(graph, rates, endDate, startDate, metrics=None)
        model.setParams(theta)

        times, infectedIndices, removedIndices, graph = model.simulate(True)
        return graph.size, graph.getNumEdges()

    numVertices, numEdges = runSimulation()

    # Try again
    numVertices2, numEdges2 = runSimulation()

    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(numVertices2, numVertices)
    self.assertEqual(numEdges2, numEdges)
def modelSelect(self, X, ks, lmbdas, gammas, nFolds, maxNTry=5):
    """
    Choose parameters based on a single matrix X. We do cross validation
    within, and set parameters according to the mean squared error.

    Every combination of (k, lmbda, gamma) is evaluated on each fold with
    ``self.learnPredict``; the combination with the smallest mean error over
    the folds is written to ``self.baseLearner.k``, ``.lmbda`` and ``.gamma``.

    :param X: sparse matrix; converted with ``tocoo()`` and split by non-zero entry.
    :param ks: candidate values for k.
    :param lmbdas: candidate values for lmbda.
    :param gammas: candidate values for gamma.
    :param nFolds: number of cross validation folds.
    :param maxNTry: forwarded unchanged to ``self.learnPredict``.

    Return nothing.
    """
    logging.debug("Performing model selection")

    X = X.tocoo()
    gc.collect()
    nK = len(ks)
    nLmbda = len(lmbdas)
    nGamma = len(gammas)
    errors = numpy.zeros((nK, nLmbda, nGamma, nFolds))

    # generate cross validation sets
    cvInds = Sampling.randCrossValidation(nFolds, X.nnz)

    # compute error for each fold / setting
    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, nFolds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        # Sanity checks: the two parts must partition the entries of X.
        assert trainX.nnz == trainInds.shape[0]
        assert testX.nnz == testInds.shape[0]
        nptst.assert_array_almost_equal((testX + trainX).data, X.data)

        # Same ordering as nested loops over ks, lmbdas, gammas, so that the
        # flat result list reshapes directly to (nK, nLmbda, nGamma).
        paramList = [(trainX, testX, k, lmbda, gamma, maxNTry)
                     for k, lmbda, gamma in itertools.product(ks, lmbdas, gammas)]

        # Remark: the parameter settings could be evaluated in parallel;
        # parallelising over folds is avoided as it needs much more memory.
        results = numpy.array(list(itertools.starmap(self.learnPredict, paramList)))
        errors[:, :, :, icv] = results.reshape((nK, nLmbda, nGamma))

    # compute cross validation error for each setting
    # Infinite and NaN errors are replaced so they do not break the averages.
    errors[errors == float("inf")] = errors[errors != float("inf")].max()
    errors[numpy.isnan(errors)] = numpy.max(errors[numpy.logical_not(numpy.isnan(errors))])
    meanErrors = errors.mean(3)
    stdErrors = errors.std(3)
    logging.debug("Mean errors given (k, lambda, gamma):")
    logging.debug(meanErrors)
    logging.debug("... with standard deviation:")
    logging.debug(stdErrors)

    # keep the best: convert the flat argmin back to a (k, lmbda, gamma) index
    iK, iLmbda, iGamma = (int(ind) for ind in
                          numpy.unravel_index(meanErrors.argmin(), meanErrors.shape))
    kMin = ks[iK]
    lmbdaMin = lmbdas[iLmbda]
    gammaMin = gammas[iGamma]
    logging.debug("argmin: (k, lambda, gamma) = (" + str(kMin) + ", " + str(lmbdaMin) + ", " + str(gammaMin) + ")")
    logging.debug("min = " + str(meanErrors[iK, iLmbda, iGamma]))

    self.baseLearner.k = kMin
    self.baseLearner.lmbda = lmbdaMin
    self.baseLearner.gamma = gammaMin

    return
# Plot, for every regression dataset, the running minimum of the saved
# "Beta" values against sqrt(gamma), one curve per column of the saved array.
datasets = ModelSelectUtils.getRegressionDatasets(True)
# numpy.int was only an alias of the builtin int and has been removed.
gammas = numpy.unique(numpy.array(numpy.round(2 ** numpy.arange(1, 7.25, 0.25) - 1), dtype=int))
print(gammas)

# To use the betas in practice, pick the lowest value so far
for datasetName, numRealisations in datasets:
    # Only a missing/unreadable file is reported as "not found"; any other
    # error (e.g. while plotting) is allowed to surface instead of being hidden.
    try:
        A = numpy.load(outputDir + datasetName + "Beta.npz")["arr_0"]
    except IOError:
        print("Dataset not found " + datasetName)
        continue

    inds = gammas > 10
    tempGamma = numpy.sqrt(gammas[inds])
    tempA = A[inds, :]
    tempA = numpy.clip(tempA, 0, 1)

    plt.figure(0)
    for column, label in enumerate(("50", "100", "200")):
        plt.plot(tempGamma, Util.cumMin(tempA[:, column]), label=label)
    plt.legend()
    plt.title(datasetName)
    plt.xlabel("gamma")
    plt.ylabel("Beta")
    plt.show()
def testSimulate2(self):
    """
    Simulate the epidemic model and check the resulting graph.

    First run: no edge may join two vertices of the same gender unless both
    are bisexual, and every initially infected vertex must be removed at the
    end. Then runModel is used with no contact at all, and with only
    heterosexual contact at two different rates, checking the infected and
    removed counts, the edges and the number of contacts.
    """
    startDate = 0.0
    endDate = 100.0
    M = 1000
    meanTheta, sigmaTheta = HIVModelUtils.estimatedRealTheta()

    undirected = True
    graph = HIVGraph(M, undirected)

    alpha = 2
    zeroVal = 0.9
    p = Util.powerLawProbs(alpha, zeroVal)
    hiddenDegSeq = Util.randomChoice(p, graph.getNumVertices())

    meanTheta[4] = 0.1

    recordStep = 10
    printStep = 10
    rates = HIVRates(graph, hiddenDegSeq)
    model = HIVEpidemicModel(graph, rates, endDate, startDate)
    model.setRecordStep(recordStep)
    model.setPrintStep(printStep)
    model.setParams(meanTheta)

    initialInfected = graph.getInfectedSet()

    times, infectedIndices, removedIndices, graph = model.simulate(True)

    # Now test the final graph
    edges = graph.getAllEdges()

    for i, j in edges:
        # Same-gender edges are only allowed when both endpoints are bisexual.
        if graph.vlist.V[i, HIVVertices.genderIndex] == graph.vlist.V[j, HIVVertices.genderIndex] and (graph.vlist.V[i, HIVVertices.orientationIndex] != HIVVertices.bi or graph.vlist.V[j, HIVVertices.orientationIndex] != HIVVertices.bi):
            self.fail()

    finalInfected = graph.getInfectedSet()
    finalRemoved = graph.getRemovedSet()

    # assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(numpy.intersect1d(initialInfected, finalRemoved).shape[0], len(initialInfected))

    # Test case where there is no contact
    # (numpy.float was only an alias of the builtin float and has been removed.)
    meanTheta = numpy.array([100, 0.95, 1, 1, 0, 0, 0, 0, 0, 0, 0], float)

    times, infectedIndices, removedIndices, graph, model = runModel(meanTheta)

    self.assertEqual(len(graph.getInfectedSet()), 100)
    self.assertEqual(len(graph.getRemovedSet()), 0)
    self.assertEqual(graph.getNumEdges(), 0)

    heteroContactRate = 0.1
    meanTheta = numpy.array([100, 0.95, 1, 1, 0, 0, heteroContactRate, 0, 0, 0, 0], float)
    times, infectedIndices, removedIndices, graph, model = runModel(meanTheta)

    self.assertEqual(len(graph.getInfectedSet()), 100)
    self.assertEqual(len(graph.getRemovedSet()), 0)

    edges = graph.getAllEdges()

    for i, j in edges:
        self.assertNotEqual(graph.vlist.V[i, HIVVertices.genderIndex], graph.vlist.V[j, HIVVertices.genderIndex])

    # Number of contacts = rate*people*time
    infectedSet = graph.getInfectedSet()
    numHetero = (graph.vlist.V[list(infectedSet), HIVVertices.orientationIndex] == HIVVertices.hetero).sum()
    self.assertTrue(abs(numHetero*endDate*heteroContactRate - model.getNumContacts()) < 100)

    heteroContactRate = 0.01
    meanTheta = numpy.array([100, 0.95, 1, 1, 0, 0, heteroContactRate, 0, 0, 0, 0], float)
    times, infectedIndices, removedIndices, graph, model = runModel(meanTheta)
    infectedSet = graph.getInfectedSet()
    numHetero = (graph.vlist.V[list(infectedSet), HIVVertices.orientationIndex] == HIVVertices.hetero).sum()
    self.assertAlmostEqual(numHetero*endDate*heteroContactRate/100, model.getNumContacts()/100.0, 0)
def modelSelect(self, X, ks, lmbdas, gammas, nFolds, maxNTry=5):
    """
    Choose parameters based on a single matrix X. We do cross validation
    within, and set parameters according to the mean squared error.

    Every combination of (k, lmbda, gamma) is evaluated on each fold with
    ``self.learnPredict``; the combination with the smallest mean error over
    the folds is written to ``self.baseLearner.k``, ``.lmbda`` and ``.gamma``.

    :param X: sparse matrix; converted with ``tocoo()`` and split by non-zero entry.
    :param ks: candidate values for k.
    :param lmbdas: candidate values for lmbda.
    :param gammas: candidate values for gamma.
    :param nFolds: number of cross validation folds.
    :param maxNTry: forwarded unchanged to ``self.learnPredict``.

    Return nothing.
    """
    logging.debug("Performing model selection")

    X = X.tocoo()
    gc.collect()
    nK = len(ks)
    nLmbda = len(lmbdas)
    nGamma = len(gammas)
    errors = numpy.zeros((nK, nLmbda, nGamma, nFolds))

    # generate cross validation sets
    cvInds = Sampling.randCrossValidation(nFolds, X.nnz)

    # compute error for each fold / setting
    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, nFolds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        # Sanity checks: the two parts must partition the entries of X.
        assert trainX.nnz == trainInds.shape[0]
        assert testX.nnz == testInds.shape[0]
        nptst.assert_array_almost_equal((testX+trainX).data, X.data)

        # Same ordering as nested loops over ks, lmbdas, gammas, so that the
        # flat result list reshapes directly to (nK, nLmbda, nGamma).
        paramList = [(trainX, testX, k, lmbda, gamma, maxNTry)
                     for k, lmbda, gamma in itertools.product(ks, lmbdas, gammas)]

        # Remark: the parameter settings could be evaluated in parallel;
        # parallelising over folds is avoided as it needs much more memory.
        results = numpy.array(list(itertools.starmap(self.learnPredict, paramList)))
        errors[:, :, :, icv] = results.reshape((nK, nLmbda, nGamma))

    # compute cross validation error for each setting
    # Infinite and NaN errors are replaced so they do not break the averages.
    errors[errors == float("inf")] = errors[errors != float("inf")].max()
    errors[numpy.isnan(errors)] = numpy.max(errors[numpy.logical_not(numpy.isnan(errors))])
    meanErrors = errors.mean(3)
    stdErrors = errors.std(3)
    logging.debug("Mean errors given (k, lambda, gamma):")
    logging.debug(meanErrors)
    logging.debug("... with standard deviation:")
    logging.debug(stdErrors)

    # keep the best: convert the flat argmin back to a (k, lmbda, gamma) index
    bestInds = numpy.unravel_index(meanErrors.argmin(), meanErrors.shape)
    iK, iLmbda, iGamma = (int(ind) for ind in bestInds)
    kMin = ks[iK]
    lmbdaMin = lmbdas[iLmbda]
    gammaMin = gammas[iGamma]
    logging.debug("argmin: (k, lambda, gamma) = (" + str(kMin) + ", " + str(lmbdaMin) + ", " + str(gammaMin) + ")")
    logging.debug("min = " + str(meanErrors[iK, iLmbda, iGamma]))

    self.baseLearner.k = kMin
    self.baseLearner.lmbda = lmbdaMin
    self.baseLearner.gamma = gammaMin

    return