def next(self):
                """
                Fetch the next matrix from self.XIterator and learn a model on it.

                On the first call (self.ZListSGD is None) the base learner is
                trained from scratch. On later calls the previous factors (P, Q)
                are grown with Util.extendArray or truncated by slicing, according
                to X.shape, and passed to the learner as the starting point Z.
                If that warm-started run raises FloatingPointError, ValueError or
                SGDNorm2Reg.ArithmeticError, a warning is logged and the
                decomposition is learnt again without a starting point.

                :return: the result of baseLearner.learnModel, which is also
                    stored in self.ZListSGD
                """
                X = self.XIterator.next()

                # Identity test: "== None" would be dispatched to __eq__ of the stored object
                if self.ZListSGD is None:
                    # assumption : training matrix centered by row and column
                    self.ZListSGD = self.baseLearner.learnModel(X, storeAll=False)
                    return self.ZListSGD

                #In the case the matrix size changes, we alter P and Q to fit the new data
                P, Q = self.ZListSGD[0]
                numRows, numCols = X.shape[0], X.shape[1]

                if numRows > P.shape[0]:
                    P = Util.extendArray(P, (numRows, P.shape[1]))
                elif numRows < P.shape[0]:
                    P = P[0:numRows, :]

                if numCols > Q.shape[0]:
                    Q = Util.extendArray(Q, (numCols, Q.shape[1]))
                elif numCols < Q.shape[0]:
                    Q = Q[0:numCols, :]

                self.ZListSGD = [(P, Q)]

                # Name of the error that interrupted the warm start, if any
                failure = None
                try:
                    self.ZListSGD = self.baseLearner.learnModel(X, Z=self.ZListSGD, storeAll=False)
                except FloatingPointError:
                    failure = "FloatingPointError"
                except ValueError:
                    failure = "ValueError"
                except SGDNorm2Reg.ArithmeticError:
                    failure = "ArithmeticError"

                if failure is not None:
                    # Single fallback path shared by all handled errors
                    logging.warning(failure + " encountered, reinitialise the matrix decomposition")
                    self.ZListSGD = self.baseLearner.learnModel(X, storeAll=False)
                return self.ZListSGD
Example #2
0
 def createModel(targetGraph, startDate, endDate, recordStep, M, matchAlpha, breakSize, matchAlg, theta=None):
     """
     Build an HIVEpidemicModel whose initial graph is derived from targetGraph.

     The initial graph is the subgraph of targetGraph selected by
     targetGraph.removedIndsAt(startDate), padded with addVertices so that
     M - graph.size vertices are added. The model is given an
     HIVGraphMetrics2 object that uses a GraphMatch built from matchAlg and
     matchAlpha. The numpy random seed is fixed to 21 before the graph is built.

     :param targetGraph: graph providing subgraph() and removedIndsAt()
     :param startDate: passed to removedIndsAt, to the metrics and, as float, to the model as T0
     :param endDate: passed, as float, to the model as T
     :param recordStep: passed to model.setRecordStep
     :param M: number used to compute how many vertices are added
     :param matchAlpha: alpha argument of GraphMatch
     :param breakSize: passed to HIVGraphMetrics2
     :param matchAlg: algorithm argument of GraphMatch
     :param theta: optional parameters passed to model.setParams when not None
     :return: the configured model
     """
     alpha = 2
     zeroVal = 0.9
     numpy.random.seed(21)

     graph = targetGraph.subgraph(targetGraph.removedIndsAt(startDate))
     graph.addVertices(M-graph.size)
     logging.debug("Created graph: " + str(graph))

     p = Util.powerLawProbs(alpha, zeroVal)
     hiddenDegSeq = Util.randomChoice(p, graph.getNumVertices())

     # Mask of the features handed to the matcher; the builtin bool replaces
     # the numpy.bool alias, which current NumPy no longer provides
     featureMask = numpy.ones(graph.vlist.getNumFeatures(), bool)
     excludedFeatures = (
         HIVVertices.dobIndex,
         HIVVertices.infectionTimeIndex,
         HIVVertices.hiddenDegreeIndex,
         HIVVertices.stateIndex,
     )
     for featureIndex in excludedFeatures:
         featureMask[featureIndex] = False
     featureInds = numpy.arange(featureMask.shape[0])[featureMask]
     matcher = GraphMatch(matchAlg, alpha=matchAlpha, featureInds=featureInds, useWeightM=False)
     graphMetrics = HIVGraphMetrics2(targetGraph, breakSize, matcher, startDate)

     rates = HIVRates(graph, hiddenDegSeq)
     model = HIVEpidemicModel(graph, rates, T=float(endDate), T0=float(startDate), metrics=graphMetrics)
     model.setRecordStep(recordStep)
     # "is not None": "theta != None" is evaluated elementwise when theta is a
     # numpy array, and the resulting array cannot be used as an if-condition
     if theta is not None:
         model.setParams(theta)

     return model
Example #3
0
    def recordResults(self, clusterList, timeList, fileName):
        """
        Compute quality measures for every clustering and save them with numpy.savez.

        For the i-th entry of clusterList the next matrix W is taken from
        self.getIterator(); the stored row holds the modularity, the k-way
        normalised cut and the number of distinct cluster labels. The number
        of rows of W is recorded separately. The archive written to fileName
        contains, in order, the measures, timeList and the graph sizes.
        """
        graphIterator = self.getIterator()
        numClusterings = len(clusterList)
        measureRows = []
        sizeRows = []
        logging.debug("Computing cluster measures")

        for ind, clustering in enumerate(clusterList):
            Util.printIteration(ind, self.logStep, numClusterings)
            W = next(graphIterator)

            # modularity, k-way normalised cut, and number of clusters
            measureRows.append([
                GraphUtils.modularity(W, clustering),
                GraphUtils.kwayNormalisedCut(W, clustering),
                len(numpy.unique(clustering)),
            ])

            # graph size
            sizeRows.append([W.shape[0]])

        numpy.savez(fileName, numpy.array(measureRows), timeList, numpy.array(sizeRows))
        logging.debug("Saved file as " + fileName)
Example #4
0
            def next(self):
                """
                Fetch the next matrix from self.XIterator and learn a model on it.

                On the first call (self.ZListSGD is None) the base learner is
                trained from scratch. On later calls the previous factors (P, Q)
                are grown with Util.extendArray or truncated by slicing, according
                to X.shape, and passed to the learner as the starting point Z.
                If that warm-started run raises FloatingPointError, ValueError or
                SGDNorm2Reg.ArithmeticError, a warning is logged and the
                decomposition is learnt again without a starting point.

                :return: the result of baseLearner.learnModel, which is also
                    stored in self.ZListSGD
                """
                X = self.XIterator.next()

                # Identity test: "== None" would be dispatched to __eq__ of the stored object
                if self.ZListSGD is None:
                    # assumption : training matrix centered by row and column
                    self.ZListSGD = self.baseLearner.learnModel(X, storeAll=False)
                    return self.ZListSGD

                #In the case the matrix size changes, we alter P and Q to fit the new data
                P, Q = self.ZListSGD[0]
                numRows, numCols = X.shape[0], X.shape[1]

                if numRows > P.shape[0]:
                    P = Util.extendArray(P, (numRows, P.shape[1]))
                elif numRows < P.shape[0]:
                    P = P[0:numRows, :]

                if numCols > Q.shape[0]:
                    Q = Util.extendArray(Q, (numCols, Q.shape[1]))
                elif numCols < Q.shape[0]:
                    Q = Q[0:numCols, :]

                self.ZListSGD = [(P, Q)]

                # Name of the error that interrupted the warm start, if any
                failure = None
                try:
                    self.ZListSGD = self.baseLearner.learnModel(
                        X, Z=self.ZListSGD, storeAll=False)
                except FloatingPointError:
                    failure = "FloatingPointError"
                except ValueError:
                    failure = "ValueError"
                except SGDNorm2Reg.ArithmeticError:
                    failure = "ArithmeticError"

                if failure is not None:
                    # Single fallback path shared by all handled errors
                    logging.warning(
                        failure + " encountered, reinitialise the matrix decomposition")
                    self.ZListSGD = self.baseLearner.learnModel(
                        X, storeAll=False)
                return self.ZListSGD
Example #5
0
 def simulate(theta, startDate, endDate, recordStep, M, graphMetrics=None):
     """
     Create a fresh undirected HIVGraph with M as its size argument, configure
     an HIVEpidemicModel on it with parameters theta, and return the result of
     model.simulate(True).

     graphMetrics is forwarded to the model as its metrics argument.
     """
     contactGraph = HIVGraph(M, True)
     logging.debug("Created graph: " + str(contactGraph))

     # Hidden degree sequence drawn with Util.randomChoice from
     # Util.powerLawProbs(alpha=2, zeroVal=0.9)
     degreeProbs = Util.powerLawProbs(2, 0.9)
     hiddenDegrees = Util.randomChoice(degreeProbs, contactGraph.getNumVertices())

     eventRates = HIVRates(contactGraph, hiddenDegrees)
     epidemic = HIVEpidemicModel(contactGraph, eventRates, endDate, startDate, metrics=graphMetrics)
     epidemic.setRecordStep(recordStep)
     epidemic.setParams(theta)

     logging.debug("Theta = " + str(theta))

     return epidemic.simulate(True)
def runModel(theta, endDate=100.0, M=1000):
    """
    Seed numpy's random generator with 21, simulate an HIVEpidemicModel with
    parameters theta on a new undirected HIVGraph, and return the simulation
    outputs followed by the model itself.

    The model runs from start date 0 to endDate with a record step of 10.
    """
    numpy.random.seed(21)

    # Same call order as before: probabilities, then graph, then degree sequence
    degreeProbs = Util.powerLawProbs(2, 0.9)
    contactGraph = HIVGraph(M, True)
    hiddenDegrees = Util.randomChoice(degreeProbs, contactGraph.getNumVertices())
    logging.debug("MeanTheta=" + str(theta))

    eventRates = HIVRates(contactGraph, hiddenDegrees)
    epidemic = HIVEpidemicModel(contactGraph, eventRates, endDate, 0)
    epidemic.setRecordStep(10)
    epidemic.setParams(theta)

    times, infectedIndices, removedIndices, finalGraph = epidemic.simulate(True)

    return times, infectedIndices, removedIndices, finalGraph, epidemic
    def testSimulate2(self):
        """
        Check that two simulations started from the same numpy random seed
        end with the same number of vertices and the same number of edges.
        """
        alpha = 2
        zeroVal = 0.9
        startDate = 0.0
        endDate = 200.0
        M = 1000
        undirected = True

        # Only theta is used; the call returns three values
        theta, _sigmaTheta, _pertTheta = HIVModelUtils.toyTheta()

        def runSeededSimulation():
            # Re-seed so that both runs consume the same random stream
            numpy.random.seed(21)
            graph = HIVGraph(M, undirected)
            p = Util.powerLawProbs(alpha, zeroVal)
            hiddenDegSeq = Util.randomChoice(p, graph.getNumVertices())

            rates = HIVRates(graph, hiddenDegSeq)
            model = HIVEpidemicModel(graph, rates, endDate, startDate, metrics=None)
            model.setParams(theta)
            times, infectedIndices, removedIndices, graph = model.simulate(True)

            return graph.size, graph.getNumEdges()

        numVertices, numEdges = runSeededSimulation()

        #Try again
        numVertices2, numEdges2 = runSeededSimulation()

        # assertEqual: the assertEquals alias is deprecated and absent from Python 3.12
        self.assertEqual(numVertices2, numVertices)
        self.assertEqual(numEdges2, numEdges)
Example #8
0
    def modelSelect(self, X, ks, lmbdas, gammas, nFolds, maxNTry=5):
        """
        Choose parameters based on a single matrix X. We do cross validation
        within, and set parameters according to the mean squared error.
        Return nothing.

        :param X: matrix providing tocoo(); its non-zero entries are split into folds
        :param ks: candidate values for baseLearner.k
        :param lmbdas: candidate values for baseLearner.lmbda
        :param gammas: candidate values for baseLearner.gamma
        :param nFolds: number of cross validation folds
        :param maxNTry: forwarded unchanged to self.learnPredict

        The combination with the lowest error averaged over the folds is
        written to self.baseLearner.k, .lmbda and .gamma.
        """
        logging.debug("Performing model selection")

        X = X.tocoo()
        gc.collect()
        nLmbda = len(lmbdas)
        nGamma = len(gammas)
        gridShape = (len(ks), nLmbda, nGamma)
        # numpy is used directly: the scipy.zeros / scipy.array / scipy.floor
        # names were only deprecated re-exports of the NumPy functions
        errors = numpy.zeros(gridShape + (nFolds,))

        # generate cross validation sets
        cvInds = Sampling.randCrossValidation(nFolds, X.nnz)

        # compute error for each fold / setting
        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, nFolds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)

            assert trainX.nnz == trainInds.shape[0]
            assert testX.nnz == testInds.shape[0]
            nptst.assert_array_almost_equal((testX + trainX).data, X.data)

            # Settings are listed with k varying slowest and gamma fastest,
            # which is the order the reshape below relies on
            paramList = [(trainX, testX, k, lmbda, gamma, maxNTry)
                         for k in ks
                         for lmbda in lmbdas
                         for gamma in gammas]

            # ! Remark !
            # we can parallelize the run of parameters easily.
            # parallelizing the run of cv-folds is not done as it is much more
            # memory-consuming

            # parallel version (copied from IterativeSoftImpute, but not tested)
            #pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()/2, maxtasksperchild=10)
            #results = pool.imap(self.learnPredict, paramList)
            #pool.terminate()

            # non-parallel version
            results = numpy.array(
                list(itertools.starmap(self.learnPredict, paramList)))

            errors[:, :, :, icv] = results.reshape(gridShape)

        # compute cross validation error for each setting
        # infinite and NaN errors are replaced by the largest remaining error
        errors[errors == float("inf")] = errors[errors != float("inf")].max()
        errors[numpy.isnan(errors)] = numpy.max(errors[numpy.logical_not(
            numpy.isnan(errors))])
        meanErrors = errors.mean(3)
        stdErrors = errors.std(3)
        logging.debug("Mean errors given (k, lambda, gamma):")
        logging.debug(meanErrors)
        logging.debug("... with standard deviation:")
        logging.debug(stdErrors)

        # keep the best: split the flat argmin into (k, lambda, gamma) indices
        iK, remainder = divmod(int(meanErrors.argmin()), nLmbda * nGamma)
        iLmbda, iGamma = divmod(remainder, nGamma)
        kMin = ks[iK]
        lmbdaMin = lmbdas[iLmbda]
        gammaMin = gammas[iGamma]
        logging.debug("argmin: (k, lambda, gamma) = (" + str(kMin) + ", " +
                      str(lmbdaMin) + ", " + str(gammaMin) + ")")
        logging.debug("min = " + str(meanErrors[iK, iLmbda, iGamma]))

        self.baseLearner.k = kMin
        self.baseLearner.lmbda = lmbdaMin
        self.baseLearner.gamma = gammaMin

        return
Example #9
0
# Plot, for every regression dataset, the running minimum of the stored
# "Beta" values against sqrt(gamma).
datasets = ModelSelectUtils.getRegressionDatasets(True)
# dtype=int: numpy.int was an alias of the builtin int and is not available
# in current NumPy releases
gammas = numpy.unique(numpy.array(numpy.round(2 ** numpy.arange(1, 7.25, 0.25) - 1), dtype=int))


print(gammas)
# To use the betas in practice, pick the lowest value so far

# The selected gammas do not depend on the dataset, so compute them once
inds = gammas > 10
tempGamma = numpy.sqrt(gammas[inds])

for datasetName, numRealisations in datasets:
    # Only the load is guarded: a missing file or missing array is reported,
    # while genuine errors in the plotting code are no longer mislabelled
    try:
        A = numpy.load(outputDir + datasetName + "Beta.npz")["arr_0"]
    except (IOError, KeyError):
        print("Dataset not found " + datasetName)
        continue

    tempA = numpy.clip(A[inds, :], 0, 1)

    plt.figure(0)
    # One curve per column of A, labelled as in the legend
    for column, label in enumerate(("50", "100", "200")):
        plt.plot(tempGamma, Util.cumMin(tempA[:, column]), label=label)
    plt.legend()
    plt.title(datasetName)
    plt.xlabel("gamma")
    plt.ylabel("Beta")

    plt.show()
    def testSimulate2(self):
        """
        Simulate the epidemic model and check properties of the final graph.

        Three scenarios are covered: a run with the estimated real theta
        (with meanTheta[4] overridden to 0.1), a run via runModel in which
        every entry of theta from index 4 onwards is zero, and two runs in
        which only entry 6 of theta (called heteroContactRate here) is
        non-zero.
        """
        startDate = 0.0
        endDate = 100.0
        M = 1000
        meanTheta, sigmaTheta = HIVModelUtils.estimatedRealTheta()

        undirected = True
        graph = HIVGraph(M, undirected)

        alpha = 2
        zeroVal = 0.9
        p = Util.powerLawProbs(alpha, zeroVal)
        hiddenDegSeq = Util.randomChoice(p, graph.getNumVertices())

        meanTheta[4] = 0.1

        recordStep = 10
        printStep = 10
        rates = HIVRates(graph, hiddenDegSeq)
        model = HIVEpidemicModel(graph, rates, endDate, startDate)
        model.setRecordStep(recordStep)
        model.setPrintStep(printStep)
        model.setParams(meanTheta)

        initialInfected = graph.getInfectedSet()

        times, infectedIndices, removedIndices, graph = model.simulate(True)

        #Now test the final graph
        edges = graph.getAllEdges()

        # An edge between vertices of equal gender is only accepted when both
        # end points have orientation HIVVertices.bi
        for i, j in edges:
            sameGender = graph.vlist.V[i, HIVVertices.genderIndex] == graph.vlist.V[j, HIVVertices.genderIndex]
            bothBi = (graph.vlist.V[i, HIVVertices.orientationIndex] == HIVVertices.bi
                      and graph.vlist.V[j, HIVVertices.orientationIndex] == HIVVertices.bi)
            if sameGender and not bothBi:
                self.fail("Unexpected same-gender edge (" + str(i) + ", " + str(j) + ")")

        finalInfected = graph.getInfectedSet()
        finalRemoved = graph.getRemovedSet()

        # Every initially infected vertex must be in the final removed set
        # NOTE(review): assumes getInfectedSet/getRemovedSet return array-likes
        # accepted by numpy.intersect1d - confirm
        self.assertEqual(numpy.intersect1d(initialInfected, finalRemoved).shape[0], len(initialInfected))

        def contactTheta(heteroContactRate):
            # float replaces the numpy.float alias, which current NumPy no longer provides
            return numpy.array([100, 0.95, 1, 1, 0, 0, heteroContactRate, 0, 0, 0, 0], float)

        def countHetero(simulatedGraph):
            # Number of infected vertices whose orientation is HIVVertices.hetero
            infected = list(simulatedGraph.getInfectedSet())
            return (simulatedGraph.vlist.V[infected, HIVVertices.orientationIndex] == HIVVertices.hetero).sum()

        #Test case where there is no contact
        times, infectedIndices, removedIndices, graph, model = runModel(contactTheta(0))

        self.assertEqual(len(graph.getInfectedSet()), 100)
        self.assertEqual(len(graph.getRemovedSet()), 0)
        self.assertEqual(graph.getNumEdges(), 0)

        heteroContactRate = 0.1
        times, infectedIndices, removedIndices, graph, model = runModel(contactTheta(heteroContactRate))

        self.assertEqual(len(graph.getInfectedSet()), 100)
        self.assertEqual(len(graph.getRemovedSet()), 0)

        edges = graph.getAllEdges()

        for i, j in edges:
            self.assertNotEqual(graph.vlist.V[i, HIVVertices.genderIndex], graph.vlist.V[j, HIVVertices.genderIndex])

        #Number of contacts = rate*people*time
        numHetero = countHetero(graph)
        self.assertTrue(abs(numHetero*endDate*heteroContactRate - model.getNumContacts()) < 100)

        heteroContactRate = 0.01
        times, infectedIndices, removedIndices, graph, model = runModel(contactTheta(heteroContactRate))
        numHetero = countHetero(graph)
        self.assertAlmostEqual(numHetero*endDate*heteroContactRate/100, model.getNumContacts()/100.0, 0)
    def modelSelect(self, X, ks, lmbdas, gammas, nFolds, maxNTry=5):
        """
        Choose parameters based on a single matrix X. We do cross validation
        within, and set parameters according to the mean squared error.
        Return nothing.

        :param X: matrix providing tocoo(); its non-zero entries are split into folds
        :param ks: candidate values for baseLearner.k
        :param lmbdas: candidate values for baseLearner.lmbda
        :param gammas: candidate values for baseLearner.gamma
        :param nFolds: number of cross validation folds
        :param maxNTry: forwarded unchanged to self.learnPredict

        The combination with the lowest error averaged over the folds is
        written to self.baseLearner.k, .lmbda and .gamma.
        """
        logging.debug("Performing model selection")

        X = X.tocoo()
        gc.collect()
        nLmbda = len(lmbdas)
        nGamma = len(gammas)
        gridShape = (len(ks), nLmbda, nGamma)
        # numpy is used directly: the scipy.zeros / scipy.array / scipy.floor
        # names were only deprecated re-exports of the NumPy functions
        errors = numpy.zeros(gridShape + (nFolds,))

        # generate cross validation sets
        cvInds = Sampling.randCrossValidation(nFolds, X.nnz)

        # compute error for each fold / setting
        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, nFolds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)

            assert trainX.nnz == trainInds.shape[0]
            assert testX.nnz == testInds.shape[0]
            nptst.assert_array_almost_equal((testX+trainX).data, X.data)

            # Settings are listed with k varying slowest and gamma fastest,
            # which is the order the reshape below relies on
            paramList = [(trainX, testX, k, lmbda, gamma, maxNTry)
                         for k in ks
                         for lmbda in lmbdas
                         for gamma in gammas]

            # ! Remark !
            # we can parallelize the run of parameters easily.
            # parallelizing the run of cv-folds is not done as it is much more
            # memory-consuming

            # parallel version (copied from IterativeSoftImpute, but not tested)
            #pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()/2, maxtasksperchild=10)
            #results = pool.imap(self.learnPredict, paramList)
            #pool.terminate()

            # non-parallel version
            results = numpy.array(list(itertools.starmap(self.learnPredict, paramList)))

            errors[:, :, :, icv] = results.reshape(gridShape)

        # compute cross validation error for each setting
        # infinite and NaN errors are replaced by the largest remaining error
        errors[errors == float("inf")] = errors[errors != float("inf")].max()
        errors[numpy.isnan(errors)] = numpy.max(errors[numpy.logical_not(numpy.isnan(errors))])
        meanErrors = errors.mean(3)
        stdErrors = errors.std(3)
        logging.debug("Mean errors given (k, lambda, gamma):")
        logging.debug(meanErrors)
        logging.debug("... with standard deviation:")
        logging.debug(stdErrors)

        # keep the best: split the flat argmin into (k, lambda, gamma) indices
        iK, remainder = divmod(int(meanErrors.argmin()), nLmbda * nGamma)
        iLmbda, iGamma = divmod(remainder, nGamma)
        kMin = ks[iK]
        lmbdaMin = lmbdas[iLmbda]
        gammaMin = gammas[iGamma]
        logging.debug("argmin: (k, lambda, gamma) = (" + str(kMin) + ", " + str(lmbdaMin) + ", " + str(gammaMin) + ")")
        logging.debug("min = " + str(meanErrors[iK, iLmbda, iGamma]))

        self.baseLearner.k = kMin
        self.baseLearner.lmbda = lmbdaMin
        self.baseLearner.gamma = gammaMin

        return