def __generateRandomVertices(self, n):
    """
    Create an array of randomly initialised vertex features.

    The date of birth column is filled with numpy.random.rand, the gender
    and orientation columns are drawn with Util.randomChoice, the state
    column is zero and the infection time, detection time, detection type
    and hidden degree columns are all set to -1.

    :param n: the number of vertices (rows) to generate.
    :return: an array of shape (n, self.numFeatures).
    """
    vertexArray = numpy.zeros((n, self.numFeatures))

    vertexArray[:, self.dobIndex] = numpy.random.rand(n)

    genderWeights = numpy.array([1, 1])
    vertexArray[:, self.genderIndex] = Util.randomChoice(genderWeights, n)

    #Note in reality females cannot be recorded as bisexual but we model the real scenario
    #We assume that 5% of the population is gay or bisexual
    orientationWeights = numpy.array([19, 1])
    vertexArray[:, self.orientationIndex] = Util.randomChoice(orientationWeights, n)

    vertexArray[:, self.stateIndex] = numpy.zeros(n)

    #The remaining columns all start at -1
    minusOnes = numpy.ones(n)*-1
    for columnIndex in (self.infectionTimeIndex, self.detectionTimeIndex,
                        self.detectionTypeIndex, self.hiddenDegreeIndex):
        vertexArray[:, columnIndex] = minusOnes

    return vertexArray
def simulateModel(theta):
    """
    Run one HIV epidemic simulation with the parameters theta and return
    the objective reported by the model.

    The simulation graph is the subgraph of targetGraph selected by
    removedIndsAt(startDate), to which M - graph.size vertices are added.
    The names targetGraph, startDate, endDate, M, alpha, zeroVal, matchAlg,
    matchAlpha, breakSize and numRecordSteps are not parameters; they are
    read from the enclosing scope.

    :param theta: the parameters handed to HIVEpidemicModel.setParams.
    :return: the value of model.objective() once the simulation has run.
    """
    logging.debug("theta=" + str(theta))

    #We start with the observed graph at the start date
    graph = targetGraph.subgraph(targetGraph.removedIndsAt(startDate))
    graph.addVertices(M-graph.size)

    p = Util.powerLawProbs(alpha, zeroVal)
    hiddenDegSeq = Util.randomChoice(p, graph.getNumVertices())

    #Select every feature for matching except the four switched off below.
    #The builtin bool is used since the numpy.bool alias was removed in NumPy 1.24.
    featureInds = numpy.ones(graph.vlist.getNumFeatures(), bool)
    featureInds[HIVVertices.dobIndex] = False
    featureInds[HIVVertices.infectionTimeIndex] = False
    featureInds[HIVVertices.hiddenDegreeIndex] = False
    featureInds[HIVVertices.stateIndex] = False
    #Convert the boolean mask into the integer indices of the kept features
    featureInds = numpy.arange(featureInds.shape[0])[featureInds]

    matcher = GraphMatch(matchAlg, alpha=matchAlpha, featureInds=featureInds, useWeightM=False)
    graphMetrics = HIVGraphMetrics2(targetGraph, breakSize, matcher, float(endDate))

    recordStep = (endDate-startDate)/float(numRecordSteps)
    rates = HIVRates(graph, hiddenDegSeq)
    model = HIVEpidemicModel(graph, rates, T=float(endDate), T0=float(startDate), metrics=graphMetrics)
    model.setRecordStep(recordStep)
    model.setParams(theta)
    model.simulate()

    objective = model.objective()
    return objective
def growTree(self, X, y, argsortX, startId):
    """
    Grow a tree using a stack. Given a sample of data and a node index, we
    find the best split and add children to the tree accordingly.

    A node is split only if both sides of the split are non-empty and the
    current tree depth is below self.maxDepth. A new child is itself
    considered for splitting only if it holds at least self.minSplit
    training examples.

    :param X: the examples, indexed as X[:, featureInd].
    :param y: the labels, indexed by the training indices of a node.
    :param argsortX: passed through unchanged to findBestSplitRisk.
    :param startId: the id of the tree vertex from which to start growing.
    """
    eps = 10**-4
    idStack = [startId]

    while idStack:
        nodeId = idStack.pop()
        node = self.tree.getVertex(nodeId)
        nodeInds = node.getTrainInds()
        accuracies, thresholds = findBestSplitRisk(self.minSplit, X, y, nodeInds, argsortX)

        #Choose best feature based on gains. The feature is drawn by
        #Util.randomChoice from the accuracies, offset by eps (presumably so
        #that no feature has a weight of exactly zero).
        accuracies += eps
        bestFeatureInd = Util.randomChoice(accuracies)[0]
        bestThreshold = thresholds[bestFeatureInd]

        featureVals = X[:, bestFeatureInd][nodeInds]
        bestLeftInds = numpy.sort(nodeInds[featureVals < bestThreshold])
        bestRightInds = numpy.sort(nodeInds[featureVals >= bestThreshold])

        #The split may have 0 items in one set, so don't split. We test the
        #number of indices and not their sum, since a side that contains only
        #example 0 has a sum of zero but is not empty.
        if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0 and self.tree.depth() < self.maxDepth:
            node.setError(1-accuracies[bestFeatureInd])
            node.setFeatureInd(bestFeatureInd)
            node.setThreshold(bestThreshold)

            leftChildId = self.getLeftChildId(nodeId)
            leftChild = DecisionNode(bestLeftInds, Util.mode(y[bestLeftInds]))
            self.tree.addChild(nodeId, leftChildId, leftChild)

            if leftChild.getTrainInds().shape[0] >= self.minSplit:
                idStack.append(leftChildId)

            rightChildId = self.getRightChildId(nodeId)
            rightChild = DecisionNode(bestRightInds, Util.mode(y[bestRightInds]))
            self.tree.addChild(nodeId, rightChildId, rightChild)

            if rightChild.getTrainInds().shape[0] >= self.minSplit:
                idStack.append(rightChildId)
def findThetas(self, lastTheta, lastWeights, t):
    """
    Run batches of simulations until self.N thetas have been accepted for
    population t, or until self.maxRuns runs have been made.

    For t == 0 each candidate comes from self.abcParams.sampleParams().
    Otherwise a row of lastTheta is picked (uniformly if
    self.thetaUniformChoice is set, else with Util.randomChoice on
    lastWeights) and perturbed, and this is repeated until the candidate has
    a non-zero prior density. Each batch of self.batchSize candidates is
    evaluated by runModel in a pool of self.numProcesses processes, and the
    accepted thetas are read back with self.loadThetas(t).

    If self.autoEpsilon is set and t is not the last population,
    self.epsilonArray[t+1] is set to the mean of the loaded distances.

    :param lastTheta: array of the thetas of the previous population, one per row (not used when t == 0).
    :param lastWeights: the weights of the previous thetas (not used when t == 0).
    :param t: the index of the current population.
    :return: the thetas returned by self.loadThetas(t).
    """
    #This first draw is never used, but it is kept so that the sequence of
    #calls to sampleParams() is the same as before
    tempTheta = self.abcParams.sampleParams()
    currentTheta, dists = self.loadThetas(t)

    while len(currentTheta) < self.N:
        paramList = []

        for i in range(self.batchSize):
            if t == 0:
                tempTheta = self.abcParams.sampleParams()
            else:
                #Resample until the perturbed theta has non-zero prior density
                while True:
                    if self.thetaUniformChoice:
                        tempTheta = lastTheta[numpy.random.randint(self.N), :]
                    else:
                        tempTheta = lastTheta[Util.randomChoice(lastWeights)[0], :]

                    tempTheta = self.abcParams.perturbationKernel(tempTheta, numpy.std(lastTheta, 0)/self.pertScale)
                    if self.abcParams.priorDensity(tempTheta) != 0:
                        break

            paramList.append((tempTheta.copy(), self.createModel, t, self.epsilonArray[t], self.N, self.thetaDir))

        pool = multiprocessing.Pool(processes=self.numProcesses)
        try:
            #map blocks until every model in the batch has finished
            resultsIterator = pool.map(runModel, paramList)
        finally:
            #Always release the worker processes, including when a run
            #raises or when we leave the loop below
            pool.terminate()
        #resultsIterator = map(runModel, paramList)

        for result in resultsIterator:
            self.numRuns[t] += result[0]
            self.numAccepts[t] += result[1]

        #NOTE(review): on this path the thetas are not reloaded, so the value
        #returned is the one loaded before the final batch - confirm intended
        if self.numRuns[t] >= self.maxRuns:
            logging.debug("Maximum number of runs exceeded.")
            break

        currentTheta, dists = self.loadThetas(t)

    if self.autoEpsilon and t != self.T-1:
        self.epsilonArray[t+1] = numpy.mean(dists)
        logging.debug("Found new epsilon: " + str(self.epsilonArray[0:t+2]))

    logging.debug("Num accepts: " + str(self.numAccepts))
    logging.debug("Num runs: " + str(self.numRuns))
    #Add one to the populations with zero runs to avoid a division by zero.
    #The builtin int is used since the numpy.int alias was removed in NumPy 1.24.
    logging.debug("Acceptance rate: " + str(self.numAccepts/(self.numRuns + numpy.array(self.numRuns==0, int))))

    return currentTheta
def growTree(self, X, y, argsortX, startId):
    """
    Grow a tree using a stack. Given a sample of data and a node index, we
    find the best split and add children to the tree accordingly.

    A node is split only if both sides of the split are non-empty and the
    current tree depth is below self.maxDepth. A new child is itself
    considered for splitting only if it holds at least self.minSplit
    training examples.

    :param X: the examples, indexed as X[:, featureInd].
    :param y: the labels, indexed by the training indices of a node.
    :param argsortX: passed through unchanged to findBestSplitRisk.
    :param startId: the id of the tree vertex from which to start growing.
    """
    eps = 10**-4
    idStack = [startId]

    while idStack:
        nodeId = idStack.pop()
        node = self.tree.getVertex(nodeId)
        nodeInds = node.getTrainInds()
        accuracies, thresholds = findBestSplitRisk(self.minSplit, X, y, nodeInds, argsortX)

        #Choose best feature based on gains. The feature is drawn by
        #Util.randomChoice from the accuracies, offset by eps (presumably so
        #that no feature has a weight of exactly zero).
        accuracies += eps
        bestFeatureInd = Util.randomChoice(accuracies)[0]
        bestThreshold = thresholds[bestFeatureInd]

        featureVals = X[:, bestFeatureInd][nodeInds]
        bestLeftInds = numpy.sort(nodeInds[featureVals < bestThreshold])
        bestRightInds = numpy.sort(nodeInds[featureVals >= bestThreshold])

        #The split may have 0 items in one set, so don't split. We test the
        #number of indices and not their sum, since a side that contains only
        #example 0 has a sum of zero but is not empty.
        if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0 and self.tree.depth() < self.maxDepth:
            node.setError(1-accuracies[bestFeatureInd])
            node.setFeatureInd(bestFeatureInd)
            node.setThreshold(bestThreshold)

            leftChildId = self.getLeftChildId(nodeId)
            leftChild = DecisionNode(bestLeftInds, Util.mode(y[bestLeftInds]))
            self.tree.addChild(nodeId, leftChildId, leftChild)

            if leftChild.getTrainInds().shape[0] >= self.minSplit:
                idStack.append(leftChildId)

            rightChildId = self.getRightChildId(nodeId)
            rightChild = DecisionNode(bestRightInds, Util.mode(y[bestRightInds]))
            self.tree.addChild(nodeId, rightChildId, rightChild)

            if rightChild.getTrainInds().shape[0] >= self.minSplit:
                idStack.append(rightChildId)
def setUp(self):
    """
    Configure numpy and logging, seed the random number generator and
    create the graph and ABC parameter objects stored on the test case.
    """
    numpy.seterr(invalid='raise')
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    numpy.set_printoptions(suppress=True, precision=4, linewidth=100)
    numpy.random.seed(21)

    numVertices = 1000
    isUndirected = True
    hivGraph = HIVGraph(numVertices, isUndirected)

    alphaValue = 2
    zeroValue = 0.9
    degreeProbs = Util.powerLawProbs(alphaValue, zeroValue)
    hiddenDegrees = Util.randomChoice(degreeProbs, hivGraph.getNumVertices())

    #The rates object is constructed but is not stored on the test case
    unusedRates = HIVRates(hivGraph, hiddenDegrees)

    self.numParams = 6
    self.graph = hivGraph
    self.meanTheta = numpy.array([100, 0.9, 0.05, 0.001, 0.1, 0.005])
    self.hivAbcParams = HIVABCParameters(self.meanTheta, self.meanTheta/2)
def testRandomChoice(self):
    """
    Check that the indices returned by Util.randomChoice occur with
    frequencies close to the given weights. This is tested for single
    draws, for n draws from one weight vector and for n draws per row of a
    2D weight array; an empty weight vector is expected to give -1.
    """
    #With 500 draws the standard deviation of an observed frequency is about
    #0.02, so a tolerance of 0.1 makes a spurious failure very unlikely
    tol = 10**-1
    numSamples = 500

    def observedFreqs(weights):
        #Draw numSamples single indices and return the fraction of draws per index
        c = numpy.zeros(weights.shape[0])
        for i in range(numSamples):
            j = Util.randomChoice(weights)
            #logging.debug(j)
            c[j] += 1
        return c/numSamples

    uniform = numpy.ones(3)/3

    #Equal weights which do not sum to one are expected to give equal frequencies
    v = numpy.array([0.25, 0.25, 0.25])
    self.assertTrue((numpy.abs(observedFreqs(v) - uniform) < tol).all())

    v = v * 20
    self.assertTrue((numpy.abs(observedFreqs(v) - uniform) < tol).all())

    #Now try different distribution
    v = numpy.array([0.2, 0.6, 0.2])
    self.assertTrue((numpy.abs(observedFreqs(v) - v) < tol).all())

    #Test empty vector
    v = numpy.array([])
    self.assertEqual(Util.randomChoice(v), -1)

    #Test case where we want multiple random choices
    n = 1000
    v = numpy.array([0.2, 0.6, 0.2])
    j = Util.randomChoice(v, n)

    self.assertEqual(j.shape[0], n)
    self.assertAlmostEqual(numpy.sum(j==0)/float(n), v[0], places=1)
    self.assertAlmostEqual(numpy.sum(j==1)/float(n), v[1], places=1)

    #Now test the 2D case
    n = 2000
    V = numpy.array([[0.1, 0.3, 0.6], [0.6, 0.3, 0.1]])

    J = Util.randomChoice(V, n)
    self.assertEqual(J.shape[0], V.shape[0])
    self.assertEqual(J.shape[1], n)

    #Each row of J should follow the weights in the same row of V
    for row in range(V.shape[0]):
        for col in range(V.shape[1]):
            self.assertAlmostEqual(numpy.sum(J[row, :]==col)/float(n), V[row, col], places=1)
def runRandom2Choice():
    """
    Call Util.randomChoice(V, m) 100 times and discard the results. V and m
    are read from the enclosing scope.
    """
    remaining = 100

    while remaining > 0:
        Util.randomChoice(V, m)
        remaining -= 1
def findThetas(self, lastTheta, lastWeights, t):
    """
    Run batches of simulations until self.N thetas have been accepted for
    population t, or until self.maxRuns runs have been made.

    For t == 0 each candidate comes from self.abcParams.sampleParams().
    Otherwise a row of lastTheta is picked (uniformly if
    self.thetaUniformChoice is set, else with Util.randomChoice on
    lastWeights) and perturbed, and this is repeated until the candidate has
    a non-zero prior density. Each batch of self.batchSize candidates is
    evaluated by runModel in a pool of self.numProcesses processes, and the
    accepted thetas are read back with self.loadThetas(t).

    If self.autoEpsilon is set and t is not the last population,
    self.epsilonArray[t + 1] is set to the mean of the loaded distances.

    :param lastTheta: array of the thetas of the previous population, one per row (not used when t == 0).
    :param lastWeights: the weights of the previous thetas (not used when t == 0).
    :param t: the index of the current population.
    :return: the thetas returned by self.loadThetas(t).
    """
    #This first draw is never used, but it is kept so that the sequence of
    #calls to sampleParams() is the same as before
    tempTheta = self.abcParams.sampleParams()
    currentTheta, dists = self.loadThetas(t)

    while len(currentTheta) < self.N:
        paramList = []

        for i in range(self.batchSize):
            if t == 0:
                tempTheta = self.abcParams.sampleParams()
            else:
                #Resample until the perturbed theta has non-zero prior density
                while True:
                    if self.thetaUniformChoice:
                        tempTheta = lastTheta[
                            numpy.random.randint(self.N), :]
                    else:
                        tempTheta = lastTheta[
                            Util.randomChoice(lastWeights)[0], :]

                    tempTheta = self.abcParams.perturbationKernel(
                        tempTheta,
                        numpy.std(lastTheta, 0) / self.pertScale)
                    if self.abcParams.priorDensity(tempTheta) != 0:
                        break

            paramList.append(
                (tempTheta.copy(), self.createModel, t,
                 self.epsilonArray[t], self.N, self.thetaDir))

        pool = multiprocessing.Pool(processes=self.numProcesses)
        try:
            #map blocks until every model in the batch has finished
            resultsIterator = pool.map(runModel, paramList)
        finally:
            #Always release the worker processes, including when a run
            #raises or when we leave the loop below
            pool.terminate()
        #resultsIterator = map(runModel, paramList)

        for result in resultsIterator:
            self.numRuns[t] += result[0]
            self.numAccepts[t] += result[1]

        #NOTE(review): on this path the thetas are not reloaded, so the value
        #returned is the one loaded before the final batch - confirm intended
        if self.numRuns[t] >= self.maxRuns:
            logging.debug("Maximum number of runs exceeded.")
            break

        currentTheta, dists = self.loadThetas(t)

    if self.autoEpsilon and t != self.T - 1:
        self.epsilonArray[t + 1] = numpy.mean(dists)
        logging.debug("Found new epsilon: " +
                      str(self.epsilonArray[0:t + 2]))

    logging.debug("Num accepts: " + str(self.numAccepts))
    logging.debug("Num runs: " + str(self.numRuns))
    #Add one to the populations with zero runs to avoid a division by zero.
    #The builtin int is used since the numpy.int alias was removed in NumPy 1.24.
    logging.debug(
        "Acceptance rate: " +
        str(self.numAccepts /
            (self.numRuns + numpy.array(self.numRuns == 0, int))))

    return currentTheta
def testRandomChoice(self):
    """
    Check that the indices returned by Util.randomChoice occur with
    frequencies close to the given weights. This is tested for single
    draws, for n draws from one weight vector and for n draws per row of a
    2D weight array; an empty weight vector is expected to give -1.
    """
    #With 500 draws the standard deviation of an observed frequency is about
    #0.02, so a tolerance of 0.1 makes a spurious failure very unlikely
    tol = 10**-1
    numSamples = 500

    def observedFreqs(weights):
        #Draw numSamples single indices and return the fraction of draws per index
        c = numpy.zeros(weights.shape[0])
        for i in range(numSamples):
            j = Util.randomChoice(weights)
            #logging.debug(j)
            c[j] += 1
        return c / numSamples

    uniform = numpy.ones(3) / 3

    #Equal weights which do not sum to one are expected to give equal frequencies
    v = numpy.array([0.25, 0.25, 0.25])
    self.assertTrue((numpy.abs(observedFreqs(v) - uniform) < tol).all())

    v = v * 20
    self.assertTrue((numpy.abs(observedFreqs(v) - uniform) < tol).all())

    #Now try different distribution
    v = numpy.array([0.2, 0.6, 0.2])
    self.assertTrue((numpy.abs(observedFreqs(v) - v) < tol).all())

    #Test empty vector
    v = numpy.array([])
    self.assertEqual(Util.randomChoice(v), -1)

    #Test case where we want multiple random choices
    n = 1000
    v = numpy.array([0.2, 0.6, 0.2])
    j = Util.randomChoice(v, n)

    self.assertEqual(j.shape[0], n)
    self.assertAlmostEqual(numpy.sum(j == 0) / float(n), v[0], places=1)
    self.assertAlmostEqual(numpy.sum(j == 1) / float(n), v[1], places=1)

    #Now test the 2D case
    n = 2000
    V = numpy.array([[0.1, 0.3, 0.6], [0.6, 0.3, 0.1]])

    J = Util.randomChoice(V, n)
    self.assertEqual(J.shape[0], V.shape[0])
    self.assertEqual(J.shape[1], n)

    #Each row of J should follow the weights in the same row of V
    for row in range(V.shape[0]):
        for col in range(V.shape[1]):
            self.assertAlmostEqual(
                numpy.sum(J[row, :] == col) / float(n),
                V[row, col],
                places=1)