def testGraphFromMatFile(self):
    """
    Check that the graph built by EgoUtils.graphFromMatFile agrees with the
    examples read from the same mat file by ExamplesList.readFromMatFile.

    The assertions expect one edge and two vertices per example, and each
    vertex to carry half of the columns of "X" plus one trailing feature.
    """
    matFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoAlterTransmissions1000.mat"
    sGraph = EgoUtils.graphFromMatFile(matFileName)

    examplesList = ExamplesList.readFromMatFile(matFileName)
    numFeatures = examplesList.getDataFieldSize("X", 1)
    #Floor division: "/" yields a float on Python 3, which is rejected as a slice bound
    numPersonFeatures = numFeatures // 2
    numVertexFeatures = sGraph.getVertexList().getNumFeatures()

    self.assertEqual(examplesList.getNumExamples(), sGraph.getNumEdges())
    self.assertEqual(examplesList.getNumExamples()*2, sGraph.getNumVertices())
    self.assertEqual(numPersonFeatures+1, numVertexFeatures)

    #Every even vertex has information, odd does not
    for i in range(sGraph.getNumVertices()):
        vertex = sGraph.getVertex(i)
        expectedInfo = 1 if i % 2 == 0 else 0
        self.assertEqual(vertex[numVertexFeatures-1], expectedInfo)

    #Test the first few vertices are the same
    for i in range(10):
        #Example i supplies vertex 2i (first half of the row) and vertex 2i+1 (second half)
        exampleRow = examplesList.getSubDataField("X", numpy.array([i])).ravel()

        vertex1 = sGraph.getVertex(i*2)[0:numPersonFeatures]
        vertex2 = sGraph.getVertex(i*2+1)[0:numPersonFeatures]
        vertexEx1 = exampleRow[0:numPersonFeatures]
        vertexEx2 = exampleRow[numPersonFeatures:numFeatures]

        self.assertTrue((vertex1 == vertexEx1).all())
        self.assertTrue((vertex2 == vertexEx2).all())
def setUp(self):
    """
    Build the fixtures shared by the tests.

    Sets self.means, self.vars, self.egoGenerator, self.sGraph (a graph whose
    edges come from SmallWorldGenerator), self.nb (a NaiveBayes model learned
    from the "X"/"y" fields of the ego-alter mat file after randomSubData),
    self.egoSimulator and self.dc (a rule-based dummy classifier).
    """
    numVertices = 500
    numFeatures = 49

    self.means = rand.randn(numFeatures)
    self.vars = rand.randn(numFeatures, numFeatures)
    self.vars = self.vars + self.vars.T #Make vars symmetric
    p1 = 0.1

    self.egoGenerator = EgoGenerator()
    vList = self.egoGenerator.generateIndicatorVertices(numVertices, self.means, self.vars, p1)
    sGraph = SparseGraph(vList)

    p2 = 0.1
    k = 5

    #Create the graph edges according to the small world model
    graphGen = SmallWorldGenerator(p2, k)
    self.sGraph = graphGen.generate(sGraph)

    dataDir = PathDefaults.getDataDir() + "infoDiffusion/"
    matFileName = dataDir + "EgoAlterTransmissions1000.mat"
    sampleSize = 100

    egoAlterExamplesList = ExamplesList.readFromMatFile(matFileName)
    egoAlterExamplesList.setDefaultExamplesName("X")
    egoAlterExamplesList.setLabelsName("y")
    egoAlterExamplesList.randomSubData(sampleSize)

    X = egoAlterExamplesList.getDataField("X")
    y = egoAlterExamplesList.getDataField("y")

    #Now learn using NaiveBayes
    self.nb = NaiveBayes()
    self.nb.learnModel(X, y)

    self.egoSimulator = EgoSimulator(self.sGraph, self.nb)

    #Define a classifier which predicts transfer if gender is female
    #NOTE(review): presumably column 0 holds gender with 0 meaning female - confirm
    class DummyClassifier(object):
        #Was spelt "__init", which is only a name-mangled method and never a constructor
        def __init__(self):
            pass

        def classify(self, X):
            """
            Return an array with 1 for each row of X whose first entry is 0,
            and -1 for every other row.
            """
            y = numpy.zeros((X.shape[0]))

            for i in range(X.shape[0]):
                if X[i, 0] == 0:
                    y[i] = 1
                else:
                    y[i] = -1

            return y

    self.dc = DummyClassifier()
def readFiles(self, egoFileName, alterFileName, missing=0):
    """
    Read the ego and alter files and assemble labelled (ego, alter) examples.

    Each row of X is an ego row from egoArray followed by the row of a
    receiver (label 1) or a generated non-receiver (label -1). Non-receiver
    examples come first, followed by receiver examples.

    :param egoFileName: file passed to self.readFile for the ego questions.
    :param alterFileName: file passed to self.readFile for the alter questions.
    :param missing: forwarded unchanged to every self.readFile call.
    :return: the tuple (examplesList, egoIndicesR, alterIndices, egoIndicesNR,
        alterIndicesNR), where examplesList has data fields "X" and "y".
    """
    (egoArray, egoTitles) = self.readFile(egoFileName, self.egoQuestionIds, missing)
    (alterArray, alterTitles) = self.readFile(alterFileName, self.alterQuestionIds, missing)

    #Augment receivers with new information
    egoAlterQuestionIds = self.__getAlterQuestionIds()
    alterFieldIndices = self.getAlterFieldIndices()

    (egoAlterArray, egoAlterTitles) = self.readFile(egoFileName, egoAlterQuestionIds, missing)
    (receiversArray, egoIndicesR, alterIndices) = self.generateReceivers(egoAlterArray, alterArray, alterFieldIndices)

    #Make sure we count receivers for all egos
    #Builtin int replaces numpy.int, an alias of it that was removed in NumPy 1.24
    receiverCounts = numpy.zeros(egoArray.shape[0], int)

    if egoIndicesR.shape[0] != 0:
        #bincount only reaches the largest index present, so trailing egos keep a zero count
        binCount = numpy.bincount(egoIndicesR)
        receiverCounts[0:binCount.shape[0]] = binCount

    #Generate non-receivers
    numContactsIndices = [self.numFriendsIndex, self.numColleaguesIndex, self.numFamilyIndex, self.numAquantancesIndex]
    homophileIndexPairs = [(self.homophileAgeIndex, self.ageIndex), (self.homophileGenderIndex, self.genderIndex)]
    homophileIndexPairs.extend([(self.homophileEducationIndex, self.educationIndex), (self.homophileIncomeIndex, self.incomeIndex)])

    (nonReceiversArray, egoIndicesNR, alterIndicesNR) = self.generateNonReceivers(egoArray, numContactsIndices, homophileIndexPairs, receiverCounts)

    #Now, we generate all pairs of senders/non-senders and receivers/non-receivers
    numNonReceivers = nonReceiversArray.shape[0]
    numReceivers = receiversArray.shape[0]
    numExamples = numNonReceivers + numReceivers
    numPersonFeatures = egoArray.shape[1]
    numFeatures = numPersonFeatures*2

    X = numpy.zeros((numExamples, numFeatures))
    y = numpy.zeros(numExamples, numpy.int32)

    #Non-receiver pairs occupy the first rows and are labelled -1
    for i in range(numNonReceivers):
        X[i, 0:numPersonFeatures] = egoArray[egoIndicesNR[i], :]
        X[i, numPersonFeatures:numFeatures] = nonReceiversArray[i, :]
        y[i] = -1

    #Receiver pairs follow and are labelled 1
    for j in range(numReceivers):
        i = numNonReceivers + j
        X[i, 0:numPersonFeatures] = egoArray[egoIndicesR[j], :]
        X[i, numPersonFeatures:numFeatures] = receiversArray[j, :]
        y[i] = 1

    examplesList = ExamplesList(numExamples)
    examplesList.addDataField("X", X)
    examplesList.addDataField("y", y)
    examplesList.setDefaultExamplesName("X")
    examplesList.setLabelsName("y")

    return examplesList, egoIndicesR, alterIndices, egoIndicesNR, alterIndicesNR
def checkDistributions(): matFileName = "../../data/EgoAlterTransmissions.mat" examplesList = ExamplesList.readFromMatFile(matFileName) numFeatures = examplesList.getDataFieldSize("X", 1) X = examplesList.getDataField("X")[:, 0:numFeatures/2] Z = examplesList.getDataField("X")[:, numFeatures/2:numFeatures] y = examplesList.getDataField("y") A = Z[y==-1, :] #Now load directly from the CSV file #Learn the distribution of the egos eCsvReader = EgoCsvReader() egoFileName = "../../data/EgoData.csv" alterFileName = "../../data/AlterData.csv" egoQuestionIds = eCsvReader.getEgoQuestionIds() alterQuestionIds = eCsvReader.getAlterQuestionIds() (X2, titles) = eCsvReader.readFile(egoFileName, egoQuestionIds) X2[:, eCsvReader.ageIndex] = eCsvReader.ageToCategories(X2[:, eCsvReader.ageIndex]) (mu, sigma) = Util.computeMeanVar(X) (mu2, sigma2) = Util.computeMeanVar(X2) (mu3, sigma3) = Util.computeMeanVar(Z) (mu4, sigma4) = Util.computeMeanVar(A) #Seems okay. Next check alters print(("Mean " + str(mu - mu4))) print(("Variance " + str(numpy.diag(sigma - sigma4)))) """ Analysis between the Egos in EgoData.csv and those in EgoAlterTransmissions.mat reveals that the distributions match closely. The main differences are in the means and variances in Q44A - D, but this isn't too suprising. """ """
def testAdvanceGraph3(self):
    """
    This test will learn from a set of ego and alter pairs, then we will make
    predictions on the pairs and see the results. Then we test if the same
    results are present in a simulation.
    """
    dataDir = PathDefaults.getDataDir() + "infoDiffusion/"
    matFileName = dataDir + "EgoAlterTransmissions1000.mat"
    examplesList = ExamplesList.readFromMatFile(matFileName)
    examplesList.setDefaultExamplesName("X")
    examplesList.setLabelsName("y")

    logging.debug(("Number of y = +1: " + str(sum(examplesList.getSampledDataField("y") == 1))))
    logging.debug(("Number of y = -1: " + str(sum(examplesList.getSampledDataField("y") == -1))))

    #Standardise the examples
    preprocessor = Standardiser()
    X = examplesList.getDataField(examplesList.getDefaultExamplesName())
    X = preprocessor.standardiseArray(X)
    examplesList.overwriteDataField(examplesList.getDefaultExamplesName(), X)

    classifier = MlpySVM(kernel='linear', kp=1, C=32.0)

    y = examplesList.getDataField("y")
    classifier.learnModel(X, y)
    predY = classifier.classify(X)
    numPredictedTransfers = sum(predY == 1)

    logging.debug(("Number of y = +1: " + str(sum(examplesList.getSampledDataField("y") == 1))))
    logging.debug(("Number of y = -1: " + str(sum(examplesList.getSampledDataField("y") == -1))))

    sampledY = examplesList.getSampledDataField(examplesList.getLabelsName()).ravel()

    error = mlpy.err(sampledY, predY)
    sensitivity = mlpy.sens(sampledY, predY)
    specificity = mlpy.spec(sampledY, predY)
    errorP = mlpy.errp(sampledY, predY)
    errorN = mlpy.errn(sampledY, predY)

    logging.debug("--- Classification evaluation ---")
    logging.debug(("Error on " + str(examplesList.getNumExamples()) + " examples is " + str(error)))
    logging.debug(("Sensitivity (recall = TP/(TP+FN)): " + str(sensitivity)))
    logging.debug(("Specificity (TN/TN+FP): " + str(specificity)))
    logging.debug(("Error on positives: " + str(errorP)))
    logging.debug(("Error on negatives: " + str(errorN)))

    sGraph = EgoUtils.graphFromMatFile(matFileName)

    #Notice that the data is preprocessed in the same way as the survey data
    egoSimulator = EgoSimulator(sGraph, classifier, preprocessor)

    totalInfo = EgoUtils.getTotalInformation(sGraph)
    logging.debug(("Total number of people with information: " + str(totalInfo)))
    #assertEqual replaces the assertEquals alias, which was removed in Python 3.12
    self.assertEqual(totalInfo, 1000)

    sGraph = egoSimulator.advanceGraph()

    #After one step the total should grow by the number of pairs predicted as +1
    totalInfo = EgoUtils.getTotalInformation(sGraph)
    logging.debug(("Total number of people with information: " + str(totalInfo)))
    self.assertEqual(totalInfo, 1000 + numPredictedTransfers)

    #Example i's alter is expected at vertex 2i+1
    altersList = egoSimulator.getAlters(0)
    predictedAlters = numpy.nonzero(predY == 1)[0]

    self.assertTrue((altersList == predictedAlters*2+1).all())