Ejemplo n.º 1
0
    def testReplaceMissingValues(self):
        """
        Check replaceMissingValues on a small 3x2 integer array.

        The expected output is the input with the 0 entry of the first
        column replaced by 3; all other entries are unchanged.
        """
        reader = EgoCsvReader()

        original = numpy.array([[1,2], [5,6], [0, 3]])
        replaced = reader.replaceMissingValues(original)

        expected = numpy.array([[1,2], [5,6], [3, 3]])
        matches = (replaced == expected)
        self.assertTrue(matches.all())
Ejemplo n.º 2
0
    def testReplaceMissingValues2(self):
        """
        Check replaceMissingValues2 on a small 3x2 integer array.

        numpy's global random generator is seeded with 0 before the call;
        the expected output is the input with the 0 entry of the first
        column replaced by 10.
        """
        reader = EgoCsvReader()

        # Fix the seed so the expected values below are reproducible
        numpy.random.seed(0)
        original = numpy.array([[1,2], [5,6], [0, 3]])
        replaced = reader.replaceMissingValues2(original)

        expected = numpy.array([[1,2], [5,6], [10, 3]])
        matches = (replaced == expected)
        self.assertTrue(matches.all())
Ejemplo n.º 3
0
    def testGenerateReceivers(self):
        """
        Check generateReceivers against a hand-built ego/alter example.

        Two ego rows carry 11-field alter descriptions in columns 0:11 and
        11:22. The alter array has 13 fields per row, the first 11 of which
        are passed as alterFieldIndices. The test checks the returned alter
        rows, ego indices and alter indices, first for three descriptions
        and then again after a fourth description is added to the second ego.
        """
        reader = EgoCsvReader()

        numAlters = 10
        numPartialFields = 11
        alterFields = numPartialFields + 2
        alterFieldIndices = list(range(numPartialFields))

        firstSlot = slice(0, 11)
        secondSlot = slice(11, 22)

        egoAlterArray = numpy.zeros((2, 45))
        egoAlterArray[0, firstSlot] = [2, 12, 1, 0, 0, 0, 0, 0, 0, 0, 1]
        egoAlterArray[0, secondSlot] = [1, 11, 1, 0, 0, 0, 0, 0, 0, 0, 1]
        egoAlterArray[1, firstSlot] = [5, 6, 1, 0, 0, 0, 0, 0, 0, 0, 3]

        # Only rows 0, 1, 2 and 4 of the alter array are populated; the rest stay zero
        knownAlters = {
            0: [2, 12, 1, 0, 0, 0, 0, 0, 0, 0, 1, 12, 13],
            1: [1, 11, 1, 0, 0, 0, 0, 0, 0, 0, 1, 84, 12],
            2: [2, 12, 1, 0, 0, 0, 1, 0, 0, 0, 0, 4, 9],
            4: [5, 6, 1, 0, 0, 0, 0, 0, 0, 0, 3, 34, 12],
        }
        alterArray = numpy.zeros((numAlters, alterFields))
        for rowInd, fields in knownAlters.items():
            alterArray[rowInd, :] = fields

        # Expected rows are copies of alter rows 0, 1 and 4, in that order
        expectedAlters = alterArray[[0, 1, 4], :]

        (foundAlters, egoIndices, alterIndices) = reader.generateReceivers(egoAlterArray, alterArray, alterFieldIndices)

        self.assertTrue((egoIndices == numpy.array([0,0,1])).all())
        self.assertTrue((alterIndices == numpy.array([0,1,4])).all())
        self.assertTrue((foundAlters == expectedAlters).all())

        #2nd test: give the second ego a further description, equal to the first 11 fields of alter row 2
        egoAlterArray[1, secondSlot] = [2, 12, 1, 0, 0, 0, 1, 0, 0, 0, 0]

        (foundAlters, egoIndices, alterIndices) = reader.generateReceivers(egoAlterArray, alterArray, alterFieldIndices)

        expectedAlters = alterArray[[0, 1, 4, 2], :]
        self.assertTrue((egoIndices == numpy.array([0,0,1,1])).all())
        self.assertTrue((alterIndices == numpy.array([0,1,4,2])).all())
        self.assertTrue((foundAlters == expectedAlters).all())
Ejemplo n.º 4
0
    def testReadFile(self):
        """
        Read three question columns from test/TestData.csv and compare the
        resulting array with known values.

        The file is read with question ids Q14, Q12 and Q2 and missing = 1.
        The result must match the 10x3 reference array to within 6 decimal
        places of the norm of the difference.
        """
        eCsvReader = EgoCsvReader()
        # Named dataDir rather than dir so the builtin dir() is not shadowed
        dataDir = PathDefaults.getDataDir()
        fileName = dataDir + "test/TestData.csv"
        questionIds = [("Q14", 0), ("Q12", 1) , ("Q2", 0)]

        missing = 1
        (X, titles) = eCsvReader.readFile(fileName, questionIds, missing)

        # NOTE(review): 0.402390713 repeats in the second column (rows 1, 4, 6, 8);
        # presumably the value substituted for missing entries - confirm
        X2 = numpy.array([
            [0.621903386, 0.608560354, 0.33290608],
            [0.318548924, 0.402390713, 0.129956291],
            [0.956658404, 0.344317772, 0.680386616],
            [0.267607668, 0.119647983, 0.116893619],
            [0.686589498, 0.402390713, 0.426789174],
            [0.373575769, 0.025846789, 0.797125005],
            [0.493793948, 0.402390713, 0.990507109],
            [0.524534585, 0.525169385, 0.772917183],
            [0.339055395, 0.402390713, 0.684788001],
            [0.997774183, 0.790801992, 0.643252009],
        ])

        # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
        # which was removed in Python 3.12
        self.assertAlmostEqual(numpy.linalg.norm(X-X2), 0, places=6)
Ejemplo n.º 5
0
def checkDistributions():
    """
    Print the difference in mean and variance between two sets of feature
    vectors stored in EgoAlterTransmissions.mat.

    The "X" data field is split column-wise into a first half (X) and a
    second half (Z). The rows of Z whose label y equals -1 form A. The
    function prints mean(X) - mean(A) and the diagonal of var(X) - var(A),
    as computed by Util.computeMeanVar. The ego CSV file is also read and
    its statistics computed, but they are not printed.
    """
    matFileName = "../../data/EgoAlterTransmissions.mat"
    examplesList = ExamplesList.readFromMatFile(matFileName)

    numFeatures = examplesList.getDataFieldSize("X", 1)
    # Floor division: a slice bound must be an integer, and "/" yields a
    # float on Python 3, which makes the slices below raise TypeError
    halfFeatures = numFeatures // 2
    X = examplesList.getDataField("X")[:, 0:halfFeatures]
    Z = examplesList.getDataField("X")[:, halfFeatures:numFeatures]
    y = examplesList.getDataField("y")
    A = Z[y==-1, :]

    #Now load directly from the CSV file
    #Learn the distribution of the egos
    eCsvReader = EgoCsvReader()
    egoFileName = "../../data/EgoData.csv"
    alterFileName = "../../data/AlterData.csv"
    egoQuestionIds = eCsvReader.getEgoQuestionIds()
    alterQuestionIds = eCsvReader.getAlterQuestionIds()
    (X2, titles) = eCsvReader.readFile(egoFileName, egoQuestionIds)
    X2[:, eCsvReader.ageIndex] = eCsvReader.ageToCategories(X2[:, eCsvReader.ageIndex])

    (mu, sigma) = Util.computeMeanVar(X)
    (mu2, sigma2) = Util.computeMeanVar(X2)
    (mu3, sigma3) = Util.computeMeanVar(Z)
    (mu4, sigma4) = Util.computeMeanVar(A)

    #Seems okay. Next check alters
    print(("Mean " + str(mu - mu4)))
    print(("Variance " + str(numpy.diag(sigma - sigma4))))

    # Analysis between the Egos in EgoData.csv and those in EgoAlterTransmissions.mat
    # reveals that the distributions match closely. The main differences are
    # in the means and variances in Q44A - D, but this isn't too surprising.

    """
Ejemplo n.º 6
0
 def testCsvRowToVector(self):
     """
     Check csvRowToVector for valid rows and for four error cases.

     A ValueError is expected when the title list is longer or shorter
     than the row, and for the two question id lists questionIdsError1
     and questionIdsError2 below. The two valid rows must map to the
     vectors [5, 1, 0] and [4, 2, 1] for question ids B, A, E.
     """
     reader = EgoCsvReader()
     convert = reader.csvRowToVector

     rowWithBlank = ["1", "5", "2", "12", ""]
     fullRow = ["2", "4", "8", "2", "1"]
     titles = ["A", "B", "C", "D", "E"]
     tooManyTitles = ["A", "B", "C", "D", "E" ,"F"]
     tooFewTitles = ["A", "B", "C"]

     questionIds = [("B", 0), ("A", 0), ("E", 1)]
     questionIdsError1 = [("B", 0), ("A", 0), ("E", 0)]
     questionIdsError2 = [("B", 0), ("A", 0), ("Z", 1)]

     # Title lists whose length differs from the row length
     for badTitles in (tooManyTitles, tooFewTitles):
         self.assertRaises(ValueError, convert, rowWithBlank, questionIds, badTitles)

     # Valid conversions: columns are picked in question id order (B, A, E)
     vector = convert(rowWithBlank, questionIds, titles)
     self.assertTrue((vector == numpy.array([5, 1, 0])).all())

     vector = convert(fullRow, questionIds, titles)
     self.assertTrue((vector == numpy.array([4, 2, 1])).all())

     # Question id lists that must be rejected for the row with a blank field
     for badQuestionIds in (questionIdsError1, questionIdsError2):
         self.assertRaises(ValueError, convert, rowWithBlank, badQuestionIds, titles)
Ejemplo n.º 7
0
# Train the SVM ego simulator with parameters loaded from the "Linear.mat"
# parameter file, then print its weights, the 20 largest-magnitude weights
# as "centre value & question id & weight \\" rows, and finally the bias b.
examplesFileName = SvmInfoExperiment.getExamplesFileName()
sampleSize = 86755

svmEgoSimulator = SvmEgoSimulator(examplesFileName)
preprocessor = svmEgoSimulator.getPreProcessor()
centerValues = preprocessor.getCentreVector()

svmParamsFileName = SvmInfoExperiment.getSvmParamsFileName() + "Linear.mat"
logging.info("Using SVM params from file " + svmParamsFileName)

(C, kernel, kernelParamVal, errorCost) = SvmInfoExperiment.loadSvmParams(svmParamsFileName)
svmEgoSimulator.trainClassifier(C, kernel, kernelParamVal, errorCost, sampleSize)

(weights, b) = svmEgoSimulator.getWeights()

numpy.set_printoptions(precision=3)

# Rank the weights by absolute value, largest first; the ascending order is
# computed once and reversed for both the indices and the values
ascendingOrder = numpy.argsort(abs(weights))
sortedWeightsInds = numpy.flipud(ascendingOrder)
sortedWeights = numpy.flipud(weights[ascendingOrder])

# Question ids of the egos followed by those of the alters
egoCsvReader = EgoCsvReader()
questionIds = egoCsvReader.getEgoQuestionIds()
questionIds.extend(egoCsvReader.getAlterQuestionIds())

print(weights)
numRankedItems = 20

for i in range(numRankedItems):
    featureInd = sortedWeightsInds[i]
    rowCells = [str(centerValues[featureInd]), questionIds[featureInd][0], "%.3f" % sortedWeights[i]]
    # Cells are separated by " & " and the row ends with a double backslash
    print(" & ".join(rowCells) + "\\\\")
print(b)
Ejemplo n.º 8
0
#A second set of parameters
# NOTE(review): numVertices and examplesFileName are not defined in this
# excerpt; they must be set earlier in the script.
# p is 30 divided by the number of vertices (float division is forced);
# presumably the edge probability of the Erdos-Renyi generator - confirm
p = float(30)/numVertices
generator = ErdosRenyiGenerator(p)

simulationRepetitions = 5 
maxIterations = 3
sampleSize = SvmInfoExperiment.getNumSimulationExamples()
#sampleSize = 1000
# Load the stored SVM parameters and train the simulator's classifier with them
svmParamsFile = SvmInfoExperiment.getSvmParamsFileName()
CVal, kernel, kernelParamVal, errorCost = SvmInfoExperiment.loadSvmParams(svmParamsFile)

simulator = SvmEgoSimulator(examplesFileName)
simulator.trainClassifier(CVal, kernel, kernelParamVal, errorCost, sampleSize)

egoCsvReader = EgoCsvReader()

# Local copies of the feature indices exposed by the reader
genderIndex = egoCsvReader.genderIndex
ageIndex = egoCsvReader.ageIndex
incomeIndex = egoCsvReader.incomeIndex
townSizeIndex = egoCsvReader.townSizeIndex
foodRiskIndex = egoCsvReader.foodRiskIndex
experienceIndex = egoCsvReader.experienceIndex
internetFreqIndex = egoCsvReader.internetFreqIndex
peopleAtWorkIndex = egoCsvReader.peopleAtWorkIndex
educationIndex = egoCsvReader.educationIndex

# One slot per profession, initialised to zero
professionIndices = numpy.zeros(egoCsvReader.numProfessions)

for i in range(1, egoCsvReader.numProfessions+1):
    egoQuestionIds = egoCsvReader.getEgoQuestionIds()
Ejemplo n.º 9
0
    """

if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    numpy.set_printoptions(precision=4, suppress=True)
    
    #checkDistributions()

    """
    We will read ego and alters data, and check they have the same values.
    """

    egoFileName = "../../data/EgoData.csv"
    alterFileName = "../../data/AlterData.csv"

    eCsvReader = EgoCsvReader()
    egoQuestionIds = eCsvReader.getEgoQuestionIds()
    alterQuestionIds = eCsvReader.getAlterQuestionIds()

    missing = 0 
    (egoX, titles) = eCsvReader.readFile(egoFileName, egoQuestionIds, missing)
    egoX[:, eCsvReader.ageIndex] = eCsvReader.ageToCategories(egoX[:, eCsvReader.ageIndex])

    (alterX, titles) = eCsvReader.readFile(alterFileName, alterQuestionIds, missing)
    alterX[:, eCsvReader.ageIndex] = eCsvReader.ageToCategories(alterX[:, eCsvReader.ageIndex])

    numFeatures = egoX.shape[1]
    numEgoExamples = egoX.shape[0]
    numAlterExamples = alterX.shape[0]

    for i in range(0, numFeatures):
Ejemplo n.º 10
0
    def testReadFiles(self):
        """
        Check the examples produced by readFiles for EgoData3.csv and
        AlterData10.csv, with p = 0.5 and then with p = 0.

        Each expected row of "X" is the feature vector of one ego followed
        by the feature vector of another ego, and every expected label in
        "y" is -1. The expected rows are described by (ego, contact) index
        pairs into the ego array returned by readFile.
        """
        p = 0.5
        eCsvReader = EgoCsvReader()
        eCsvReader.setP(p)

        dataDir = PathDefaults.getDataDir() + "infoDiffusion/"
        egoFileName = dataDir + "EgoData3.csv"
        alterFileName = dataDir + "AlterData10.csv"
        examplesList, egoIndicesR, alterIndices, egoIndicesNR, alterIndicesNR  = eCsvReader.readFiles(egoFileName, alterFileName)

        #Read in the ego and alter arrays
        (egoArray, _) = eCsvReader.readFile(egoFileName, eCsvReader.getEgoQuestionIds())
        (alterArray, _) = eCsvReader.readFile(alterFileName, eCsvReader.getAlterQuestionIds())

        #Make up the correct results
        numFeatures = examplesList.getDataFieldSize("X", 1)
        #Floor division keeps the slice bounds integers; "/" gives a float
        #on Python 3 and slicing with it raises TypeError
        numPersonFeatures = numFeatures // 2

        def checkExamples(examples, egoPairs):
            #Build the expected X and y from (ego, contact) index pairs and
            #compare them element-wise with the fields of the examples list
            X2 = numpy.zeros((len(egoPairs), numFeatures))
            y2 = numpy.zeros((len(egoPairs), 1))
            y2[:, 0] = -1

            for row, (egoInd, contactInd) in enumerate(egoPairs):
                X2[row, 0:numPersonFeatures] = egoArray[egoInd, :]
                X2[row, numPersonFeatures:numFeatures] = egoArray[contactInd, :]

            self.assertTrue((X2 == examples.getDataField("X")).all())
            self.assertTrue((y2 == examples.getDataField("y")).all())

        #Note: no alters in this case
        checkExamples(examplesList, [(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)])

        #Second test
        #================
        #I modified EgoData3 so that person 2 is the same age as person 1, and
        # hence a homophile of 1. She (2) is excluded from the non-receivers, since
        #she is a homophile of person 1.

        p = 0
        eCsvReader = EgoCsvReader()
        eCsvReader.setP(p)

        examplesList, egoIndicesR, alterIndices, egoIndicesNR, alterIndicesNR  = eCsvReader.readFiles(egoFileName, alterFileName)

        checkExamples(examplesList, [(0, 2), (1, 0), (1, 2), (2, 0), (2, 1)])
Ejemplo n.º 11
0
    def testGenerateNonReceivers(self):
        """
        Run generateNonReceivers on several hand-built ego arrays and compare
        the returned contacts array and ego indices with expected values.

        Columns 0 and 1 are passed as numContactsIndices and the homophile
        index pairs are (2,3), later extended with (4,5). The returned alter
        indices are not checked.
        """
        eCsvReader = EgoCsvReader()
        eCsvReader.setP(1)

        numContactsIndices = [0, 1]
        homophileIndexPairs = [(2,3)]

        def checkNonReceivers(egos, pairs, counts, wantedContacts, wantedEgoIndices):
            #Call the reader and compare its first two outputs element-wise
            (contacts, senders, _) = eCsvReader.generateNonReceivers(egos, numContactsIndices, pairs, counts)
            self.assertTrue((contacts == numpy.array(wantedContacts, dtype=float)).all())
            self.assertTrue((senders == numpy.array(wantedEgoIndices, dtype=float)).all())

        #First test a very simple example with 1 homophile pair
        egoArray = numpy.array([
            [0, 1, 1, 5, 4],
            [0, 1, 1, 5, 8],
            [0, 0, 1, 3, 6],
        ], dtype=float)
        receiverCounts = numpy.zeros(egoArray.shape[0])

        checkNonReceivers(
            egoArray, homophileIndexPairs, receiverCounts,
            [[0, 1, 1, 5, 8],
             [0, 0, 1, 3, 6],
             [0, 1, 1, 5, 4],
             [0, 0, 1, 3, 6]],
            [0, 0, 1, 1])

        #Test the case when there are some receivers already
        receiverCounts = numpy.array([1,1,1])

        checkNonReceivers(
            egoArray, homophileIndexPairs, receiverCounts,
            [[0, 1, 1, 5, 8],
             [0, 1, 1, 5, 4]],
            [0, 1])

        #A more complex example
        egoArray = numpy.array([
            [1, 1, 1, 5, 4],
            [0, 0, 1, 5, 8],
            [0, 0, 1, 3, 6],
            [0, 0, 1, 5, 1],
            [0, 0, 1, 5, 2],
            [0, 0, 1, 5, 3],
        ], dtype=float)
        receiverCounts = numpy.zeros(egoArray.shape[0])

        checkNonReceivers(
            egoArray, homophileIndexPairs, receiverCounts,
            [[0, 0, 1, 5, 8],
             [0, 0, 1, 5, 1],
             [0, 0, 1, 5, 2],
             [0, 0, 1, 5, 3]],
            [0, 0, 0, 0])

        #Test picking non-homophiles
        egoArray[0, :] = [2, 1, 2, 5, 4]

        checkNonReceivers(
            egoArray, homophileIndexPairs, receiverCounts,
            [[0, 0, 1, 5, 8],
             [0, 0, 1, 3, 6],
             [0, 0, 1, 5, 1],
             [0, 0, 1, 5, 2],
             [0, 0, 1, 5, 3]],
            [0, 0, 0, 0, 0])

        #Choose different p
        eCsvReader.setP(0.5)

        #Overwrite all six rows of the same array in place
        egoArray[:, :] = [
            [1, 1, 1, 5, 4],
            [0, 0, 1, 5, 8],
            [0, 0, 1, 3, 6],
            [0, 0, 1, 5, 8],
            [0, 0, 1, 5, 8],
            [0, 0, 1, 5, 8],
        ]

        checkNonReceivers(
            egoArray, homophileIndexPairs, receiverCounts,
            [[0, 0, 1, 5, 8],
             [0, 0, 1, 5, 8],
             [0, 0, 1, 3, 6]],
            [0, 0, 0])

        #Test 2 different homophile fields
        eCsvReader.setP(1)
        homophileIndexPairs = [(2,3), (4,5)]

        #The zero receiver counts from the six-ego examples above are reused here
        egoArray = numpy.array([
            [0, 0, 1, 5, 1, 2, 1],
            [0, 0, 1, 5, 1, 3, 2],
            [0, 0, 1, 4, 1, 2, 3],
            [1, 0, 1, 5, 1, 2, 4],
            [0, 0, 1, 2, 1, 1, 5],
            [0, 0, 1, 5, 1, 2, 6],
        ], dtype=float)

        checkNonReceivers(
            egoArray, homophileIndexPairs, receiverCounts,
            [[0, 0, 1, 5, 1, 2, 1],
             [0, 0, 1, 5, 1, 2, 6]],
            [3, 3])
Ejemplo n.º 12
0
    def testInit(self):
        """
        Check that a newly constructed EgoCsvReader reports 62 ego question
        ids and 62 alter question ids.
        """
        eCsv = EgoCsvReader()

        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12
        self.assertEqual(len(eCsv.getEgoQuestionIds()), 62)
        self.assertEqual(len(eCsv.getAlterQuestionIds()), 62)