Exemple #1
0
    def __init__(self, field):
        """
        Prepare file locations for the DBLP reputation experiment in the given
        research field, then clean the DBLP XML and match the expert list.

        :param field: name of the field subdirectory under the reputation
            results directory.
        """
        numpy.random.seed(21)        
        
        # Raw DBLP XML dump and the cleaned copy produced by cleanXML().
        dataDir = PathDefaults.getDataDir() + "dblp/"
        self.xmlFileName = dataDir + "dblp.xml"
        self.xmlCleanFilename = dataDir + "dblpClean.xml"        

        # Per-field outputs: expert lists, matches, train/test splits, coauthors.
        resultsDir = PathDefaults.getDataDir() + "reputation/" + field + "/"
        self.expertsFileName = resultsDir + "experts.txt"
        self.expertMatchesFilename = resultsDir + "experts_matches.csv"
        self.trainExpertMatchesFilename = resultsDir + "experts_train_matches.csv"
        self.testExpertMatchesFilename = resultsDir + "experts_test_matches.csv"
        self.coauthorsFilename = resultsDir + "coauthors.csv"
        self.publicationsFilename = resultsDir + "publications.csv"
        
        # stepSize: lines processed per chunk; numLines: presumably the total
        # line count of dblp.xml — TODO confirm against the actual dump.
        self.stepSize = 100000
        self.numLines = 33532888
        self.publicationTypes = set(["article" , "inproceedings", "proceedings", "book", "incollection", "phdthesis", "mastersthesis", "www"])
        # p: presumably the train/test split proportion for matches — verify.
        self.p = 0.5     
        self.matchCutoff = 0.95
        
        
        # Side effects: cleans the XML and matches experts on construction.
        self.cleanXML()
        self.matchExperts()
        logging.warning("Now you must disambiguate the matched experts if not ready done")        
def processSimpleDataset(name, numRealisations, split, ext=".csv", delimiter=",", usecols=None, skiprows=1, converters=None):
    """
    Load a regression dataset whose last column is the target, generate
    shuffled train/test splits and save the preprocessed results.

    :param name: dataset name (file stem under modelPenalisation/regression/).
    :param numRealisations: number of random splits to generate.
    :param split: proportion used by Sampling.shuffleSplit.
    :param ext: input file extension.
    :param delimiter, usecols, skiprows, converters: passed to numpy.loadtxt.
    """
    numpy.random.seed(21)

    baseDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    inputPath = baseDir + name + ext
    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "/"
    print("Loading data from file " + inputPath)

    dataMatrix = numpy.loadtxt(inputPath, delimiter=delimiter, skiprows=skiprows, usecols=usecols, converters=converters)
    # Features are every column but the last; the last column is the target.
    X, y = dataMatrix[:, :-1], dataMatrix[:, -1]
    splitIndices = Sampling.shuffleSplit(numRealisations, X.shape[0], split)
    preprocessSave(X, y, outputDir, splitIndices)
    def testGenerateRandomGraph(self):
        """Smoke test: generateRandomGraph accepts a freshly built small-world graph."""
        egoFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoData.csv"
        alterFileName = PathDefaults.getDataDir() + "infoDiffusion/AlterData.csv"

        numVertices = 1000
        infoProb = 0.1
        rewireProb = 0.1
        numNeighbours = 10

        # Build a small-world graph over empty (zero-feature) vertices.
        swGenerator = SmallWorldGenerator(rewireProb, numNeighbours)
        graph = swGenerator.generate(SparseGraph(VertexList(numVertices, 0)))

        self.svmEgoSimulator.generateRandomGraph(egoFileName, alterFileName, infoProb, graph)
 def __init__(self):
     """Define metabolomic label names, the data directory and hormone bounds."""
     self.labelNames = ["Cortisol.val", "Testosterone.val", "IGF1.val"]
     self.dataDir = PathDefaults.getDataDir() + "metabolomic/"
     # Bin boundaries used to discretise each hormone level into indicators.
     self.boundsDict = {
         "Cortisol": numpy.array([0, 89, 225, 573]),
         "Testosterone": numpy.array([0, 3, 9, 13]),
         "IGF1": numpy.array([0, 200, 441, 782]),
     }
    def testComputeIdealPenalty(self):
        """
        Compare the ideal penalty computed from the known density grids with
        the penalty estimated directly from a held-out sample; they should
        agree to about 2 decimal places.
        """
        dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
        data = numpy.load(dataDir + "toyData.npz")
        gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

        sampleSize = 100
        trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
        testX, testY = X[sampleSize:, :], y[sampleSize:]

        #We form a test set from the grid points
        # fullX enumerates every (x1, x2) pair of the 2D grid, column by column.
        fullX = numpy.zeros((gridPoints.shape[0]**2, 2))
        for m in range(gridPoints.shape[0]):
            fullX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
            fullX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

        C = 1.0
        gamma = 1.0
        args = (trainX, trainY, fullX, C, gamma, gridPoints, pdfX, pdfY1X, pdfYminus1X)
        penalty = computeIdealPenalty(args)


        #Now compute penalty using data
        args = (trainX, trainY, testX, testY, C, gamma)
        penalty2 = computeIdealPenalty2(args)

        self.assertAlmostEquals(penalty2, penalty, 2)
Exemple #6
0
 def main(argv=None):
     """
     Command-line entry point: parse the options (-h/--help, -d/--dir,
     -n/--nb_user, -D/--debug) then generate the Bemol data files.

     :param argv: argument vector; defaults to sys.argv.
     :return: 0 after printing help, 2 on a usage error, None on success.
     """
     if argv is None:
         argv = sys.argv
     try:
         # read options
         try:
             # Bug fix: the long-option list was corrupted in the original
             # source ('"nb_user="******"debug"' is a syntax error); restore
             # the intended ["help", "dir=", "nb_user=", "debug"].
             opts, args = getopt.getopt(argv[1:], "hd:n:D", ["help", "dir=", "nb_user=", "debug"])
         except getopt.error as msg:
             raise RGUsage(msg)
         # apply options (local renamed from "dir" to avoid shadowing the builtin)
         dataDir = PathDefaults.getDataDir() + "cluster/"
         nb_user = None
         log_level = logging.INFO
         for o, a in opts:
             if o in ("-h", "--help"):
                 print(__doc__)
                 return 0
             elif o in ("-d", "--dir"):
                 dataDir = a
             elif o in ("-n", "--nb_user"):
                 nb_user = int(a)
             elif o in ("-D", "--debug"):
                 log_level = logging.DEBUG
         logging.basicConfig(stream=sys.stdout, level=log_level, format='%(levelname)s (%(asctime)s):%(message)s')
         # process: generate data files
         BemolData.generate_data_file(dataDir, nb_user)
     except RGUsage as err:
         logging.error(err.msg)
         logging.error("for help use --help")
         return 2
    def testToyData(self):
        """
        Sanity-check the toy density grids: p(x) should integrate to 1 over
        the grid, as should the joint p(x)p(y=1|x) + p(x)p(y=-1|x).
        """
        dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
        data = numpy.load(dataDir + "toyData.npz")
        gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]


        pxSum = 0
        pY1XSum = 0
        pYminus1XSum = 0

        px2Sum = 0 
        # Area of one grid cell; assumes a uniformly spaced grid.
        squareArea = (gridPoints[1]-gridPoints[0])**2

        # Approximate each 2D integral with a trapezoid-style rule: average the
        # four corners of every cell and multiply by the cell area.
        for i in range(gridPoints.shape[0]-1):
            for j in range(gridPoints.shape[0]-1):
                px = (pdfX[i,j]+pdfX[i+1,j]+pdfX[i, j+1]+pdfX[i+1, j+1])/4
                pxSum += px*squareArea

                pY1X = (pdfY1X[i,j]+pdfY1X[i+1,j]+pdfY1X[i, j+1]+pdfY1X[i+1, j+1])/4
                pY1XSum += pY1X*squareArea

                pYminus1X = (pdfYminus1X[i,j]+pdfYminus1X[i+1,j]+pdfYminus1X[i, j+1]+pdfYminus1X[i+1, j+1])/4
                pYminus1XSum += pYminus1X*squareArea

                px2Sum += px*pY1X*squareArea + px*pYminus1X*squareArea

        self.assertAlmostEquals(pxSum, 1)
        print(pY1XSum)
        print(pYminus1XSum)

        self.assertAlmostEquals(px2Sum, 1)
Exemple #8
0
    def testPredict2(self):
        """
        Train TreeRank at increasing max depths on the Gauss2D data and check
        the train/test AUCs against previously recorded values.
        """
        #Test on Gauss2D dataset
        dataDir = PathDefaults.getDataDir()

        fileName = dataDir + "Gauss2D_learn.csv"
        XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        X = XY[:, 0:2]
        y = XY[:, 2]

        fileName = dataDir + "Gauss2D_test.csv"
        testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2]

        X = Standardiser().standardiseArray(X)
        testX = Standardiser().standardiseArray(testX)

        # AUCs recorded from a previous run, one per depth in maxDepths.
        maxDepths = range(3, 10)
        trainAucs = numpy.array([0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508])
        testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])
        i = 0
        
        #The results are approximately the same, but not exactly 
        for maxDepth in maxDepths:
            treeRank = TreeRank(self.leafRanklearner)
            treeRank.setMaxDepth(maxDepth)
            treeRank.learnModel(X, y)
            trainScores = treeRank.predict(X)
            testScores = treeRank.predict(testX)

            self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 2)
            self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
            i+=1 
    def testEdgeFile(self):
        """
        Figure out the problem with the edge file: count distinct edges and
        vertices in Cit-HepTh.txt and compare with the paper's figures.
        """
        dataDir = PathDefaults.getDataDir() + "cluster/"
        edgesFilename = dataDir + "Cit-HepTh.txt"

        edges = {}
        vertices = {}

        # Use a context manager so the handle is always closed (the original
        # leaked it), and avoid naming the variable "file" (builtin shadow).
        with open(edgesFilename, 'r') as edgesFile:
            # Skip the four header lines of the SNAP edge-list format.
            for _ in range(4):
                edgesFile.readline()

            for line in edgesFile:
                (vertex1, sep, vertex2) = line.partition("\t")
                vertex1 = vertex1.strip()
                vertex2 = vertex2.strip()
                edges[(vertex1, vertex2)] = 0
                vertices[vertex1] = 0
                vertices[vertex2] = 0

        #It says there are 352807 edges in paper and 27770 vertices
        self.assertEquals(len(edges), 352807)
        self.assertEquals(len(vertices), 27770)
Exemple #10
0
 def testGraphFromMatFile(self):
     """
     Load the ego/alter transmission graph from a .mat file and check it is
     consistent with the examples list read from the same file.
     """
     matFileName = PathDefaults.getDataDir() +  "infoDiffusion/EgoAlterTransmissions1000.mat"
     sGraph = EgoUtils.graphFromMatFile(matFileName)
     
     examplesList = ExamplesList.readFromMatFile(matFileName)
     numFeatures = examplesList.getDataFieldSize("X", 1)
     
     # One example per edge; each example contributes an ego and an alter vertex.
     self.assertEquals(examplesList.getNumExamples(), sGraph.getNumEdges())
     self.assertEquals(examplesList.getNumExamples()*2, sGraph.getNumVertices())
     # Bug fix: use integer division throughout — under Python 3 numFeatures/2
     # is a float and cannot be used as a slice index below.
     self.assertEquals(numFeatures//2+1, sGraph.getVertexList().getNumFeatures())
     
     #Every even vertex has information, odd does not 
     for i in range(0, sGraph.getNumVertices()): 
         vertex = sGraph.getVertex(i)
         
         if i%2 == 0: 
             self.assertEquals(vertex[sGraph.getVertexList().getNumFeatures()-1], 1)
         else: 
             self.assertEquals(vertex[sGraph.getVertexList().getNumFeatures()-1], 0)
             
     #Test the first few vertices are the same 
     for i in range(0, 10): 
         vertex1 = sGraph.getVertex(i*2)[0:numFeatures//2]
         vertex2 = sGraph.getVertex(i*2+1)[0:numFeatures//2]
         # The first half of each example row is the ego, the second the alter.
         vertexEx1 = examplesList.getSubDataField("X", numpy.array([i])).ravel()[0:numFeatures//2]
         vertexEx2 = examplesList.getSubDataField("X", numpy.array([i])).ravel()[numFeatures//2:numFeatures]
         
         self.assertTrue((vertex1 == vertexEx1).all())
         self.assertTrue((vertex2 == vertexEx2).all())
    def __init__(self, maxIter=None, iterStartTimeStamp=None): 
        """
        Set up file locations for the Erasm group-membership ratings and build
        the processed dataset (ratings matrix, id dictionaries, train split).

        :param maxIter: maximum number of iterations, or None for all.
        :param iterStartTimeStamp: starting timestamp of the iterator; when
            None, defaults to 1286229600.
        """
        outputDir = PathDefaults.getOutputDir() + "recommend/erasm/"

        if not os.path.exists(outputDir): 
            os.mkdir(outputDir)
            
        #iterStartDate is the starting date of the iterator 
        # Idiom fix: compare against None with "is not" rather than "!=".
        if iterStartTimeStamp is not None: 
            self.iterStartTimeStamp = iterStartTimeStamp
        else: 
            self.iterStartTimeStamp = 1286229600
            
        # Each iteration advances by a 30-day window, expressed in seconds.
        self.timeStep = timedelta(30).total_seconds()             
                
        self.ratingFileName = outputDir + "data.npz"          
        self.userDictFileName = outputDir + "userIdDict.pkl"   
        self.groupDictFileName = outputDir + "groupIdDict.pkl" 
        self.isTrainRatingsFileName = outputDir + "is_train.npz"
    
        self.dataDir = PathDefaults.getDataDir() + "erasm/"
        self.dataFileName = self.dataDir + "groupMembers-29-11-12" 
        
        self.maxIter = maxIter 
        self.trainSplit = 4.0/5 
        
        # Side effects: processes, splits and loads the dataset on construction.
        self.processRatings()
        self.splitDataset()        
        self.loadProcessedData()
Exemple #12
0
    def processRatings(self): 
        """
        Convert the dataset into a matrix and save the results for faster 
        access. 

        Reads each per-movie ratings file, maps raw customer ids to contiguous
        indices, and saves four parallel arrays (movie ids, customer indices,
        ratings, timestamps) plus the customer-id dictionary.  Skips all work
        when the output files already exist.
        """
        if not os.path.exists(self.ratingFileName) or not os.path.exists(self.custDictFileName): 
            dataDir = PathDefaults.getDataDir() + "netflix/training_set/"

            logging.debug("Processing ratings given in " + dataDir)

            # custIdDict maps raw customer ids to contiguous indices 0..j-1;
            # custIdSet mirrors its keys for fast membership tests.
            custIdDict = {} 
            custIdSet = set([])        
            
            # Compact typed arrays: ~100M ratings would not fit as Python lists.
            movieIds = array.array("I")
            custIds = array.array("I")
            ratings = array.array("B")
            dates = array.array("L")
            j = 0
            
            for i in range(self.startMovieID, self.endMovieID+1): 
                Util.printIteration(i-1, 1, self.endMovieID-1)
                ratingsFile = open(dataDir + "mv_" + str(i).zfill(7) + ".txt")
                # Skip the first line (presumably the movie-id header — verify).
                ratingsFile.readline()
                
                for line in ratingsFile: 
                    vals = line.split(",")
                    
                    custId = int(vals[0])
                    
                    if custId not in custIdSet: 
                        custIdSet.add(custId)
                        custIdDict[custId] = j
                        custInd = j 
                        j += 1 
                    else: 
                        custInd = custIdDict[custId]
                    
                    rating = int(vals[1])     
                    t = datetime.strptime(vals[2].strip(), "%Y-%m-%d")
                
                    # Movie ids are stored zero-based.
                    movieIds.append(i-1)
                    custIds.append(custInd)   
                    ratings.append(rating)
                    dates.append(int(time.mktime(t.timetuple()))) 
                    
            movieIds = numpy.array(movieIds, numpy.uint32)
            custIds = numpy.array(custIds, numpy.uint32)
            ratings = numpy.array(ratings, numpy.uint8)
            dates = numpy.array(dates, numpy.uint32)
            
            assert ratings.shape[0] == self.numRatings            
            
            numpy.savez(self.ratingFileName, movieIds, custIds, ratings, dates) 
            logging.debug("Saved ratings file as " + self.ratingFileName)
            
            pickle.dump(custIdDict, open(self.custDictFileName, 'wb'))
            logging.debug("Saved custIdDict as " + self.custDictFileName)
        else: 
            logging.debug("Ratings file " + str(self.ratingFileName) + " already processed")
    def loadData():
        """
        Return the raw spectra and the MDS transformed data as well as the DataFrame
        for the MDS data. 

        Returns the tuple (X, X2, Xs, (Xopls1, Xopls2, Xopls3), YList, ages, df)
        where X is the raw NMR data, Xs its standardised copy, X2 the second
        (sportsmen) dataset, the Xopls* arrays the OPLS/OSC-corrected data, and
        df the last R data frame read.
        """
        utilsLib = importr('utils')

        dataDir = PathDefaults.getDataDir() +  "metabolomic/"
        fileName = dataDir + "data.RMN.total.6.txt"
        df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
        # Columns 1..maxNMRIndex-1 hold the NMR spectrum (R indexing is 1-based).
        maxNMRIndex = 951
        X = df.rx(robjects.IntVector(range(1, maxNMRIndex)))
        X = numpy.array(X).T

        #Load age and normalise (missing values are assinged the mean) 
        ages = numpy.array(df.rx(robjects.StrVector(["Age"]))).ravel()
        meanAge = numpy.mean(ages[numpy.logical_not(numpy.isnan(ages))])
        ages[numpy.isnan(ages)] = meanAge
        ages = Standardiser().standardiseArray(ages)

        Xs = X.copy()
        standardiser = Standardiser()
        Xs = standardiser.standardiseArray(X)

        fileName = dataDir + "data.sportsmen.log.AP.1.txt"
        df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
        maxNMRIndex = 419
        X2 = df.rx(robjects.IntVector(range(1, maxNMRIndex)))
        X2 = numpy.array(X2).T

        #Load the OPLS corrected files
        fileName = dataDir + "IGF1.log.OSC.1.txt"
        df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
        minNMRIndex = 22
        maxNMRIndex = 441
        Xopls1 = df.rx(robjects.IntVector(range(minNMRIndex, maxNMRIndex)))
        Xopls1 = numpy.array(Xopls1).T

        fileName = dataDir + "cort.log.OSC.1.txt"
        df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
        minNMRIndex = 20
        maxNMRIndex = 439
        Xopls2 = df.rx(robjects.IntVector(range(minNMRIndex, maxNMRIndex)))
        Xopls2 = numpy.array(Xopls2).T

        fileName = dataDir + "testo.log.OSC.1.txt"
        df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
        minNMRIndex = 22
        maxNMRIndex = 441
        Xopls3 = df.rx(robjects.IntVector(range(minNMRIndex, maxNMRIndex)))
        Xopls3 = numpy.array(Xopls3).T

        #Let's load all the label data here
        # NOTE(review): labels are taken from the LAST df read (testo file) —
        # confirm the label columns are present in every input file.
        labelNames = MetabolomicsUtils.getLabelNames()
        YList = MetabolomicsUtils.createLabelList(df, labelNames)
        
        return X, X2, Xs, (Xopls1, Xopls2, Xopls3), YList, ages, df
Exemple #14
0
 def getIterator(): 
     """Return an islice over the Bemol graph iterations 300..600 (step 1)."""
     dataDir = PathDefaults.getDataDir() + "cluster/"

     numUsers = 10000  # set to 'None' to have all users
     purchasesPerIteration = 500  # set to 'None' to take all the purchases per date
     firstIteration = 300
     lastIteration = 600  # set to 'None' to have all iterations
     step = 1

     graphIterator = BemolData.getGraphIterator(dataDir, numUsers, purchasesPerIteration)
     return itertools.islice(graphIterator, firstIteration, lastIteration, step)
 def profileClusterFromIterator(self):
     """Profile clusterFromIterator over an increasing sequence of subgraphs."""
     iterator = IncreasingSubgraphListIterator(self.graph, self.subgraphIndicesList)
     dataDir = PathDefaults.getDataDir() + "cluster/"
     #iterator = getBemolGraphIterator(dataDir)
     
     # run() is profiled below; it prints the cumulative per-iteration timings.
     def run(): 
         clusterList, timeList, boundList = self.clusterer.clusterFromIterator(iterator, verbose=True)
         print(timeList.cumsum(0))
         
     ProfileUtils.profile('run()', globals(), locals())
Exemple #16
0
    def testReadGraph(self):
        """
        Read three variants of a small example graph and check the vertex and
        edge counts plus individual edge weights for each.
        """
        graphReader = SimpleGraphReader()

        # (file suffix, expected undirectedness, expected edge weights)
        cases = [
            ("test/simpleGraph.txt", True, [((0, 1), 1), ((2, 4), 1), ((2, 2), 1), ((4, 0), 1)]),
            # Same graph but with vertices indexed differently
            ("test/simpleGraph2.txt", True, [((0, 1), 1.1), ((2, 4), 1), ((2, 2), 1.6), ((4, 0), 1)]),
            # A file with directed edges
            ("test/simpleGraph3.txt", False, [((0, 1), 1), ((2, 4), 1), ((2, 2), 1), ((4, 0), 1)]),
        ]

        for caseIndex, (suffix, undirected, edgeWeights) in enumerate(cases):
            graph = graphReader.readFromFile(PathDefaults.getDataDir() + suffix)

            # The original logged the edge list for the first file only.
            if caseIndex == 0:
                logging.debug((graph.getAllEdges()))

            self.assertEquals(graph.isUndirected(), undirected)
            self.assertEquals(graph.getNumVertices(), 5)
            self.assertEquals(graph.getNumEdges(), 4)

            for (vertex1, vertex2), weight in edgeWeights:
                self.assertEquals(graph.getEdge(vertex1, vertex2), weight)
    def testReadGraph(self):
        """
        Read three variants of a small example graph (plain, re-indexed
        vertices, directed) and check counts and edge weights for each.
        """
        fileName = PathDefaults.getDataDir() +  "test/simpleGraph.txt"

        graphReader = SimpleGraphReader()
        graph = graphReader.readFromFile(fileName)

        logging.debug((graph.getAllEdges()))

        self.assertEquals(graph.isUndirected(), True)
        self.assertEquals(graph.getNumVertices(), 5)
        self.assertEquals(graph.getNumEdges(), 4)

        self.assertEquals(graph.getEdge(0, 1), 1)
        self.assertEquals(graph.getEdge(2, 4), 1)
        self.assertEquals(graph.getEdge(2, 2), 1)
        self.assertEquals(graph.getEdge(4, 0), 1)

        #Now test reading a file with the same graph but vertices indexed differently
        fileName = PathDefaults.getDataDir() + "test/simpleGraph2.txt"
        graph = graphReader.readFromFile(fileName)

        self.assertEquals(graph.isUndirected(), True)
        self.assertEquals(graph.getNumVertices(), 5)
        self.assertEquals(graph.getNumEdges(), 4)

        # This variant carries non-unit weights on two of the edges.
        self.assertEquals(graph.getEdge(0, 1), 1.1)
        self.assertEquals(graph.getEdge(2, 4), 1)
        self.assertEquals(graph.getEdge(2, 2), 1.6)
        self.assertEquals(graph.getEdge(4, 0), 1)

        #Now test a file with directed edges
        fileName = PathDefaults.getDataDir() +  "test/simpleGraph3.txt"
        graph = graphReader.readFromFile(fileName)

        self.assertEquals(graph.isUndirected(), False)
        self.assertEquals(graph.getNumVertices(), 5)
        self.assertEquals(graph.getNumEdges(), 4)

        self.assertEquals(graph.getEdge(0, 1), 1)
        self.assertEquals(graph.getEdge(2, 4), 1)
        self.assertEquals(graph.getEdge(2, 2), 1)
        self.assertEquals(graph.getEdge(4, 0), 1)
 def __init__(self):
     """Build a sliced Bemol graph iterator (2000 users, from iteration 20, step 10)."""
     dataDir = PathDefaults.getDataDir() + "cluster/"

     numUsers = 2000  # set to 'None' to have all users
     purchasesPerIteration = 50  # set to 'None' to take all the purchases per date
     firstIteration = 20
     lastIteration = None  # set to 'None' to have all iterations
     step = 10

     graphIterator = BemolData.getGraphIterator(dataDir, numUsers, purchasesPerIteration)
     self.iterator = itertools.islice(graphIterator, firstIteration, lastIteration, step)
    def testGetTrainIteratorFunc(self):
        """
        Check that the train and test iterators yield matrices of identical
        shape for every iteration.
        """
        dataFilename = PathDefaults.getDataDir() + "reference/author_document_count" 
        dataset = Static2IdValDataset(dataFilename)

        trainIterator = dataset.getTrainIteratorFunc()()      
        testIterator = dataset.getTestIteratorFunc()()
        
        for trainX in trainIterator: 
            # Bug fix: Python 3 iterators have no .next() method — use the
            # builtin next() instead.
            testX = next(testIterator) 
            
            print(trainX.shape, trainX.nnz, testX.nnz)
            self.assertEquals(trainX.shape, testX.shape)
    def testCreateIndicatorLabels(self):
        """
        Check that createIndicatorLabel produces a one-hot row per example
        (rows sum to 1) and partially compare against the indicator columns
        stored in the data file.
        """
        metaUtils = MetabolomicsUtils()
        X, XStd, X2, (XoplsCortisol, XoplsTesto, XoplsIgf1), YCortisol, YTesto, YIgf1, ages = metaUtils.loadData()
        
        YCortisol = YCortisol[numpy.logical_not(numpy.isnan(YCortisol))]
        YCortisolIndicators = metaUtils.createIndicatorLabel(YCortisol, metaUtils.boundsDict["Cortisol"])
        
        YTesto = YTesto[numpy.logical_not(numpy.isnan(YTesto))]
        YTestoIndicators = metaUtils.createIndicatorLabel(YTesto, metaUtils.boundsDict["Testosterone"])
        
        YIgf1 = YIgf1[numpy.logical_not(numpy.isnan(YIgf1))]
        YIgf1Indicators = metaUtils.createIndicatorLabel(YIgf1, metaUtils.boundsDict["IGF1"])

        # Each indicator row must sum to exactly one (a partition of the range).
        s = numpy.sum(YCortisolIndicators, 1)
        nptst.assert_array_equal(s, numpy.ones(s.shape[0]))

        s = numpy.sum(YTestoIndicators, 1)
        nptst.assert_array_equal(s, numpy.ones(s.shape[0]))

        s = numpy.sum(YIgf1Indicators, 1)
        nptst.assert_array_equal(s, numpy.ones(s.shape[0]))

        #Now compare to those labels in the file
        # NOTE(review): this call unpacks loadData() into 7 values while the
        # call above unpacks 8 — one of the two is likely stale; verify.
        X, X2, (XoplsCortisol, XoplsTesto, XoplsIgf1), YCortisol, YTesto, YIgf1, ages = metaUtils.loadData()
        dataDir = PathDefaults.getDataDir() +  "metabolomic/"
        fileName = dataDir + "data.RMN.total.6.txt"
        data = pandas.read_csv(fileName, delimiter=",") 

        YCortisolIndicators = metaUtils.createIndicatorLabel(YCortisol, metaUtils.boundsDict["Cortisol"])
        YCortisolIndicators2 = numpy.array(data[["Ind.Cortisol.1", "Ind.Cortisol.2", "Ind.Cortisol.3"]])
        
        # Cortisol comparison is intentionally disabled (assertion commented out).
        for i in range(YCortisolIndicators.shape[0]): 
            if not numpy.isnan(YCortisol[i]) and not numpy.isnan(YCortisolIndicators2[i, :]).any(): 
                #nptst.assert_almost_equal(YCortisolIndicators2[i, :], YCortisolIndicators[i, :])
                pass 
        
        YTestoIndicators = metaUtils.createIndicatorLabel(YTesto, metaUtils.boundsDict["Testosterone"])
        YTestoIndicators2 = numpy.array(data[["Ind.Testo.1", "Ind.Testo.2", "Ind.Testo.3"]])
        
        for i in range(YTestoIndicators.shape[0]): 
            if not numpy.isnan(YTesto[i]) and not numpy.isnan(YTestoIndicators2[i, :]).any(): 
                #print(i, YTesto[i])
                nptst.assert_almost_equal(YTestoIndicators2[i, :], YTestoIndicators[i, :])
                
        YIgf1Indicators = metaUtils.createIndicatorLabel(YIgf1, metaUtils.boundsDict["IGF1"])
        YIgf1Indicators2 = numpy.array(data[["Ind.IGF1.1", "Ind.IGF1.2", "Ind.IGF1.3"]])
        
        # IGF1 comparison is intentionally disabled (assertion commented out).
        for i in range(YIgf1Indicators.shape[0]): 
            if not numpy.isnan(YIgf1[i]) and not numpy.isnan(YIgf1Indicators2[i, :]).any(): 
                #print(i, YIgf1[i])
                #nptst.assert_almost_equal(YIgf1Indicators2[i, :], YIgf1Indicators[i, :])
                pass
def processParkinsonsDataset(name, numRealisations):
    """
    Split the Parkinsons telemonitoring data into two regression problems
    (motor and total UPDRS targets) and save preprocessed shuffled splits.

    :param name: dataset file stem under modelPenalisation/regression/.
    :param numRealisations: number of random splits to generate.
    """
    numpy.random.seed(21)
    regressionDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    dataMatrix = numpy.loadtxt(regressionDir + name + ".data", delimiter=",", skiprows=1)

    # Columns 5 and 6 are the two targets; every other column is a feature.
    featureInds = list(set(range(dataMatrix.shape[1])) - set([5, 6]))
    X = dataMatrix[:, featureInds]
    yMotor = dataMatrix[:, 5]
    yTotal = dataMatrix[:, 6]

    #We don't keep whole collections of patients
    split = 0.5
    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)

    for suffix, y in (("-motor/", yMotor), ("-total/", yTotal)):
        outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + suffix
        preprocessSave(X, y, outputDir, idx)
    def testBayesError(self):
        """
        Grid-search C and gamma for a Gaussian-kernel SVM, comparing the Bayes
        error on the density grid with the empirical test error, then plot the
        decision surface of the best model.
        """
        dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
        data = numpy.load(dataDir + "toyData.npz")
        gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

        sampleSize = 100
        trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
        testX, testY = X[sampleSize:, :], y[sampleSize:]

        #We form a test set from the grid points
        gridX = numpy.zeros((gridPoints.shape[0]**2, 2))
        for m in range(gridPoints.shape[0]):
            gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
            gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

        # numpy.float was removed in NumPy >= 1.24; float64 is the same dtype.
        Cs = 2**numpy.arange(-5, 5, dtype=numpy.float64)
        gammas = 2**numpy.arange(-5, 5, dtype=numpy.float64)

        bestError = 1 

        for C in Cs:
            for gamma in gammas:
                svm = LibSVM(kernel="gaussian", C=C, kernelParam=gamma)
                svm.learnModel(trainX, trainY)
                predY, decisionsY = svm.predict(gridX, True)
                decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
                error = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)

                predY, decisionsY = svm.predict(testX, True)
                error2 = Evaluator.binaryError(testY, predY)
                print(error, error2)

                # Bug fix: the original wrote "error = bestError", which never
                # updated bestError, so bestC/bestGamma tracked the last model
                # with error < 1 rather than the minimum-error model.
                if error < bestError:
                    bestError = error
                    bestC = C
                    bestGamma = gamma

        # Retrain the best model and plot its decision surface and the data.
        svm = LibSVM(kernel="gaussian", C=bestC, kernelParam=bestGamma)
        svm.learnModel(trainX, trainY)
        predY, decisionsY = svm.predict(gridX, True)

        plt.figure(0)
        plt.contourf(gridPoints, gridPoints, decisionGrid, 100)
        plt.colorbar()

        plt.figure(1)
        plt.scatter(X[y==1, 0], X[y==1, 1], c='r' ,label="-1")
        plt.scatter(X[y==-1, 0], X[y==-1, 1], c='b',label="+1")
        plt.legend()
        plt.show()
    def testReadFromCsvFile(self):
        """Check readFromCsvFile recovers the expected examples and labels."""
        testDir = PathDefaults.getDataDir() + "test/"
        examplesList = ExamplesList.readFromCsvFile(testDir + "examplesList1.csv")

        X = examplesList.getDataField(examplesList.getDefaultExamplesName())
        y = examplesList.getDataField(examplesList.getLabelsName())

        # Known contents of examplesList1.csv.
        expectedX = numpy.array([[10, 2], [4, -6], [24, 6]])
        expectedY = numpy.array([[-1], [1], [-1]])

        self.assertTrue((X == expectedX).all())
        self.assertTrue((y == expectedY).all())
Exemple #24
0
def cluster():
    """
    Cluster the Bemol purchase graph sequence with iterative spectral
    clustering and print the clustering found at each iteration.
    """
    k1 = 20 # numCluster to learn
    k2 = 40 # numEigenVector kept

    # Renamed from "dir" to avoid shadowing the builtin.
    dataDir = PathDefaults.getDataDir() + "cluster/"
    graphIterator = getBemolGraphIterator(dataDir)
    #===========================================
    # cluster
    print("compute clusters")
    clusterer = IterativeSpectralClustering(k1, k2)
    clustersList = clusterer.clusterFromIterator(graphIterator, True)

    # Iterate directly instead of indexing (also fixes the odd indentation).
    for clusters in clustersList:
        print(clusters)
    def testRunSimulation(self):
        """
        End-to-end smoke test: train the SVM classifier, generate a random
        small-world graph, then run the diffusion simulation for 5 iterations.
        """
        egoFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoData.csv"
        alterFileName = PathDefaults.getDataDir()  + "infoDiffusion/AlterData.csv"
        numVertices = 1000
        infoProb = 0.1
        # Small-world parameters: rewiring probability and neighbour count.
        p = 0.1
        neighbours = 10

        generator = SmallWorldGenerator(p, neighbours)
        graph = SparseGraph(VertexList(numVertices, 0))
        graph = generator.generate(graph)
        
        # SVM training parameters.
        CVal = 1.0
        kernel = "linear"
        kernelParamVal = 0.0
        errorCost = 0.5
        folds = 6
        sampleSize = 1000

        maxIterations = 5

        self.svmEgoSimulator.trainClassifier(CVal, kernel, kernelParamVal, errorCost, sampleSize)
        self.svmEgoSimulator.generateRandomGraph(egoFileName, alterFileName, infoProb, graph)
        self.svmEgoSimulator.runSimulation(maxIterations)
Exemple #26
0
    def readHIVGraph(self, undirected=True, indicators=True):
        """
        We will use pacdate5389.csv which contains the data of infection. The undirected
        parameter instructs whether to create an undirected graph. If indicators
        is true then categorical varibles are turned into collections of indicator
        ones. 

        :param undirected: whether to build an undirected multigraph.
        :param indicators: whether to expand categorical features.
        :return: the SparseMultiGraph read from the HIV CSV files.
        """
        # Column converters keyed by CSV column index.
        converters = {1: CsvConverters.dateConv, 3:CsvConverters.dateConv, 5:CsvConverters.detectionConv, 6:CsvConverters.provConv, 8: CsvConverters.dateConv }
        converters[9] = CsvConverters.genderConv
        converters[10] = CsvConverters.orientConv
        converters[11] = CsvConverters.numContactsConv
        converters[12] = CsvConverters.numContactsConv
        converters[13] = CsvConverters.numContactsConv

        # Replace NaNs in each column by the column mean over non-NaN entries.
        def nanProcessor(X):
            means = numpy.zeros(X.shape[1])
            for i in range(X.shape[1]):
                if numpy.sum(numpy.isnan(X[:, i])) > 0:
                    logging.info("No. missing values in " + str(i) + "th column: " + str(numpy.sum(numpy.isnan(X[:, i]))))
                means[i] = numpy.mean(X[:, i][numpy.isnan(X[:, i]) == False])
                X[numpy.isnan(X[:, i]), i] = means[i]
            return X 

        idIndex = 0
        # NOTE(review): under Python 3 this is a dict view, not a list —
        # confirm MultiGraphCsvReader accepts a view.
        featureIndices = converters.keys()
        multiGraphCsvReader = MultiGraphCsvReader(idIndex, featureIndices, converters, nanProcessor)

        dataDir = PathDefaults.getDataDir()
        vertexFileName = dataDir + "HIV/alldata.csv"
        edgeFileNames = [dataDir + "HIV/grafdet2.csv", dataDir + "HIV/infect2.csv"]

        sparseMultiGraph = multiGraphCsvReader.readGraph(vertexFileName, edgeFileNames, undirected, delimiter="\t")

        #For learning purposes we will convert categorial variables into a set of
        #indicator features
        if indicators: 
            logging.info("Converting categorial features")
            vList = sparseMultiGraph.getVertexList()
            V = vList.getVertices(list(range(vList.getNumVertices())))
            catInds = [2, 3]
            generator = FeatureGenerator()
            V = generator.categoricalToIndicator(V, catInds)
            vList.replaceVertices(V)

        logging.info("Created " + str(sparseMultiGraph.getNumVertices()) + " examples with " + str(sparseMultiGraph.getVertexList().getNumFeatures()) + " features")

        return sparseMultiGraph
Exemple #27
0
    def __init__(self, maxIter=None, iterStartTimeStamp=None): 
        """
        Return a training and test set for netflix based on the time each 
        rating was made. There are 62 iterations. 

        :param maxIter: maximum number of iterations to yield, or None for all.
        :param iterStartTimeStamp: starting timestamp of the iterator; when
            None, defaults to midnight 2001-01-01 local time.
        """ 
        # One iteration advances by a 30-day window, in seconds.
        self.timeStep = timedelta(30).total_seconds()  
        
        #startDate is used to convert dates into ints 
        #self.startDate = datetime(1998,1,1)
        #self.endDate = datetime(2005,12,31)
        
        #iterStartDate is the starting date of the iterator 
        # NOTE(review): prefer "is not None" over "!= None" here.
        if iterStartTimeStamp != None: 
            self.iterStartTimeStamp = iterStartTimeStamp
        else: 
            self.iterStartTimeStamp = time.mktime(datetime(2001,1,1).timetuple()) 

        self.startMovieID = 1 
        self.endMovieID = 17770
        
        # Known dataset statistics (used for assertions during processing).
        self.numMovies = 17770
        self.numRatings = 100480507
        self.numProbeMovies = 16938
        self.numProbeRatings = 1408395
        self.numCustomers = 480189
        
        outputDir = PathDefaults.getOutputDir() + "recommend/netflix/"

        if not os.path.exists(outputDir): 
            os.mkdir(outputDir)
                
        self.ratingFileName = outputDir + "data.npz"  
        self.custDictFileName = outputDir + "custIdDict.pkl"
        self.probeFileName = PathDefaults.getDataDir() + "netflix/probe.txt"    
        self.testRatingsFileName = outputDir + "test_data.npz"
        self.isTrainRatingsFileName = outputDir + "is_train.npz"
        
        self.maxIter = maxIter 
        self.trainSplit = 4.0/5 

        # Side effects: processes, splits and loads the dataset on construction.
        self.processRatings()
        #self.processProbe()
        self.splitDataset()        
        self.loadProcessedData()
        
        if self.maxIter != None: 
            logging.debug("Maximum number of iterations: " + str(self.maxIter))
    def testPredict2(self):
        """
        Train a TreeRankForest at increasing max depths on the Gauss2D data
        (labels remapped to {-1, +1}) and compare train/test AUCs against
        previously recorded values.
        """
        #Test on Gauss2D dataset
        dataDir = PathDefaults.getDataDir()

        fileName = dataDir + "Gauss2D_learn.csv"
        XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        X = XY[:, 0:2]
        y = XY[:, 2]
        
        # Map labels from {0, 1} to {-1, +1}.
        y = y*2 - 1 

        fileName = dataDir + "Gauss2D_test.csv"
        testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2]
        
        testY = testY*2-1

        X = Standardiser().standardiseArray(X)
        testX = Standardiser().standardiseArray(testX)

        numTrees = 5
        minSplit = 50 
        # AUCs recorded from a previous run, one per depth in maxDepths.
        maxDepths = range(3, 10)
        trainAucs = numpy.array([0.7252582, 0.7323278, 0.7350289, 0.7372529, 0.7399985, 0.7382176, 0.7395104, 0.7386347])
        testAucs = numpy.array([0.6806122, 0.6851614, 0.6886183, 0.6904147, 0.6897266, 0.6874600, 0.6875980, 0.6878801])

        i = 0
        
        #The results are approximately the same, but not exactly 
        for maxDepth in maxDepths:
            treeRankForest = TreeRankForest(self.leafRanklearner)
            treeRankForest.setMaxDepth(maxDepth)
            treeRankForest.setMinSplit(minSplit)
            treeRankForest.setNumTrees(numTrees)
            treeRankForest.learnModel(X, y)
            trainScores = treeRankForest.predict(X)
            testScores = treeRankForest.predict(testX)

            print(Evaluator.auc(trainScores, y), Evaluator.auc(testScores, testY))

            self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 1)
            self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
            i+=1
    def testPredict2(self):
        """
        Tune a class-weighted random forest on the IGF1_0-Haar data and log
        the cross-validated AUC and its variance.
        """
        #We play around with parameters to maximise AUC on the IGF1_0-Haar data
        dataDir = PathDefaults.getDataDir()
        fileName = dataDir + "IGF1_0-Haar.npy"

        XY = numpy.load(fileName)
        X = XY[:, 0:XY.shape[1]-1]
        y = XY[:, XY.shape[1]-1].ravel()

        # Weight is the fraction of label-0 examples.  numpy.int was removed
        # in NumPy >= 1.24; the builtin int is the equivalent dtype here.
        weight = numpy.bincount(numpy.array(y, int))[0]/float(y.shape[0])
        #weight = 0.5
        #weight = 0.9

        folds = 3
        randomForest = RandomForest()
        randomForest.setWeight(weight)
        randomForest.setMaxDepth(50)
        #randomForest.setMinSplit(100)
        mean, var = randomForest.evaluateCv(X, y, folds, Evaluator.auc)
        logging.debug("AUC = " + str(mean))
        logging.debug("Var = " + str(var))
Exemple #30
0
def computeLearningRates(datasetNames, numProcesses, fileNameSuffix, learnerName, sampleSizes, foldsSet):
    """Compute and save learning-rate (beta) grids for a set of datasets.

    For every (name, numRealisations) entry in datasetNames and every sample
    size, the learner's learningRate is evaluated over paramDict on a fixed
    random subsample of the training data, and the resulting grids are saved
    to <outputDir>/<name><fileNameSuffix>.npz.  A FileLock prevents
    concurrent processes from recomputing the same output file.

    :param datasetNames: sequence of (datasetName, numRealisations) entries
    :param numProcesses: number of processes passed through to getSetup
    :param fileNameSuffix: suffix appended to each dataset's output file name
    :param learnerName: key identifying the learner for getSetup
    :param sampleSizes: 1D array of training subsample sizes to evaluate
    :param foldsSet: set of fold counts passed to learner.learningRate
    """
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    # getSetup may redirect dataDir/outputDir for the chosen learner.
    learner, loadMethod, dataDir, outputDir, paramDict = getSetup(learnerName, dataDir, outputDir, numProcesses)

    for entry in datasetNames:
        datasetName, numRealisations = entry[0], entry[1]
        logging.debug("Learning using dataset " + datasetName)
        outfileName = outputDir + datasetName + fileNameSuffix

        fileLock = FileLock(outfileName + ".npz")
        if not fileLock.isLocked() and not fileLock.fileExists():
            fileLock.lock()
            # try/finally guarantees the lock is released even if loading or
            # learning raises; previously an exception left the lock held.
            try:
                gridShape = [numRealisations, sampleSizes.shape[0]]
                gridShape.extend(list(learner.gridShape(paramDict)))
                gridShape = tuple(gridShape)

                betaGrids = numpy.zeros(gridShape)

                for k in range(sampleSizes.shape[0]):
                    sampleSize = sampleSizes[k]

                    logging.debug("Using sample size " + str(sampleSize))
                    for j in range(numRealisations):
                        Util.printIteration(j, 1, numRealisations, "Realisation: ")
                        trainX, trainY, testX, testY = loadMethod(dataDir, datasetName, j)

                        # Fixed seed so each realisation's subsample is reproducible.
                        numpy.random.seed(21)
                        trainInds = numpy.random.permutation(trainX.shape[0])[0:sampleSize]
                        validX = trainX[trainInds, :]
                        validY = trainY[trainInds]

                        betaGrids[j, k, :] = learner.learningRate(validX, validY, foldsSet, paramDict)

                numpy.savez(outfileName, betaGrids)
                logging.debug("Saved results as file " + outfileName + ".npz")
            finally:
                fileLock.unlock()
    def testReadFile(self):
        """Check that EgoCsvReader.readFile parses the test CSV into the
        expected 10x3 matrix of question values."""
        reader = EgoCsvReader()
        dataDir = PathDefaults.getDataDir()
        fileName = dataDir + "test/TestData.csv"
        questionIds = [("Q14", 0), ("Q12", 1), ("Q2", 0)]

        missing = 1
        X, titles = reader.readFile(fileName, questionIds, missing)

        # Expected values for the three selected questions, one row per ego.
        expectedX = numpy.array([
            [0.621903386, 0.608560354, 0.33290608],
            [0.318548924, 0.402390713, 0.129956291],
            [0.956658404, 0.344317772, 0.680386616],
            [0.267607668, 0.119647983, 0.116893619],
            [0.686589498, 0.402390713, 0.426789174],
            [0.373575769, 0.025846789, 0.797125005],
            [0.493793948, 0.402390713, 0.990507109],
            [0.524534585, 0.525169385, 0.772917183],
            [0.339055395, 0.402390713, 0.684788001],
            [0.997774183, 0.790801992, 0.643252009],
        ])

        self.assertAlmostEquals(numpy.linalg.norm(X - expectedX), 0, places=6)
    def testReadGraph(self):
        """Read a sparse multigraph from vertex/edge CSV files and verify the
        vertex features and edge lists for both undirected and directed reads."""
        dataDir = PathDefaults.getDataDir()
        vertexFileName = dataDir + "test/deggraf10.csv"
        edgeFileNames = [dataDir + "test/testEdges1.csv", dataDir + "test/testEdges2.csv"]

        # Converters mapping quoted categorical codes to integer labels.
        converters = {
            3: lambda x: {'"M"': 0, '"F"': 1}[x],
            4: lambda x: {'"HT"': 0, '"HB"': 1}[x],
            5: lambda x: {'"INTER"': 0, '"CONTA"': 1}[x],
            6: lambda x: {'"CH"': 0, '"SC"': 1, '"SS"': 2, '"LH"': 3, '"GM"': 4}[x],
        }

        idIndex = 0
        featureIndices = list(range(1, 11))
        multiGraphCsvReader = MultiGraphCsvReader(idIndex, featureIndices, converters)
        sparseMultiGraph = multiGraphCsvReader.readGraph(vertexFileName, edgeFileNames)

        # Expected feature vector for each of the 10 vertices.
        vertexValues = numpy.array([
            [1986, 32, 0, 0, 0, 0, 0, 3, 3, 1],
            [1986, 27, 0, 0, 0, 1, 0, 4, 4, 1],
            [1986, 20, 0, 0, 0, 1, 0, 1, 1, 0],
            [1986, 20, 0, 0, 0, 1, 0, 2, 2, 0],
            [1986, 20, 0, 0, 0, 2, 0, 5, 5, 0],
            [1986, 28, 0, 0, 0, 3, 0, 1, 1, 1],
            [1986, 26, 1, 0, 1, 3, 6, 1, 1, 1],
            [1986, 35, 0, 0, 0, 2, 0, 0, 0, 0],
            [1986, 37, 0, 1, 0, 3, 0, 5, 3, 0],
            [1986, 40, 0, 1, 0, 4, 0, 3, 3, 0],
        ], float)

        # Check if the values of the vertices are correct
        for i in range(sparseMultiGraph.getNumVertices()):
            self.assertTrue((sparseMultiGraph.getVertex(i) == vertexValues[i]).all())

        # Undirected read: each edge as (vertex1, vertex2, graphIndex).
        edges = numpy.array([
            [4, 0, 0],
            [4, 1, 0],
            [5, 3, 0],
            [7, 1, 0],
            [8, 0, 0],
            [4, 1, 1],
            [8, 1, 1],
            [8, 2, 1],
            [8, 4, 1],
            [9, 0, 1],
        ], float)

        self.assertTrue((sparseMultiGraph.getAllEdges() == edges).all())

        # Now test directed graphs
        sparseMultiGraph = multiGraphCsvReader.readGraph(vertexFileName, edgeFileNames, False)

        for i in range(sparseMultiGraph.getNumVertices()):
            self.assertTrue((sparseMultiGraph.getVertex(i) == vertexValues[i]).all())

        # Directed read preserves the original edge orientation.
        edges = numpy.array([
            [0, 4, 0],
            [1, 7, 0],
            [3, 5, 0],
            [4, 1, 0],
            [8, 0, 0],
            [0, 9, 1],
            [1, 8, 1],
            [2, 8, 1],
            [4, 1, 1],
            [8, 4, 1],
        ], float)

        self.assertTrue((sparseMultiGraph.getAllEdges() == edges).all())
    def testMDLGraphsReader(self):
        """Parse the test MDL file and verify the size and edge set of both
        molecule graphs."""
        reader = MDLGraphsReader()
        dataDir = PathDefaults.getDataDir()
        fileName = dataDir + "test/testGraphs.mdl"

        graphs = reader.readFromFile(fileName)
        self.assertEquals(len(graphs), 2)

        def getEdge(graph, i, j):
            # MDL files index atoms from 1; graph vertices are 0-based.
            return graph.getEdge(i - 1, j - 1)

        # Check the first graph
        self.assertEquals(graphs[0].getNumVertices(), 26)
        self.assertEquals(graphs[0].getNumEdges(), 28)

        firstGraphEdges = [(1, 6), (1, 2), (1, 18), (2, 3), (2, 19), (3, 4),
                           (3, 20), (4, 10), (4, 5), (5, 6), (5, 7), (6, 21),
                           (7, 8), (7, 22), (8, 9), (8, 23), (9, 14), (9, 10),
                           (10, 11), (11, 12), (11, 24), (12, 13), (12, 25),
                           (13, 14), (13, 15), (14, 26), (15, 16), (15, 17)]
        for i, j in firstGraphEdges:
            self.assertEquals(getEdge(graphs[0], i, j), 1)

        # Check the second graph
        self.assertEquals(graphs[1].getNumVertices(), 19)
        self.assertEquals(graphs[1].getNumEdges(), 20)

        secondGraphEdges = [(1, 10), (1, 2), (1, 14), (2, 3), (2, 15), (3, 8),
                            (3, 4), (4, 5), (4, 16), (5, 6), (5, 17), (6, 7),
                            (6, 18), (7, 8), (8, 9), (9, 10), (9, 11),
                            (10, 19), (11, 12), (11, 13)]
        for i, j in secondGraphEdges:
            self.assertEquals(getEdge(graphs[1], i, j), 1)
 def testGetDataDir(self):
     # Smoke test: just display the configured data directory.
     dataDir = PathDefaults.getDataDir()
     print(dataDir)
    def testReadFromFile(self):
        """Read the infect5 test CSV as an undirected graph, a directed graph,
        and a graph with no vertex features, verifying vertices and edges."""
        vertex1Indices = [0, 2, 3, 4, 5]
        vertex2Indices = [1, 6, 7, 8, 9]

        # Shared code tables for the quoted categorical columns.
        genderMap = {'"M"': 0, '"F"': 1}
        orientMap = {'"HT"': 0, '"HB"': 1}
        converters = {
            2: lambda x: genderMap[x],
            6: lambda x: genderMap[x],
            3: lambda x: orientMap[x],
            7: lambda x: orientMap[x],
        }

        csvGraphReader = CsvGraphReader(vertex1Indices, vertex2Indices, converters)

        dataDir = PathDefaults.getDataDir()
        fileName = dataDir + "test/infect5.csv"

        graph = csvGraphReader.readFromFile(fileName)

        expectedVertices = [
            [0, 0, 28, 1],
            [1, 0, 26, 1],
            [0, 1, 42, 2],
            [1, 0, 33, 1],
            [0, 1, 35, 37],
        ]
        for i, features in enumerate(expectedVertices):
            self.assertTrue((graph.getVertex(i) == numpy.array(features)).all())

        edgePairs = [(0, 1), (2, 3), (4, 6), (6, 7), (5, 8)]
        for i, j in edgePairs:
            self.assertTrue(graph.getEdge(i, j) == 1)

        self.assertEquals(graph.getNumEdges(), 5)
        self.assertTrue(graph.isUndirected())

        # Test a directed graph: reversed edges must be absent.
        csvGraphReader = CsvGraphReader(vertex1Indices, vertex2Indices, converters, undirected=False)
        graph = csvGraphReader.readFromFile(fileName)

        for i, j in edgePairs:
            self.assertTrue(graph.getEdge(j, i) == None)

        self.assertEquals(graph.getNumEdges(), 5)
        self.assertFalse(graph.isUndirected())

        # Test graph with no vertex information
        vertex1Indices = [0]
        vertex2Indices = [1]
        fileName = dataDir + "test/infect5-0.csv"
        csvGraphReader = CsvGraphReader(vertex1Indices, vertex2Indices, {})
        graph = csvGraphReader.readFromFile(fileName)

        for i, j in edgePairs:
            self.assertTrue(graph.getEdge(i, j) == 1)

        self.assertEquals(graph.getNumEdges(), 5)
        self.assertTrue(graph.isUndirected())
        self.assertEquals(graph.getVertexList().getNumFeatures(), 0)