コード例 #1
0
ファイル: DBLPDataset.py プロジェクト: pierrebo/wallhack
    def __init__(self, field):
        """
        Set up file locations and parameters for the DBLP dataset for a
        given research field, then run the cleaning and expert-matching
        pipeline.

        :param field: name of the field subdirectory under the reputation
            results directory -- TODO confirm the expected values
        """
        # Fixed seed so any random sampling downstream is reproducible
        numpy.random.seed(21)        
        
        dataDir = PathDefaults.getDataDir() + "dblp/"
        self.xmlFileName = dataDir + "dblp.xml"
        self.xmlCleanFilename = dataDir + "dblpClean.xml"        

        # Per-field output locations for expert/coauthor/publication data
        resultsDir = PathDefaults.getDataDir() + "reputation/" + field + "/"
        self.expertsFileName = resultsDir + "experts.txt"
        self.expertMatchesFilename = resultsDir + "experts_matches.csv"
        self.trainExpertMatchesFilename = resultsDir + "experts_train_matches.csv"
        self.testExpertMatchesFilename = resultsDir + "experts_test_matches.csv"
        self.coauthorsFilename = resultsDir + "coauthors.csv"
        self.publicationsFilename = resultsDir + "publications.csv"
        
        # Lines processed per progress step when scanning the XML file
        self.stepSize = 100000
        # Total line count of dblp.xml -- presumably fixed for this dump; verify
        self.numLines = 33532888
        # DBLP record tags treated as publications
        self.publicationTypes = set(["article" , "inproceedings", "proceedings", "book", "incollection", "phdthesis", "mastersthesis", "www"])
        # NOTE(review): p looks like a sampling/split proportion -- confirm against usage
        self.p = 0.5     
        # Minimum similarity score for an expert name match to be accepted
        self.matchCutoff = 0.95
        
        
        # Side effects: cleans the XML on disk and matches experts
        self.cleanXML()
        self.matchExperts()
        logging.warning("Now you must disambiguate the matched experts if not ready done")        
コード例 #2
0
def processSimpleDataset(name, numRealisations, split, ext=".csv", delimiter=",", usecols=None, skiprows=1, converters=None):
    """
    Load a regression dataset from a delimited text file, split it into
    features and a final-column target, generate shuffled train/test index
    realisations and save the preprocessed result.

    :param name: dataset name (also the output subdirectory name)
    :param numRealisations: number of shuffled splits to generate
    :param split: train/test split proportion
    """
    numpy.random.seed(21)

    regressionDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = regressionDir + name + ext
    print("Loading data from file " + fileName)
    outputDir = regressionDir + name + "/"

    # The last column is the regression target, the rest are features
    dataMatrix = numpy.loadtxt(fileName, delimiter=delimiter, skiprows=skiprows, usecols=usecols, converters=converters)
    X, y = dataMatrix[:, :-1], dataMatrix[:, -1]

    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)
    preprocessSave(X, y, outputDir, idx)
コード例 #3
0
    def testGenerateRandomGraph(self):
        """Smoke test: build a small-world graph and feed it to generateRandomGraph."""
        egoFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoData.csv"
        alterFileName = PathDefaults.getDataDir()  + "infoDiffusion/AlterData.csv"

        # Graph and diffusion parameters
        numVertices = 1000
        infoProb = 0.1
        rewireProb = 0.1
        neighbours = 10

        generator = SmallWorldGenerator(rewireProb, neighbours)
        graph = generator.generate(SparseGraph(VertexList(numVertices, 0)))

        self.svmEgoSimulator.generateRandomGraph(egoFileName, alterFileName, infoProb, graph)
コード例 #4
0
 def __init__(self):
     """Initialise label names, the data directory and per-hormone bin bounds."""
     self.labelNames = ["Cortisol.val", "Testosterone.val", "IGF1.val"]
     self.dataDir = PathDefaults.getDataDir() + "metabolomic/"
     # Bin boundaries used to convert raw hormone values into indicator labels
     self.boundsDict = {
         "Cortisol": numpy.array([0, 89, 225, 573]),
         "Testosterone": numpy.array([0, 3, 9, 13]),
         "IGF1": numpy.array([0, 200, 441, 782]),
     }
コード例 #5
0
    def testComputeIdealPenalty(self):
        """
        Check that the ideal penalty computed from the known density grid
        agrees (to 2 dp) with the penalty estimated from held-out data.
        """
        dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
        data = numpy.load(dataDir + "toyData.npz")
        gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

        sampleSize = 100
        trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
        testX, testY = X[sampleSize:, :], y[sampleSize:]

        #We form a test set from the grid points
        fullX = numpy.zeros((gridPoints.shape[0]**2, 2))
        for m in range(gridPoints.shape[0]):
            fullX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
            fullX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

        C = 1.0
        gamma = 1.0
        args = (trainX, trainY, fullX, C, gamma, gridPoints, pdfX, pdfY1X, pdfYminus1X)
        penalty = computeIdealPenalty(args)

        #Now compute penalty using data
        args = (trainX, trainY, testX, testY, C, gamma)
        penalty2 = computeIdealPenalty2(args)

        # Fix: assertAlmostEquals is a deprecated alias removed in Python 3.12
        self.assertAlmostEqual(penalty2, penalty, 2)
コード例 #6
0
ファイル: BemolData.py プロジェクト: malcolmreynolds/APGL
 def main(argv=None):
     """
     Command-line entry point: parse options and generate the Bemol data files.

     :param argv: argument list, defaults to sys.argv
     :return: 0 after printing help, 2 on a usage error, None on success
     """
     if argv is None:
         argv = sys.argv
     try:
         # read options
         try:
             # Fix: the long-option list was corrupted in the source
             # ('"nb_user="******"debug"'); restored to the options actually
             # handled in the loop below.
             opts, args = getopt.getopt(argv[1:], "hd:n:D", ["help", "dir=", "nb_user=", "debug"])
         except getopt.error as msg:
             raise RGUsage(msg)
         # apply options
         dataDir = PathDefaults.getDataDir() + "cluster/"  # renamed: 'dir' shadowed the builtin
         nb_user = None
         log_level = logging.INFO
         for o, a in opts:
             if o in ("-h", "--help"):
                 print(__doc__)
                 return 0
             elif o in ("-d", "--dir"):
                 dataDir = a
             elif o in ("-n", "--nb_user"):
                 nb_user = int(a)
             elif o in ("-D", "--debug"):
                 log_level = logging.DEBUG
         logging.basicConfig(stream=sys.stdout, level=log_level, format='%(levelname)s (%(asctime)s):%(message)s')
         # process: generate data files
         BemolData.generate_data_file(dataDir, nb_user)
     except RGUsage as err:
         logging.error(err.msg)
         logging.error("for help use --help")
         return 2
コード例 #7
0
    def testToyData(self):
        """
        Check the toy density grids integrate to 1 using trapezoidal-style
        averaging over grid squares: p(x) integrates to 1 and the combined
        p(x)p(y=1|x) + p(x)p(y=-1|x) integrates to 1.
        """
        dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
        data = numpy.load(dataDir + "toyData.npz")
        gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]


        pxSum = 0
        pY1XSum = 0
        pYminus1XSum = 0

        px2Sum = 0 
        # Area of one grid cell (the grid is uniformly spaced)
        squareArea = (gridPoints[1]-gridPoints[0])**2

        for i in range(gridPoints.shape[0]-1):
            for j in range(gridPoints.shape[0]-1):
                # Average the density over the 4 corners of each cell
                px = (pdfX[i,j]+pdfX[i+1,j]+pdfX[i, j+1]+pdfX[i+1, j+1])/4
                pxSum += px*squareArea

                pY1X = (pdfY1X[i,j]+pdfY1X[i+1,j]+pdfY1X[i, j+1]+pdfY1X[i+1, j+1])/4
                pY1XSum += pY1X*squareArea

                pYminus1X = (pdfYminus1X[i,j]+pdfYminus1X[i+1,j]+pdfYminus1X[i, j+1]+pdfYminus1X[i+1, j+1])/4
                pYminus1XSum += pYminus1X*squareArea

                px2Sum += px*pY1X*squareArea + px*pYminus1X*squareArea

        # Fix: assertAlmostEquals is a deprecated alias removed in Python 3.12
        self.assertAlmostEqual(pxSum, 1)
        print(pY1XSum)
        print(pYminus1XSum)

        self.assertAlmostEqual(px2Sum, 1)
コード例 #8
0
ファイル: TreeRankTest.py プロジェクト: malcolmreynolds/APGL
    def testPredict2(self):
        """
        Test TreeRank predictions on the Gauss2D dataset against recorded
        train/test AUC values over a range of maximum tree depths.
        """
        dataDir = PathDefaults.getDataDir()

        fileName = dataDir + "Gauss2D_learn.csv"
        XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        X = XY[:, 0:2]
        y = XY[:, 2]

        fileName = dataDir + "Gauss2D_test.csv"
        testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2]

        X = Standardiser().standardiseArray(X)
        testX = Standardiser().standardiseArray(testX)

        maxDepths = range(3, 10)
        # Reference AUCs, one per depth (training AUC saturates at depth >= 7)
        trainAucs = numpy.array([0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508])
        testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])
        i = 0
        
        #The results are approximately the same, but not exactly 
        for maxDepth in maxDepths:
            treeRank = TreeRank(self.leafRanklearner)
            treeRank.setMaxDepth(maxDepth)
            treeRank.learnModel(X, y)
            trainScores = treeRank.predict(X)
            testScores = treeRank.predict(testX)

            # Fix: assertAlmostEquals is a deprecated alias removed in Python 3.12
            self.assertAlmostEqual(Evaluator.auc(trainScores, y), trainAucs[i], 2)
            self.assertAlmostEqual(Evaluator.auc(testScores, testY), testAucs[i], 1)
            i+=1 
コード例 #9
0
    def testEdgeFile(self):
        """
        Check the Cit-HepTh edge file matches the counts reported in the
        paper: 352807 edges and 27770 vertices.
        """
        dataDir = PathDefaults.getDataDir() + "cluster/"
        edgesFilename = dataDir + "Cit-HepTh.txt"

        edges = {}
        vertices = {}

        # Fix: use a context manager so the file handle is always closed
        # (the original leaked it); 'file' also shadowed the builtin.
        with open(edgesFilename, 'r') as edgeFile:
            # Skip the 4 header lines
            for _ in range(4):
                edgeFile.readline()

            for line in edgeFile:
                (vertex1, sep, vertex2) = line.partition("\t")
                vertex1 = vertex1.strip()
                vertex2 = vertex2.strip()
                edges[(vertex1, vertex2)] = 0
                vertices[vertex1] = 0
                vertices[vertex2] = 0

        #It says there are 352807 edges in paper and 27770 vertices
        # Fix: assertEquals is a deprecated alias removed in Python 3.12
        self.assertEqual(len(edges), 352807)
        self.assertEqual(len(vertices), 27770)
コード例 #10
0
ファイル: EgoUtilsTest.py プロジェクト: malcolmreynolds/APGL
 def testGraphFromMatFile(self):
     """
     Check a graph loaded from a .mat file is consistent with the examples
     list: edge/vertex counts, info indicator flags and feature values.
     """
     matFileName = PathDefaults.getDataDir() +  "infoDiffusion/EgoAlterTransmissions1000.mat"
     sGraph = EgoUtils.graphFromMatFile(matFileName)
     
     examplesList = ExamplesList.readFromMatFile(matFileName)
     numFeatures = examplesList.getDataFieldSize("X", 1)
     
     # Fix: assertEquals is a deprecated alias removed in Python 3.12
     self.assertEqual(examplesList.getNumExamples(), sGraph.getNumEdges())
     self.assertEqual(examplesList.getNumExamples()*2, sGraph.getNumVertices())
     # Fix: use floor division -- '/' yields a float in Python 3
     self.assertEqual(numFeatures//2+1, sGraph.getVertexList().getNumFeatures())
     
     #Every even vertex has information, odd does not 
     for i in range(0, sGraph.getNumVertices()): 
         vertex = sGraph.getVertex(i)
         
         if i%2 == 0: 
             self.assertEqual(vertex[sGraph.getVertexList().getNumFeatures()-1], 1)
         else: 
             self.assertEqual(vertex[sGraph.getVertexList().getNumFeatures()-1], 0)
             
     #Test the first few vertices are the same 
     for i in range(0, 10): 
         # Fix: slice indices must be integers in Python 3 (numFeatures//2)
         vertex1 = sGraph.getVertex(i*2)[0:numFeatures//2]
         vertex2 = sGraph.getVertex(i*2+1)[0:numFeatures//2]
         vertexEx1 = examplesList.getSubDataField("X", numpy.array([i])).ravel()[0:numFeatures//2]
         vertexEx2 = examplesList.getSubDataField("X", numpy.array([i])).ravel()[numFeatures//2:numFeatures]
         
         self.assertTrue((vertex1 == vertexEx1).all())
         self.assertTrue((vertex2 == vertexEx2).all())
コード例 #11
0
    def __init__(self, maxIter=None, iterStartTimeStamp=None): 
        """
        Set up file locations for the Erasm group-membership dataset and run
        the rating processing/splitting/loading pipeline.

        :param maxIter: maximum number of iterations to yield (None for all)
        :param iterStartTimeStamp: starting timestamp of the iterator;
            defaults to 1286229600 -- presumably early Oct 2010, verify
        """
        outputDir = PathDefaults.getOutputDir() + "recommend/erasm/"

        if not os.path.exists(outputDir): 
            os.mkdir(outputDir)
            
        #iterStartDate is the starting date of the iterator 
        # Fix: compare with None using "is not" rather than "!="
        if iterStartTimeStamp is not None: 
            self.iterStartTimeStamp = iterStartTimeStamp
        else: 
            self.iterStartTimeStamp = 1286229600
            
        # Iterator step of 30 days, expressed in seconds
        self.timeStep = timedelta(30).total_seconds()             
                
        self.ratingFileName = outputDir + "data.npz"          
        self.userDictFileName = outputDir + "userIdDict.pkl"   
        self.groupDictFileName = outputDir + "groupIdDict.pkl" 
        self.isTrainRatingsFileName = outputDir + "is_train.npz"
    
        self.dataDir = PathDefaults.getDataDir() + "erasm/"
        self.dataFileName = self.dataDir + "groupMembers-29-11-12" 
        
        self.maxIter = maxIter 
        self.trainSplit = 4.0/5 
        
        # Side effects: process raw ratings, split and load processed data
        self.processRatings()
        self.splitDataset()        
        self.loadProcessedData()
コード例 #12
0
ファイル: NetflixDataset.py プロジェクト: pierrebo/wallhack
    def processRatings(self): 
        """
        Convert the dataset into a matrix and save the results for faster 
        access. 

        Reads one file per movie, assigns each customer a contiguous index,
        and saves (movieIds, custIds, ratings, dates) arrays plus the
        customer id dictionary. Skips processing if outputs already exist.
        """
        if not os.path.exists(self.ratingFileName) or not os.path.exists(self.custDictFileName): 
            dataDir = PathDefaults.getDataDir() + "netflix/training_set/"

            logging.debug("Processing ratings given in " + dataDir)

            custIdDict = {} 
            custIdSet = set([])        
            
            # Compact typed arrays keep memory low while accumulating
            movieIds = array.array("I")
            custIds = array.array("I")
            ratings = array.array("B")
            dates = array.array("L")
            j = 0
            
            for i in range(self.startMovieID, self.endMovieID+1): 
                Util.printIteration(i-1, 1, self.endMovieID-1)
                # Fix: use a context manager -- the original leaked one file
                # handle per movie
                with open(dataDir + "mv_" + str(i).zfill(7) + ".txt") as ratingsFile:
                    # First line is the movie id header; skip it
                    ratingsFile.readline()
                    
                    for line in ratingsFile: 
                        vals = line.split(",")
                        
                        custId = int(vals[0])
                        
                        # Map raw customer ids to contiguous indices
                        if custId not in custIdSet: 
                            custIdSet.add(custId)
                            custIdDict[custId] = j
                            custInd = j 
                            j += 1 
                        else: 
                            custInd = custIdDict[custId]
                        
                        rating = int(vals[1])     
                        t = datetime.strptime(vals[2].strip(), "%Y-%m-%d")
                    
                        movieIds.append(i-1)
                        custIds.append(custInd)   
                        ratings.append(rating)
                        dates.append(int(time.mktime(t.timetuple()))) 
                    
            movieIds = numpy.array(movieIds, numpy.uint32)
            custIds = numpy.array(custIds, numpy.uint32)
            ratings = numpy.array(ratings, numpy.uint8)
            dates = numpy.array(dates, numpy.uint32)
            
            assert ratings.shape[0] == self.numRatings            
            
            numpy.savez(self.ratingFileName, movieIds, custIds, ratings, dates) 
            logging.debug("Saved ratings file as " + self.ratingFileName)
            
            # Fix: close the pickle file handle (original left it open)
            with open(self.custDictFileName, 'wb') as custDictFile:
                pickle.dump(custIdDict, custDictFile)
            logging.debug("Saved custIdDict as " + self.custDictFileName)
        else: 
            logging.debug("Ratings file " + str(self.ratingFileName) + " already processed")
コード例 #13
0
    def loadData():
        """
        Return the raw spectra and the MDS transformed data as well as the DataFrame
        for the MDS data. 

        :return: tuple (X, X2, Xs, (Xopls1, Xopls2, Xopls3), YList, ages, df)
            where Xs is the standardised raw spectra and df is the last
            DataFrame loaded (testo.log.OSC.1.txt).
        """
        utilsLib = importr('utils')

        dataDir = PathDefaults.getDataDir() +  "metabolomic/"

        def readColumns(fileName, minIndex, maxIndex):
            # Helper: read an R table and extract columns [minIndex, maxIndex)
            # as a numpy array with one row per sample. Factors out the
            # read/extract pattern duplicated five times in the original.
            frame = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
            cols = frame.rx(robjects.IntVector(range(minIndex, maxIndex)))
            return frame, numpy.array(cols).T

        df, X = readColumns(dataDir + "data.RMN.total.6.txt", 1, 951)

        #Load age and normalise (missing values are assigned the mean)
        ages = numpy.array(df.rx(robjects.StrVector(["Age"]))).ravel()
        meanAge = numpy.mean(ages[numpy.logical_not(numpy.isnan(ages))])
        ages[numpy.isnan(ages)] = meanAge
        ages = Standardiser().standardiseArray(ages)

        # Fix: the original made a dead X.copy() and then overwrote it
        Xs = Standardiser().standardiseArray(X)

        df, X2 = readColumns(dataDir + "data.sportsmen.log.AP.1.txt", 1, 419)

        #Load the OPLS corrected files
        df, Xopls1 = readColumns(dataDir + "IGF1.log.OSC.1.txt", 22, 441)
        df, Xopls2 = readColumns(dataDir + "cort.log.OSC.1.txt", 20, 439)
        df, Xopls3 = readColumns(dataDir + "testo.log.OSC.1.txt", 22, 441)

        #Let's load all the label data here
        # NOTE(review): as in the original, YList (and the returned df) come
        # from the *last* file loaded (testo.log.OSC.1.txt) -- confirm intended
        labelNames = MetabolomicsUtils.getLabelNames()
        YList = MetabolomicsUtils.createLabelList(df, labelNames)
        
        return X, X2, Xs, (Xopls1, Xopls2, Xopls3), YList, ages, df
コード例 #14
0
ファイル: DatasetStats.py プロジェクト: malcolmreynolds/APGL
 def getIterator(): 
     """Return an iterator over a slice of the Bemol purchase graph sequence."""
     dataDir = PathDefaults.getDataDir() + "cluster/"

     numUsers = 10000             # set to 'None' to have all users
     purchasesPerIt = 500         # set to 'None' to take all the purchases per date
     firstIteration = 300
     lastIteration = 600          # set to 'None' to have all iterations
     step = 1

     graphIter = BemolData.getGraphIterator(dataDir, numUsers, purchasesPerIt)
     return itertools.islice(graphIter, firstIteration, lastIteration, step)
コード例 #15
0
 def profileClusterFromIterator(self):
     """Profile clusterFromIterator over an increasing sequence of subgraphs."""
     subgraphIter = IncreasingSubgraphListIterator(self.graph, self.subgraphIndicesList)
     dataDir = PathDefaults.getDataDir() + "cluster/"
     #iterator = getBemolGraphIterator(dataDir)

     def run():
         # Cluster the whole sequence and print cumulative timings
         clusterList, timeList, boundList = self.clusterer.clusterFromIterator(subgraphIter, verbose=True)
         print(timeList.cumsum(0))

     ProfileUtils.profile('run()', globals(), locals())
コード例 #16
0
    def testReadGraph(self):
        """
        Read three variants of a simple graph file (default indexing,
        re-indexed vertices, directed edges) and check structure and edge
        weights.
        """
        fileName = PathDefaults.getDataDir() + "test/simpleGraph.txt"

        graphReader = SimpleGraphReader()
        graph = graphReader.readFromFile(fileName)

        logging.debug((graph.getAllEdges()))

        # Fix throughout: assertEquals is a deprecated alias removed in
        # Python 3.12; use assertEqual
        self.assertEqual(graph.isUndirected(), True)
        self.assertEqual(graph.getNumVertices(), 5)
        self.assertEqual(graph.getNumEdges(), 4)

        self.assertEqual(graph.getEdge(0, 1), 1)
        self.assertEqual(graph.getEdge(2, 4), 1)
        self.assertEqual(graph.getEdge(2, 2), 1)
        self.assertEqual(graph.getEdge(4, 0), 1)

        #Now test reading a file with the same graph but vertices indexed differently
        fileName = PathDefaults.getDataDir() + "test/simpleGraph2.txt"
        graph = graphReader.readFromFile(fileName)

        self.assertEqual(graph.isUndirected(), True)
        self.assertEqual(graph.getNumVertices(), 5)
        self.assertEqual(graph.getNumEdges(), 4)

        self.assertEqual(graph.getEdge(0, 1), 1.1)
        self.assertEqual(graph.getEdge(2, 4), 1)
        self.assertEqual(graph.getEdge(2, 2), 1.6)
        self.assertEqual(graph.getEdge(4, 0), 1)

        #Now test a file with directed edges
        fileName = PathDefaults.getDataDir() + "test/simpleGraph3.txt"
        graph = graphReader.readFromFile(fileName)

        self.assertEqual(graph.isUndirected(), False)
        self.assertEqual(graph.getNumVertices(), 5)
        self.assertEqual(graph.getNumEdges(), 4)

        self.assertEqual(graph.getEdge(0, 1), 1)
        self.assertEqual(graph.getEdge(2, 4), 1)
        self.assertEqual(graph.getEdge(2, 2), 1)
        self.assertEqual(graph.getEdge(4, 0), 1)
コード例 #17
0
    def testReadGraph(self):
        """
        Read three variants of a simple graph file (default indexing,
        re-indexed vertices, directed edges) and check structure and edge
        weights.
        """
        fileName = PathDefaults.getDataDir() +  "test/simpleGraph.txt"

        graphReader = SimpleGraphReader()
        graph = graphReader.readFromFile(fileName)

        logging.debug((graph.getAllEdges()))

        # Fix throughout: assertEquals is a deprecated alias removed in
        # Python 3.12; use assertEqual
        self.assertEqual(graph.isUndirected(), True)
        self.assertEqual(graph.getNumVertices(), 5)
        self.assertEqual(graph.getNumEdges(), 4)

        self.assertEqual(graph.getEdge(0, 1), 1)
        self.assertEqual(graph.getEdge(2, 4), 1)
        self.assertEqual(graph.getEdge(2, 2), 1)
        self.assertEqual(graph.getEdge(4, 0), 1)

        #Now test reading a file with the same graph but vertices indexed differently
        fileName = PathDefaults.getDataDir() + "test/simpleGraph2.txt"
        graph = graphReader.readFromFile(fileName)

        self.assertEqual(graph.isUndirected(), True)
        self.assertEqual(graph.getNumVertices(), 5)
        self.assertEqual(graph.getNumEdges(), 4)

        self.assertEqual(graph.getEdge(0, 1), 1.1)
        self.assertEqual(graph.getEdge(2, 4), 1)
        self.assertEqual(graph.getEdge(2, 2), 1.6)
        self.assertEqual(graph.getEdge(4, 0), 1)

        #Now test a file with directed edges
        fileName = PathDefaults.getDataDir() +  "test/simpleGraph3.txt"
        graph = graphReader.readFromFile(fileName)

        self.assertEqual(graph.isUndirected(), False)
        self.assertEqual(graph.getNumVertices(), 5)
        self.assertEqual(graph.getNumEdges(), 4)

        self.assertEqual(graph.getEdge(0, 1), 1)
        self.assertEqual(graph.getEdge(2, 4), 1)
        self.assertEqual(graph.getEdge(2, 2), 1)
        self.assertEqual(graph.getEdge(4, 0), 1)
コード例 #18
0
 def __init__(self):
     """Create an iterator over a slice of the Bemol purchase graph sequence."""
     dataDir = PathDefaults.getDataDir() + "cluster/"

     numUsers = 2000          # set to 'None' to have all users
     purchasesPerIt = 50      # set to 'None' to take all the purchases per date
     firstIteration = 20
     lastIteration = None     # set to 'None' to have all iterations
     step = 10

     graphIter = BemolData.getGraphIterator(dataDir, numUsers, purchasesPerIt)
     self.iterator = itertools.islice(graphIter, firstIteration, lastIteration, step)
コード例 #19
0
    def testGetTrainIteratorFunc(self):
        """
        Check train and test iterators yield sparse matrices of matching
        shape, element by element.
        """
        dataFilename = PathDefaults.getDataDir() + "reference/author_document_count" 
        dataset = Static2IdValDataset(dataFilename)

        trainIterator = dataset.getTrainIteratorFunc()()      
        testIterator = dataset.getTestIteratorFunc()()
        
        for trainX in trainIterator: 
            # Fix: iterator.next() is Python 2 only; use the next() builtin
            testX = next(testIterator) 
            
            print(trainX.shape, trainX.nnz, testX.nnz)
            # Fix: assertEquals is a deprecated alias removed in Python 3.12
            self.assertEqual(trainX.shape, testX.shape)
コード例 #20
0
    def testCreateIndicatorLabels(self):
        """
        Check that createIndicatorLabel produces one-hot indicator rows for
        each hormone, and partially compare them to the indicator columns
        stored in the raw data file.
        """
        metaUtils = MetabolomicsUtils()
        # NOTE(review): this unpacking (8 values, with XStd) differs from the
        # second loadData() call below (7 values) -- confirm loadData's
        # actual return signature
        X, XStd, X2, (XoplsCortisol, XoplsTesto, XoplsIgf1), YCortisol, YTesto, YIgf1, ages = metaUtils.loadData()
        
        # Drop missing values before building indicator labels
        YCortisol = YCortisol[numpy.logical_not(numpy.isnan(YCortisol))]
        YCortisolIndicators = metaUtils.createIndicatorLabel(YCortisol, metaUtils.boundsDict["Cortisol"])
        
        YTesto = YTesto[numpy.logical_not(numpy.isnan(YTesto))]
        YTestoIndicators = metaUtils.createIndicatorLabel(YTesto, metaUtils.boundsDict["Testosterone"])
        
        YIgf1 = YIgf1[numpy.logical_not(numpy.isnan(YIgf1))]
        YIgf1Indicators = metaUtils.createIndicatorLabel(YIgf1, metaUtils.boundsDict["IGF1"])

        # Each indicator row must sum to 1 (exactly one class per sample)
        s = numpy.sum(YCortisolIndicators, 1)
        nptst.assert_array_equal(s, numpy.ones(s.shape[0]))

        s = numpy.sum(YTestoIndicators, 1)
        nptst.assert_array_equal(s, numpy.ones(s.shape[0]))

        s = numpy.sum(YIgf1Indicators, 1)
        nptst.assert_array_equal(s, numpy.ones(s.shape[0]))

        #Now compare to those labels in the file
        X, X2, (XoplsCortisol, XoplsTesto, XoplsIgf1), YCortisol, YTesto, YIgf1, ages = metaUtils.loadData()
        dataDir = PathDefaults.getDataDir() +  "metabolomic/"
        fileName = dataDir + "data.RMN.total.6.txt"
        data = pandas.read_csv(fileName, delimiter=",") 

        YCortisolIndicators = metaUtils.createIndicatorLabel(YCortisol, metaUtils.boundsDict["Cortisol"])
        YCortisolIndicators2 = numpy.array(data[["Ind.Cortisol.1", "Ind.Cortisol.2", "Ind.Cortisol.3"]])
        
        # NOTE(review): the cortisol comparison is deliberately disabled
        # (assertion commented out) -- presumably known mismatches; confirm
        for i in range(YCortisolIndicators.shape[0]): 
            if not numpy.isnan(YCortisol[i]) and not numpy.isnan(YCortisolIndicators2[i, :]).any(): 
                #nptst.assert_almost_equal(YCortisolIndicators2[i, :], YCortisolIndicators[i, :])
                pass 
        
        YTestoIndicators = metaUtils.createIndicatorLabel(YTesto, metaUtils.boundsDict["Testosterone"])
        YTestoIndicators2 = numpy.array(data[["Ind.Testo.1", "Ind.Testo.2", "Ind.Testo.3"]])
        
        # Only testosterone indicators are actually asserted against the file
        for i in range(YTestoIndicators.shape[0]): 
            if not numpy.isnan(YTesto[i]) and not numpy.isnan(YTestoIndicators2[i, :]).any(): 
                #print(i, YTesto[i])
                nptst.assert_almost_equal(YTestoIndicators2[i, :], YTestoIndicators[i, :])
                
        YIgf1Indicators = metaUtils.createIndicatorLabel(YIgf1, metaUtils.boundsDict["IGF1"])
        YIgf1Indicators2 = numpy.array(data[["Ind.IGF1.1", "Ind.IGF1.2", "Ind.IGF1.3"]])
        
        # NOTE(review): IGF1 comparison is also disabled -- confirm why
        for i in range(YIgf1Indicators.shape[0]): 
            if not numpy.isnan(YIgf1[i]) and not numpy.isnan(YIgf1Indicators2[i, :]).any(): 
                #print(i, YIgf1[i])
                #nptst.assert_almost_equal(YIgf1Indicators2[i, :], YIgf1Indicators[i, :])
                pass
コード例 #21
0
def processParkinsonsDataset(name, numRealisations):
    """
    Load the Parkinsons dataset, separate the two target columns (5 and 6)
    from the features, and save preprocessed shuffled realisations for the
    motor and total targets.

    :param name: dataset name (also used for the output subdirectories)
    :param numRealisations: number of shuffled splits to generate
    """
    numpy.random.seed(21)
    regressionDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = regressionDir + name + ".data"

    XY = numpy.loadtxt(fileName, delimiter=",", skiprows=1)

    # Columns 5 and 6 are the targets; everything else is a feature
    inds = list(set(range(XY.shape[1])) - set([5, 6]))
    X = XY[:, inds]
    y1, y2 = XY[:, 5], XY[:, 6]

    #We don't keep whole collections of patients
    split = 0.5
    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)

    preprocessSave(X, y1, regressionDir + name + "-motor/", idx)
    preprocessSave(X, y2, regressionDir + name + "-total/", idx)
コード例 #22
0
    def testBayesError(self):
        """
        Grid-search SVM (C, gamma) parameters, compare the Bayes error
        computed on the density grid with the empirical test error, then
        plot the decision surface of the best model.
        """
        dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
        data = numpy.load(dataDir + "toyData.npz")
        gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

        sampleSize = 100
        trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
        testX, testY = X[sampleSize:, :], y[sampleSize:]

        #We form a test set from the grid points
        gridX = numpy.zeros((gridPoints.shape[0]**2, 2))
        for m in range(gridPoints.shape[0]):
            gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
            gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

        Cs = 2**numpy.arange(-5, 5, dtype=numpy.float)
        gammas = 2**numpy.arange(-5, 5, dtype=numpy.float)

        bestError = 1 
        bestC = Cs[0]
        bestGamma = gammas[0]

        for C in Cs:
            for gamma in gammas:
                svm = LibSVM(kernel="gaussian", C=C, kernelParam=gamma)
                svm.learnModel(trainX, trainY)
                predY, decisionsY = svm.predict(gridX, True)
                decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
                error = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)

                predY, decisionsY = svm.predict(testX, True)
                error2 = Evaluator.binaryError(testY, predY)
                print(error, error2)

                # Fix: the original wrote "error = bestError", so bestError
                # was never updated and the best parameters were not tracked
                if error < bestError:
                    bestError = error
                    bestC = C
                    bestGamma = gamma

        svm = LibSVM(kernel="gaussian", C=bestC, kernelParam=bestGamma)
        svm.learnModel(trainX, trainY)
        predY, decisionsY = svm.predict(gridX, True)
        # Fix: recompute the decision grid for the best model (the original
        # plotted the stale grid from the last loop iteration)
        decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")

        plt.figure(0)
        plt.contourf(gridPoints, gridPoints, decisionGrid, 100)
        plt.colorbar()

        plt.figure(1)
        plt.scatter(X[y==1, 0], X[y==1, 1], c='r' ,label="-1")
        plt.scatter(X[y==-1, 0], X[y==-1, 1], c='b',label="+1")
        plt.legend()
        plt.show()
コード例 #23
0
    def testReadFromCsvFile(self):
        """Check examples and labels are read correctly from a CSV file."""
        testDir = PathDefaults.getDataDir() + "test/"
        examplesList = ExamplesList.readFromCsvFile(testDir + "examplesList1.csv")

        X = examplesList.getDataField(examplesList.getDefaultExamplesName())
        y = examplesList.getDataField(examplesList.getLabelsName())

        # Expected contents of examplesList1.csv
        expectedX = numpy.array([[10, 2], [4, -6], [24, 6]])
        expectedY = numpy.array([[-1], [1], [-1]])

        self.assertTrue((X == expectedX).all())
        self.assertTrue((y == expectedY).all())
コード例 #24
0
ファイル: BemolData.py プロジェクト: malcolmreynolds/APGL
def cluster():
    """Cluster the Bemol graph sequence using iterative spectral clustering."""
    numClusters = 20      # number of clusters to learn (k1)
    numEigenVectors = 40  # number of eigenvectors kept (k2)

    dataDir = PathDefaults.getDataDir() + "cluster/"
    graphIterator = getBemolGraphIterator(dataDir)

    # Run the clustering over the whole graph sequence
    print("compute clusters")
    clusterer = IterativeSpectralClustering(numClusters, numEigenVectors)
    clustersList = clusterer.clusterFromIterator(graphIterator, True)

    for clusters in clustersList:
        print(clusters)
コード例 #25
0
    def testRunSimulation(self):
        """
        Generate a small-world graph, train the ego SVM classifier, and run a
        short information-diffusion simulation, checking it completes.
        """
        egoFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoData.csv"
        alterFileName = PathDefaults.getDataDir() + "infoDiffusion/AlterData.csv"
        numVertices = 1000
        infoProb = 0.1     # probability a vertex initially holds the information
        p = 0.1            # small-world rewiring probability
        neighbours = 10    # neighbours per vertex before rewiring

        generator = SmallWorldGenerator(p, neighbours)
        graph = SparseGraph(VertexList(numVertices, 0))
        graph = generator.generate(graph)

        # SVM training parameters (removed an unused `folds` local)
        CVal = 1.0
        kernel = "linear"
        kernelParamVal = 0.0
        errorCost = 0.5
        sampleSize = 1000

        maxIterations = 5

        self.svmEgoSimulator.trainClassifier(CVal, kernel, kernelParamVal, errorCost, sampleSize)
        self.svmEgoSimulator.generateRandomGraph(egoFileName, alterFileName, infoProb, graph)
        self.svmEgoSimulator.runSimulation(maxIterations)
コード例 #26
0
ファイル: HIVGraphReader.py プロジェクト: pierrebo/wallhack
    def readHIVGraph(self, undirected=True, indicators=True):
        """
        Read the HIV multigraph (detection + infection edge sets) from the data
        directory.

        We will use pacdate5389.csv which contains the data of infection. The
        undirected parameter instructs whether to create an undirected graph.
        If indicators is true then categorical variables are turned into
        collections of indicator ones.
        """
        # Column index -> conversion function for the vertex CSV file
        converters = {1: CsvConverters.dateConv, 3:CsvConverters.dateConv, 5:CsvConverters.detectionConv, 6:CsvConverters.provConv, 8: CsvConverters.dateConv }
        converters[9] = CsvConverters.genderConv
        converters[10] = CsvConverters.orientConv
        converters[11] = CsvConverters.numContactsConv
        converters[12] = CsvConverters.numContactsConv
        converters[13] = CsvConverters.numContactsConv

        def nanProcessor(X):
            """Impute NaNs in each column of X with that column's non-NaN mean."""
            means = numpy.zeros(X.shape[1])
            for i in range(X.shape[1]):
                numMissing = numpy.sum(numpy.isnan(X[:, i]))
                if numMissing > 0:
                    logging.info("No. missing values in " + str(i) + "th column: " + str(numMissing))
                # Mean over the non-NaN entries only
                means[i] = numpy.mean(X[:, i][~numpy.isnan(X[:, i])])
                X[numpy.isnan(X[:, i]), i] = means[i]
            return X

        idIndex = 0
        # Wrap in list(): dict.keys() is a view in Python 3, and the reader
        # expects an indexable sequence of column indices.
        featureIndices = list(converters.keys())
        multiGraphCsvReader = MultiGraphCsvReader(idIndex, featureIndices, converters, nanProcessor)

        dataDir = PathDefaults.getDataDir()
        vertexFileName = dataDir + "HIV/alldata.csv"
        edgeFileNames = [dataDir + "HIV/grafdet2.csv", dataDir + "HIV/infect2.csv"]

        sparseMultiGraph = multiGraphCsvReader.readGraph(vertexFileName, edgeFileNames, undirected, delimiter="\t")

        #For learning purposes we will convert categorial variables into a set of
        #indicator features
        if indicators: 
            logging.info("Converting categorial features")
            vList = sparseMultiGraph.getVertexList()
            V = vList.getVertices(list(range(vList.getNumVertices())))
            catInds = [2, 3]
            generator = FeatureGenerator()
            V = generator.categoricalToIndicator(V, catInds)
            vList.replaceVertices(V)

        logging.info("Created " + str(sparseMultiGraph.getNumVertices()) + " examples with " + str(sparseMultiGraph.getVertexList().getNumFeatures()) + " features")

        return sparseMultiGraph
コード例 #27
0
ファイル: NetflixDataset.py プロジェクト: pierrebo/wallhack
    def __init__(self, maxIter=None, iterStartTimeStamp=None): 
        """
        Return a training and test set for netflix based on the time each 
        rating was made. There are 62 iterations. 

        :param maxIter: optional cap on the number of iterations (None = no cap)
        :param iterStartTimeStamp: starting timestamp of the iterator; defaults
            to 2001-01-01 when not given.
        """ 
        # Length of one iteration step in seconds (30 days)
        self.timeStep = timedelta(30).total_seconds()  
        
        #startDate is used to convert dates into ints 
        #self.startDate = datetime(1998,1,1)
        #self.endDate = datetime(2005,12,31)
        
        #iterStartDate is the starting date of the iterator 
        if iterStartTimeStamp is not None: 
            self.iterStartTimeStamp = iterStartTimeStamp
        else: 
            self.iterStartTimeStamp = time.mktime(datetime(2001,1,1).timetuple()) 

        self.startMovieID = 1 
        self.endMovieID = 17770
        
        # Fixed sizes of the netflix dataset and its probe subset
        self.numMovies = 17770
        self.numRatings = 100480507
        self.numProbeMovies = 16938
        self.numProbeRatings = 1408395
        self.numCustomers = 480189
        
        outputDir = PathDefaults.getOutputDir() + "recommend/netflix/"

        if not os.path.exists(outputDir): 
            # makedirs creates missing intermediate directories too; plain
            # mkdir fails when "recommend/" does not exist yet.
            os.makedirs(outputDir)
                
        self.ratingFileName = outputDir + "data.npz"  
        self.custDictFileName = outputDir + "custIdDict.pkl"
        self.probeFileName = PathDefaults.getDataDir() + "netflix/probe.txt"    
        self.testRatingsFileName = outputDir + "test_data.npz"
        self.isTrainRatingsFileName = outputDir + "is_train.npz"
        
        self.maxIter = maxIter 
        self.trainSplit = 4.0/5 

        self.processRatings()
        #self.processProbe()
        self.splitDataset()        
        self.loadProcessedData()
        
        if self.maxIter is not None: 
            logging.debug("Maximum number of iterations: " + str(self.maxIter))
コード例 #28
0
    def testPredict2(self):
        """Compare TreeRankForest train/test AUCs on Gauss2D against reference values."""
        dataDir = PathDefaults.getDataDir()

        # Load the training set and map labels from {0, 1} to {-1, +1}
        trainXY = numpy.loadtxt(dataDir + "Gauss2D_learn.csv", skiprows=1, usecols=(1,2,3), delimiter=",")
        X = trainXY[:, 0:2]
        y = trainXY[:, 2]*2 - 1

        testXY = numpy.loadtxt(dataDir + "Gauss2D_test.csv", skiprows=1, usecols=(1,2,3), delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2]*2 - 1

        X = Standardiser().standardiseArray(X)
        testX = Standardiser().standardiseArray(testX)

        numTrees = 5
        minSplit = 50
        maxDepths = range(3, 10)
        # Reference AUCs per max depth
        trainAucs = numpy.array([0.7252582, 0.7323278, 0.7350289, 0.7372529, 0.7399985, 0.7382176, 0.7395104, 0.7386347])
        testAucs = numpy.array([0.6806122, 0.6851614, 0.6886183, 0.6904147, 0.6897266, 0.6874600, 0.6875980, 0.6878801])

        #The results are approximately the same, but not exactly
        for i, maxDepth in enumerate(maxDepths):
            forest = TreeRankForest(self.leafRanklearner)
            forest.setMaxDepth(maxDepth)
            forest.setMinSplit(minSplit)
            forest.setNumTrees(numTrees)
            forest.learnModel(X, y)

            trainAuc = Evaluator.auc(forest.predict(X), y)
            testAuc = Evaluator.auc(forest.predict(testX), testY)
            print(trainAuc, testAuc)

            # Only compare to 1 decimal place since results vary slightly
            self.assertAlmostEquals(trainAuc, trainAucs[i], 1)
            self.assertAlmostEquals(testAuc, testAucs[i], 1)
コード例 #29
0
    def testPredict2(self):
        """
        Tune a class-weighted RandomForest to maximise AUC on the IGF1_0-Haar
        data, evaluated with cross validation.
        """
        dataDir = PathDefaults.getDataDir()
        fileName = dataDir + "IGF1_0-Haar.npy"

        # Last column of the saved array holds the labels
        XY = numpy.load(fileName)
        X = XY[:, 0:XY.shape[1]-1]
        y = XY[:, XY.shape[1]-1].ravel()

        # Weight the positive class by the fraction of zero labels.
        # Use builtin int: numpy.int was deprecated and removed in numpy>=1.24.
        weight = numpy.bincount(numpy.array(y, int))[0]/float(y.shape[0])

        folds = 3
        randomForest = RandomForest()
        randomForest.setWeight(weight)
        randomForest.setMaxDepth(50)
        mean, var = randomForest.evaluateCv(X, y, folds, Evaluator.auc)
        logging.debug("AUC = " + str(mean))
        logging.debug("Var = " + str(var))
コード例 #30
0
ファイル: BenchmarkExp.py プロジェクト: malcolmreynolds/APGL
def computeLearningRates(datasetNames, numProcesses, fileNameSuffix, learnerName, sampleSizes, foldsSet): 
    """
    Estimate learning-rate (beta) grids for every dataset, sample size and
    realisation, saving one .npz file per dataset. A file lock prevents
    concurrent processes from repeating work on the same dataset.
    """
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    learner, loadMethod, dataDir, outputDir, paramDict = getSetup(learnerName, dataDir, outputDir, numProcesses)

    for dataset in datasetNames:
        datasetName = dataset[0]
        numRealisations = dataset[1]
        logging.debug("Learning using dataset " + datasetName)
        outfileName = outputDir + datasetName + fileNameSuffix

        fileLock = FileLock(outfileName + ".npz")
        # Skip datasets another process is handling or has already finished
        if fileLock.isLocked() or fileLock.fileExists():
            continue
        fileLock.lock()

        # Result grid: realisations x sample sizes x parameter grid shape
        gridShape = [numRealisations, sampleSizes.shape[0]]
        gridShape.extend(list(learner.gridShape(paramDict)))
        betaGrids = numpy.zeros(tuple(gridShape))

        for k in range(sampleSizes.shape[0]):
            sampleSize = sampleSizes[k]
            logging.debug("Using sample size " + str(sampleSize))

            for j in range(numRealisations):
                Util.printIteration(j, 1, numRealisations, "Realisation: ")
                trainX, trainY, testX, testY = loadMethod(dataDir, datasetName, j)

                # Deterministically subsample the training set to sampleSize
                numpy.random.seed(21)
                trainInds = numpy.random.permutation(trainX.shape[0])[0:sampleSize]
                validX = trainX[trainInds, :]
                validY = trainY[trainInds]

                betaGrids[j, k, :] = learner.learningRate(validX, validY, foldsSet, paramDict)

        numpy.savez(outfileName, betaGrids)
        logging.debug("Saved results as file " + outfileName + ".npz")
        fileLock.unlock()
コード例 #31
0
    def testReadFile(self): 
        """Check EgoCsvReader.readFile against precomputed values for TestData.csv."""
        eCsvReader = EgoCsvReader()
        fileName = PathDefaults.getDataDir() + "test/TestData.csv"
        # (question id, column offset) pairs selecting the fields to read
        questionIds = [("Q14", 0), ("Q12", 1), ("Q2", 0)]
        missing = 1

        (X, titles) = eCsvReader.readFile(fileName, questionIds, missing)

        # Expected values for the 10 rows of TestData.csv
        expectedX = numpy.array([
            [0.621903386, 0.608560354, 0.33290608],
            [0.318548924, 0.402390713, 0.129956291],
            [0.956658404, 0.344317772, 0.680386616],
            [0.267607668, 0.119647983, 0.116893619],
            [0.686589498, 0.402390713, 0.426789174],
            [0.373575769, 0.025846789, 0.797125005],
            [0.493793948, 0.402390713, 0.990507109],
            [0.524534585, 0.525169385, 0.772917183],
            [0.339055395, 0.402390713, 0.684788001],
            [0.997774183, 0.790801992, 0.643252009]])

        self.assertAlmostEquals(numpy.linalg.norm(X - expectedX), 0, places=6)
コード例 #32
0
    def testReadGraph(self):
        """
        Read a multigraph from a vertex CSV file plus two edge CSV files and
        verify the vertex values and edge lists, first for the undirected and
        then for the directed case.
        """
        dir = PathDefaults.getDataDir()
        vertexFileName = dir + "test/deggraf10.csv"
        edgeFileNames = [dir + "test/testEdges1.csv", dir + "test/testEdges2.csv"]

        #Converters map the quoted categorical CSV fields to integer codes
        def genderConv(x):
            genderDict = {'"M"': 0, '"F"': 1}
            return genderDict[x]

        def orientConv(x):
            orientDict = {'"HT"': 0, '"HB"': 1}
            return orientDict[x]

        def fteConv(x):
            fteDict = {'"INTER"': 0, '"CONTA"': 1}
            return fteDict[x]

        def provConv(x):
            provDict = {'"CH"': 0, '"SC"': 1, '"SS"': 2, '"LH"' : 3, '"GM"' : 4}
            return provDict[x]

        converters = {3: genderConv, 4: orientConv, 5:fteConv, 6:provConv}

        #Column 0 holds the vertex id; columns 1-10 hold its features
        idIndex = 0
        featureIndices = list(range(1,11))
        multiGraphCsvReader = MultiGraphCsvReader(idIndex, featureIndices, converters)
        sparseMultiGraph = multiGraphCsvReader.readGraph(vertexFileName, edgeFileNames)

        #Expected feature rows for the 10 vertices of deggraf10.csv
        vertexValues = numpy.zeros((10, 10))
        vertexValues[0, :] = numpy.array([1986, 32, 0, 0, 0, 0, 0, 3, 3, 1])
        vertexValues[1, :] = numpy.array([1986, 27, 0, 0, 0, 1, 0, 4, 4, 1])
        vertexValues[2, :] = numpy.array([1986, 20, 0, 0, 0, 1, 0, 1, 1, 0])
        vertexValues[3, :] = numpy.array([1986, 20, 0, 0, 0, 1, 0, 2, 2, 0])
        vertexValues[4, :] = numpy.array([1986, 20, 0, 0, 0, 2, 0, 5, 5, 0])
        vertexValues[5, :] = numpy.array([1986, 28, 0, 0, 0, 3, 0, 1, 1, 1])
        vertexValues[6, :] = numpy.array([1986, 26, 1, 0, 1, 3, 6, 1, 1, 1])
        vertexValues[7, :] = numpy.array([1986, 35, 0, 0, 0, 2, 0, 0, 0, 0])
        vertexValues[8, :] = numpy.array([1986, 37, 0, 1, 0, 3, 0, 5, 3, 0])
        vertexValues[9, :] = numpy.array([1986, 40, 0, 1, 0, 4, 0, 3, 3, 0])

        #Check if the values of the vertices are correct 
        for i in range(sparseMultiGraph.getNumVertices()):
            self.assertTrue((sparseMultiGraph.getVertex(i) == vertexValues[i]).all())

        #Now check edges: each row appears to be (vertex1, vertex2, edge-set index)
        edges = numpy.zeros((10, 3))
        edges[0, :] = numpy.array([4, 0, 0])
        edges[1, :] = numpy.array([4, 1, 0])
        edges[2, :] = numpy.array([5, 3, 0])
        edges[3, :] = numpy.array([7, 1, 0])
        edges[4, :] = numpy.array([8, 0, 0])
        edges[5, :] = numpy.array([4, 1, 1])
        edges[6, :] = numpy.array([8, 1, 1])
        edges[7, :] = numpy.array([8, 2, 1])
        edges[8, :] = numpy.array([8, 4, 1])
        edges[9, :] = numpy.array([9, 0, 1])

        self.assertTrue((sparseMultiGraph.getAllEdges() == edges).all())

        #Now test directed graphs (third argument False means undirected=False)
        sparseMultiGraph = multiGraphCsvReader.readGraph(vertexFileName, edgeFileNames, False)

        for i in range(sparseMultiGraph.getNumVertices()):
            self.assertTrue((sparseMultiGraph.getVertex(i) == vertexValues[i]).all())

        #Directed edges keep the file's original orientation
        edges = numpy.zeros((10, 3))
        edges[0, :] = numpy.array([0, 4, 0])
        edges[1, :] = numpy.array([1, 7, 0])
        edges[2, :] = numpy.array([3, 5, 0])
        edges[3, :] = numpy.array([4, 1, 0])
        edges[4, :] = numpy.array([8, 0, 0])
        edges[5, :] = numpy.array([0, 9, 1])
        edges[6, :] = numpy.array([1, 8, 1])
        edges[7, :] = numpy.array([2, 8, 1])
        edges[8, :] = numpy.array([4, 1, 1])
        edges[9, :] = numpy.array([8, 4, 1])
        
        self.assertTrue((sparseMultiGraph.getAllEdges() == edges).all())
コード例 #33
0
    def testMDLGraphsReader(self):
        """
        Read two molecule graphs from an MDL file and check their vertex/edge
        counts and full edge sets.
        """
        reader = MDLGraphsReader()
        dir = PathDefaults.getDataDir()
        fileName = dir + "test/testGraphs.mdl"

        graphs = reader.readFromFile(fileName)
        self.assertEquals(len(graphs), 2)

        #Check the first graph
        self.assertEquals(graphs[0].getNumVertices(), 26)
        self.assertEquals(graphs[0].getNumEdges(), 28)

        #MDL files number atoms from 1; the graph vertices are numbered from 0
        def getEdge(graph, i, j):
            return graph.getEdge(i - 1, j - 1)

        self.assertEquals(getEdge(graphs[0], 1, 6), 1)
        self.assertEquals(getEdge(graphs[0], 1, 2), 1)
        self.assertEquals(getEdge(graphs[0], 1, 18), 1)
        self.assertEquals(getEdge(graphs[0], 2, 3), 1)
        self.assertEquals(getEdge(graphs[0], 2, 19), 1)
        self.assertEquals(getEdge(graphs[0], 3, 4), 1)
        self.assertEquals(getEdge(graphs[0], 3, 20), 1)
        self.assertEquals(getEdge(graphs[0], 4, 10), 1)
        self.assertEquals(getEdge(graphs[0], 4, 5), 1)
        self.assertEquals(getEdge(graphs[0], 5, 6), 1)
        self.assertEquals(getEdge(graphs[0], 5, 7), 1)
        self.assertEquals(getEdge(graphs[0], 6, 21), 1)
        self.assertEquals(getEdge(graphs[0], 7, 8), 1)
        self.assertEquals(getEdge(graphs[0], 7, 22), 1)
        self.assertEquals(getEdge(graphs[0], 8, 9), 1)
        self.assertEquals(getEdge(graphs[0], 8, 23), 1)
        self.assertEquals(getEdge(graphs[0], 9, 14), 1)
        self.assertEquals(getEdge(graphs[0], 9, 10), 1)
        self.assertEquals(getEdge(graphs[0], 10, 11), 1)
        self.assertEquals(getEdge(graphs[0], 11, 12), 1)
        self.assertEquals(getEdge(graphs[0], 11, 24), 1)
        self.assertEquals(getEdge(graphs[0], 12, 13), 1)
        self.assertEquals(getEdge(graphs[0], 12, 25), 1)
        self.assertEquals(getEdge(graphs[0], 13, 14), 1)
        self.assertEquals(getEdge(graphs[0], 13, 15), 1)
        self.assertEquals(getEdge(graphs[0], 14, 26), 1)
        self.assertEquals(getEdge(graphs[0], 15, 16), 1)
        self.assertEquals(getEdge(graphs[0], 15, 17), 1)

        #Check the second graph
        self.assertEquals(graphs[1].getNumVertices(), 19)
        self.assertEquals(graphs[1].getNumEdges(), 20)

        self.assertEquals(getEdge(graphs[1], 1, 10), 1)
        self.assertEquals(getEdge(graphs[1], 1, 2), 1)
        self.assertEquals(getEdge(graphs[1], 1, 14), 1)
        self.assertEquals(getEdge(graphs[1], 2, 3), 1)
        self.assertEquals(getEdge(graphs[1], 2, 15), 1)
        self.assertEquals(getEdge(graphs[1], 3, 8), 1)
        self.assertEquals(getEdge(graphs[1], 3, 4), 1)
        self.assertEquals(getEdge(graphs[1], 4, 5), 1)
        self.assertEquals(getEdge(graphs[1], 4, 16), 1)
        self.assertEquals(getEdge(graphs[1], 5, 6), 1)
        self.assertEquals(getEdge(graphs[1], 5, 17), 1)
        self.assertEquals(getEdge(graphs[1], 6, 7), 1)
        self.assertEquals(getEdge(graphs[1], 6, 18), 1)
        self.assertEquals(getEdge(graphs[1], 7, 8), 1)
        self.assertEquals(getEdge(graphs[1], 8, 9), 1)
        self.assertEquals(getEdge(graphs[1], 9, 10), 1)
        self.assertEquals(getEdge(graphs[1], 9, 11), 1)
        self.assertEquals(getEdge(graphs[1], 10, 19), 1)
        self.assertEquals(getEdge(graphs[1], 11, 12), 1)
        self.assertEquals(getEdge(graphs[1], 11, 13), 1)
コード例 #34
0
 def testGetDataDir(self):
     """Smoke test: print the configured data directory."""
     dataDir = PathDefaults.getDataDir()
     print(dataDir)
コード例 #35
0
    def testReadFromFile(self):
        """
        Read a small graph from a CSV edge list, checking vertices and edges
        for the undirected, directed, and no-vertex-information cases.
        """
        #Columns holding each endpoint's id and features: [id, f1, f2, f3, f4]
        vertex1Indices = [0, 2, 3, 4, 5]
        vertex2Indices = [1, 6, 7, 8, 9]

        #Converters map quoted categorical CSV fields to integer codes
        def genderConv(x):
            genderDict = {'"M"': 0, '"F"': 1}
            return genderDict[x]

        def orientConv(x):
            orientDict = {'"HT"': 0, '"HB"': 1}
            return orientDict[x]

        converters = {2: genderConv, 6: genderConv, 3:orientConv, 7:orientConv}

        csvGraphReader = CsvGraphReader(vertex1Indices, vertex2Indices, converters)

        dir = PathDefaults.getDataDir()
        fileName = dir + "test/infect5.csv"

        graph = csvGraphReader.readFromFile(fileName)

        #Check vertex feature vectors read from infect5.csv
        self.assertTrue((graph.getVertex(0) == numpy.array([0, 0, 28, 1])).all())
        self.assertTrue((graph.getVertex(1) == numpy.array([1, 0, 26, 1])).all())
        self.assertTrue((graph.getVertex(2) == numpy.array([0, 1, 42, 2])).all())
        self.assertTrue((graph.getVertex(3) == numpy.array([1, 0, 33, 1])).all())
        self.assertTrue((graph.getVertex(4) == numpy.array([0, 1, 35, 37])).all())

        self.assertTrue(graph.getEdge(0, 1) == 1)
        self.assertTrue(graph.getEdge(2, 3) == 1)
        self.assertTrue(graph.getEdge(4, 6) == 1)
        self.assertTrue(graph.getEdge(6, 7) == 1)
        self.assertTrue(graph.getEdge(5, 8) == 1)

        self.assertEquals(graph.getNumEdges(), 5)
        self.assertTrue(graph.isUndirected())

        #Test a directed graph: reversed edges must now be absent
        csvGraphReader = CsvGraphReader(vertex1Indices, vertex2Indices, converters, undirected=False)
        graph = csvGraphReader.readFromFile(fileName)

        self.assertTrue(graph.getEdge(1, 0) == None)
        self.assertTrue(graph.getEdge(3, 2) == None)
        self.assertTrue(graph.getEdge(6, 4) == None)
        self.assertTrue(graph.getEdge(7, 6) == None)
        self.assertTrue(graph.getEdge(8, 5) == None)

        self.assertEquals(graph.getNumEdges(), 5)
        self.assertFalse(graph.isUndirected())

        #Test graph with no vertex information (only the two id columns)
        vertex1Indices = [0]
        vertex2Indices = [1]
        fileName = dir + "test/infect5-0.csv"
        csvGraphReader = CsvGraphReader(vertex1Indices, vertex2Indices, {})
        graph = csvGraphReader.readFromFile(fileName)

        self.assertTrue(graph.getEdge(0, 1) == 1)
        self.assertTrue(graph.getEdge(2, 3) == 1)
        self.assertTrue(graph.getEdge(4, 6) == 1)
        self.assertTrue(graph.getEdge(6, 7) == 1)
        self.assertTrue(graph.getEdge(5, 8) == 1)

        self.assertEquals(graph.getNumEdges(), 5)
        self.assertTrue(graph.isUndirected())
        self.assertEquals(graph.getVertexList().getNumFeatures(), 0)