Python PathDefaults.getOutputDir 예제들, apgl.util.PathDefaults.PathDefaults.getOutputDir Python 예제들

예제 #1

0

파일 보기

파일: ExtractAuthors.py 프로젝트: malcolmreynolds/APGL

def saveRatingMatrix(): 
    """
    Re-index the coauthor graph from 0 and save it in matrix market format.

    Reads the comma separated edge list "erasm/edges2.txt" from the output
    directory, maps every vertex id to a contiguous index in order of first
    appearance, builds a symmetric sparse matrix of edge counts, keeps only
    the vertices whose row has at least ``minCoauthors`` non zero entries and
    writes the result to "erasm/R" in the output directory.
    """
    edgeFileName = PathDefaults.getOutputDir() + "erasm/edges2.txt"

    logging.debug("Reading edge list")
    # numpy.int was only an alias of the builtin int and no longer exists in
    # recent NumPy releases. ndmin=2 keeps a one-line file as a (1, 2) array.
    edges = numpy.loadtxt(edgeFileName, delimiter=",", dtype=int, ndmin=2)
    logging.debug("Total number of edges: " + str(edges.shape[0]))

    # Assign indices 0, 1, 2, ... to vertex ids in order of first appearance
    vertexIdDict = {}
    for edge in edges:
        for vertexId in (edge[0], edge[1]):
            if vertexId not in vertexIdDict:
                vertexIdDict[vertexId] = len(vertexIdDict)

    n = len(vertexIdDict)
    R = scipy.sparse.lil_matrix((n, n))
    logging.debug("Creating sparse matrix")

    # Each edge is counted in both directions so that R is symmetric
    for edge in edges:
        rowInd = vertexIdDict[edge[0]]
        colInd = vertexIdDict[edge[1]]
        R[rowInd, colInd] += 1
        R[colInd, rowInd] += 1

    logging.debug("Created matrix " + str(R.shape) + " with " + str(R.getnnz()) + " non zeros")

    R = R.tocsr()

    minCoauthors = 20
    logging.debug("Removing vertices with <" + str(minCoauthors) + " coauthors")
    # Count the non zero entries of every row. minlength makes the count
    # vector exactly one entry per vertex, so the boolean mask below has the
    # same length as the index array it selects from.
    rowInds = R.nonzero()[0]
    coauthorCounts = numpy.bincount(rowInds, minlength=R.shape[0])
    inds = numpy.arange(R.shape[0])[coauthorCounts >= minCoauthors]
    R = R[inds, :][:, inds]
    logging.debug("Matrix has shape " + str(R.shape) + " with " + str(R.getnnz()) + " non zeros")

    matrixFileName = PathDefaults.getOutputDir() + "erasm/R"
    scipy.io.mmwrite(matrixFileName, R)
    logging.debug("Wrote matrix to file " + matrixFileName)

예제 #2

0

파일 보기

파일: MendeleyGroupsDataset.py 프로젝트: pierrebo/wallhack

    def __init__(self, maxIter=None, iterStartTimeStamp=None): 
        """
        Configure file locations and parameters, then process, split and load
        the data.

        :param maxIter: stored as ``self.maxIter``.
        :param iterStartTimeStamp: starting timestamp of the iterator; when
            None, 1286229600 is used.
        """
        outputDir = PathDefaults.getOutputDir() + "recommend/erasm/"

        # os.makedirs also creates a missing "recommend/" parent directory,
        # where os.mkdir would raise an OSError.
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        # iterStartTimeStamp is the starting time of the iterator
        if iterStartTimeStamp is not None:
            self.iterStartTimeStamp = iterStartTimeStamp
        else:
            self.iterStartTimeStamp = 1286229600

        # 30 days expressed in seconds
        self.timeStep = timedelta(30).total_seconds()

        self.ratingFileName = outputDir + "data.npz"
        self.userDictFileName = outputDir + "userIdDict.pkl"
        self.groupDictFileName = outputDir + "groupIdDict.pkl"
        self.isTrainRatingsFileName = outputDir + "is_train.npz"

        self.dataDir = PathDefaults.getDataDir() + "erasm/"
        self.dataFileName = self.dataDir + "groupMembers-29-11-12"

        self.maxIter = maxIter
        self.trainSplit = 4.0/5

        self.processRatings()
        self.splitDataset()
        self.loadProcessedData()

예제 #3

0

파일 보기

파일: ContactsDataset.py 프로젝트: pierrebo/wallhack

    def __init__(self, maxIter=None, iterStartTimeStamp=None):
        """
        Configure file locations and parameters, then process, split and load
        the ratings. According to the original docstring the training and
        test sets are based on the time each rating was made (that docstring
        named movielens, presumably copied from another dataset class).

        :param maxIter: stored as ``self.maxIter`` and logged when not None.
        :param iterStartTimeStamp: starting timestamp of the iterator; when
            None, 789652009 is used.
        """
        # 30 days expressed in seconds
        self.timeStep = timedelta(30).total_seconds()

        # iterStartTimeStamp is the starting time of the iterator
        if iterStartTimeStamp is not None:
            self.iterStartTimeStamp = iterStartTimeStamp
        else:
            self.iterStartTimeStamp = 789652009

        outputDir = PathDefaults.getOutputDir() + "recommend/erasm/"

        self.numRatings = 402872
        self.minContacts = 10

        # os.makedirs also creates a missing "recommend/" parent directory,
        # where os.mkdir would raise an OSError.
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        self.ratingFileName = outputDir + "data.npz"
        self.userDictFileName = outputDir + "userIdDict.pkl"
        self.isTrainRatingsFileName = outputDir + "is_train.npz"

        self.maxIter = maxIter
        self.trainSplit = 4.0 / 5

        self.processRatings()
        self.splitDataset()
        self.loadProcessedData()

        if self.maxIter is not None:
            logging.debug("Maximum number of iterations: " + str(self.maxIter))

예제 #4

0

파일 보기

파일: PajekWriterTest.py 프로젝트: pombredanne/APGL

    def testWriteToFile3(self):
        """
        Write two randomly generated 20-vertex graphs (Erdos-Renyi, then
        small world) to Pajek files in the "test/" output directory.
        """
        # 20 vertices without features
        graph = SparseGraph(VertexList(20, 0))

        # Erdos-Renyi generator with parameter p = 0.1
        graph = ErdosRenyiGenerator(0.1).generate(graph)

        writer = PajekWriter()
        outputPrefix = PathDefaults.getOutputDir() + "test/"
        writer.writeToFile(outputPrefix + "erdosRenyi20", graph)

        # Reuse the same graph object for a small world graph (p = 0.2, k = 3)
        graph.removeAllEdges()
        graph = SmallWorldGenerator(0.2, 3).generate(graph)

        writer.writeToFile(outputPrefix + "smallWorld20", graph)

예제 #5

0

파일 보기

파일: RecommendExpHelper.py 프로젝트: pierrebo/wallhack

    def __init__(self, trainXIteratorFunc, testXIteratorFunc, cmdLine=None, defaultAlgoArgs = None, dirName=""):
        """
        Set up the experiment helper.

        Priority of argument values, as stated by the original author:
         - highest: command-line value
         - middle: set-by-function value
         - lowest: class value
        """
        # Algorithm parameters: class defaults merged with the user's defaults
        mergedArgs = RecommendExpHelper.newAlgoParams(defaultAlgoArgs)
        self.algoArgs = mergedArgs

        # Functions returning iterators over the training and test matrices
        self.trainXIteratorFunc = trainXIteratorFunc
        self.testXIteratorFunc = testXIteratorFunc

        # How often to print output
        self.logStep = 10

        # The max number of observations to use for model selection
        self.sampleSize = 5 * 10 ** 6

        # Base directory for results
        outputRoot = PathDefaults.getOutputDir()
        self.resultsDir = outputRoot + "recommend/" + dirName + "/"

        # Finally update the algorithm parameters from the command line
        self.readAlgoParams(cmdLine)

예제 #6

0

파일 보기

파일: PajekWriterTest.py 프로젝트: charanpald/APGL

    def testWriteToFile3(self):
        """
        Generate an Erdos-Renyi graph and a small world graph on 20 vertices
        and write each of them to a Pajek file under "test/".
        """
        vertexCount, featureCount = 20, 0
        graph = SparseGraph(VertexList(vertexCount, featureCount))

        erdosRenyi = ErdosRenyiGenerator(0.1)
        graph = erdosRenyi.generate(graph)

        pajekWriter = PajekWriter()
        testDir = PathDefaults.getOutputDir() + "test/"
        pajekWriter.writeToFile(testDir + "erdosRenyi20", graph)

        # Now clear the edges and write a small world graph (p = 0.2, k = 3)
        graph.removeAllEdges()
        smallWorld = SmallWorldGenerator(0.2, 3)
        graph = smallWorld.generate(graph)

        pajekWriter.writeToFile(testDir + "smallWorld20", graph)

예제 #7

0

파일 보기

파일: MetabolomicsExp.py 프로젝트: malcolmreynolds/APGL

    def __init__(self, YList, X, featuresName, ages, args):
        """
        Store the data and experiment settings and build the lists of leaf
        rank generators used by the experiment.

        :param YList: the list of concentrations
        :param X: stored as ``self.X``
        :param featuresName: stored as ``self.featuresName``
        :param ages: stored as ``self.ages``
        :param args: passed to the parent constructor and stored as ``self.args``
        """
        super(MetabolomicsExpRunner, self).__init__(args=args)
        self.X = X
        self.YList = YList
        self.featuresName = featuresName
        self.args = args
        self.ages = ages

        # Experiment settings
        self.maxDepth = 10
        self.numTrees = 10
        self.sampleSize = 1.0
        self.sampleReplace = True
        self.folds = 5
        self.resultsDir = PathDefaults.getOutputDir() + "metabolomics/"

        # Pairs of (generator, name); the generators are created here
        self.leafRankGenerators = [
            (LinearSvmGS.generate(), "SVM"),
            (SvcGS.generate(), "RBF-SVM"),
            (DecisionTree.generate(), "CART"),
        ]

        self.pcaLeafRankGenerators = [(LinearSvmPca.generate(), "LinearSVM-PCA")]

        # Pairs of (generate function, name); the functions are not called here
        self.funcLeafRankGenerators = [
            (LinearSvmFGs.generate, "SVMF"),
            (SvcFGs.generate, "RBF-SVMF"),
            (DecisionTreeF.generate, "CARTF"),
        ]

        # Store all the label vectors and their missing values
        igf1Inds, cortisolInds, testoInds = MetabolomicsUtils.createIndicatorLabels(YList)
        self.hormoneInds = [igf1Inds, cortisolInds, testoInds]
        self.hormoneNames = MetabolomicsUtils.getLabelNames()

예제 #8

0

파일 보기

파일: SimpleGraphWriterTest.py 프로젝트: pombredanne/APGL

    def testWriteToFile(self):
        """
        Write the two dictionary graphs of the fixture to the "test/" output
        directory. Nothing is asserted; the files have to be checked by hand.
        """
        writer = SimpleGraphWriter()
        testDir = PathDefaults.getOutputDir() + "test/"

        writer.writeToFile(testDir + "dictTestUndirected", self.dctGraph1)
        writer.writeToFile(testDir + "dictTestDirected", self.dctGraph2)

예제 #9

0

파일 보기

파일: SvmInfoExperiment.py 프로젝트: malcolmreynolds/APGL

    def getOutputFileName(graphType, p, k, infoProb):
        """
        Build the output file name for a given graph type and parameters.

        The name encodes graphType, p and infoProb (as "q"); k is included
        only for "SmallWorld" graphs. Any graph type other than "SmallWorld"
        or "ErdosRenyi" raises a ValueError.
        """
        outputDirectory = PathDefaults.getOutputDir()

        if graphType not in ("SmallWorld", "ErdosRenyi"):
            raise ValueError("Invalid graph type: " + graphType)

        name = outputDirectory + "SvmEgoOutput_type=" + graphType + "_p=" + str(p)
        if graphType == "SmallWorld":
            name = name + "_k=" + str(k)

        return name + "_q=" + str(infoProb)

예제 #10

0

파일 보기

파일: MetabolomicsRegExp.py 프로젝트: malcolmreynolds/APGL

    def __init__(self, df, X, featuresName, ages, args):
        """
        Store the inputs and derive the label names, label list, bounds and
        results directory used by the regression experiment.
        """
        super(MetabolomicsRegExpRunner, self).__init__(args=args)
        self.df, self.X = df, X
        self.featuresName = featuresName
        self.args = args
        self.ages = ages

        # Labels and their bounds come from MetabolomicsUtils
        labelNames = MetabolomicsUtils.getLabelNames()
        self.labelNames = labelNames
        self.YList = MetabolomicsUtils.createLabelList(df, labelNames)
        self.boundsList = MetabolomicsUtils.getBounds()

        outputRoot = PathDefaults.getOutputDir()
        self.resultsDir = outputRoot + "metabolomics/"

예제 #11

0

파일 보기

파일: ProcessResults.py 프로젝트: pierrebo/wallhack

def loadParams(ind): 
    """
    Load the directories and simulation/ABC parameters for either the real
    or the toy experiment, depending on the module-level flag processReal.

    :param ind: index used in the real results directory name and passed to
        the real parameter functions; unused for the toy experiment.
    :return: the tuple (N, resultsDir, outputDir, recordStep, startDate,
        endDate, prefix, targetGraph, breakSize, numEpsilons, M, matchAlpha,
        matchAlg, numInds)
    """
    if processReal:
        resultsDir = PathDefaults.getOutputDir() + "viroscopy/real/theta" + str(ind) + "/"
        outputDir = resultsDir + "stats/"

        abcParams = HIVModelUtils.realABCParams(True)
        N, matchAlpha, breakScale, numEpsilons, _epsilon, _minEpsilon, matchAlg, _abcMaxRuns, _batchSize, _pertScale = abcParams
        startDate, endDate, recordStep, M, targetGraph, _numInds = HIVModelUtils.realSimulationParams(test=True, ind=ind)
        # The theta values are loaded but not used by this function
        _realTheta, _sigmaTheta, _pertTheta = HIVModelUtils.estimatedRealTheta(ind)
        # The value returned by realSimulationParams is replaced by 2
        numInds = 2
        prefix = "Real"
    else:
        resultsDir = PathDefaults.getOutputDir() + "viroscopy/toy/theta/"
        outputDir = resultsDir + "stats/"

        abcParams = HIVModelUtils.toyABCParams()
        N, matchAlpha, breakScale, numEpsilons, _epsilon, _minEpsilon, matchAlg, _abcMaxRuns, _batchSize, _pertScale = abcParams
        startDate, endDate, recordStep, M, targetGraph = HIVModelUtils.toySimulationParams(test=True)
        # The theta values are loaded but not used by this function
        _realTheta, _sigmaTheta, _pertTheta = HIVModelUtils.toyTheta()
        prefix = "Toy"
        numInds = 1

    # Difference in removed-subgraph size between endDate and startDate,
    # scaled by breakScale
    sizeAtEnd = targetGraph.subgraph(targetGraph.removedIndsAt(endDate)).size
    sizeAtStart = targetGraph.subgraph(targetGraph.removedIndsAt(startDate)).size
    breakSize = (sizeAtEnd - sizeAtStart) * breakScale

    return N, resultsDir, outputDir, recordStep, startDate, endDate, prefix, targetGraph, breakSize, numEpsilons, M, matchAlpha, matchAlg, numInds

예제 #12

0

파일 보기

    def testWriteToFile(self):
        """
        Fill a DictGraph with 5 random 3-dimensional vertex values and write
        the vertices to "test/vertices" in the output directory.
        """
        graph = DictGraph()

        # One row of random features per vertex
        V = numpy.random.rand(5, 3)
        for vertexInd, features in enumerate(V):
            graph.setVertex(vertexInd, features)

        writer = CsvVertexWriter()
        writer.writeToFile(PathDefaults.getOutputDir() + "test/vertices", graph)

        logging.debug(V)

예제 #13

0

파일 보기

파일: CsvVertexWriterTest.py 프로젝트: charanpald/APGL

    def testWriteToFile(self):
        """
        Write a DictGraph holding random vertex values (5 vertices with 3
        features each) to the file "test/vertices" in the output directory.
        """
        vertexCount, featureCount = 5, 3
        V = numpy.random.rand(vertexCount, featureCount)

        graph = DictGraph()
        for index, row in enumerate(V):
            graph.setVertex(index, row)

        targetFile = PathDefaults.getOutputDir() + "test/vertices"
        CsvVertexWriter().writeToFile(targetFile, graph)

        logging.debug(V)

예제 #14

0

파일 보기

파일: ClusterExpHelper.py 프로젝트: malcolmreynolds/APGL

    def __init__(self, iteratorFunc, cmdLine=None, defaultAlgoArgs = None, dirName=""):
        """
        Set up the clustering experiment helper: merge the algorithm
        parameters, remember the iterator function, choose the results
        directory and finally read parameters from the command line.
        """
        # Algorithm parameters: class defaults merged with the user's defaults
        mergedArgs = ClusterExpHelper.newAlgoParams(defaultAlgoArgs)
        self.algoArgs = mergedArgs

        # Function related to the dataset
        self.getIteratorFunc = iteratorFunc

        # How often to print output
        self.logStep = 10

        # Base directory for results
        outputRoot = PathDefaults.getOutputDir()
        self.resultsDir = outputRoot + "cluster/" + dirName + "/"

        # Update the algorithm parameters from the command line
        self.readAlgoParams(cmdLine)

예제 #15

0

파일 보기

파일: NetflixDataset.py 프로젝트: pierrebo/wallhack

    def __init__(self, maxIter=None, iterStartTimeStamp=None): 
        """
        Return a training and test set for netflix based on the time each 
        rating was made. There are 62 iterations. 

        :param maxIter: stored as ``self.maxIter`` and logged when not None.
        :param iterStartTimeStamp: starting timestamp of the iterator; when
            None, the local-time timestamp of 1 January 2001 is used.
        """ 
        # 30 days expressed in seconds
        self.timeStep = timedelta(30).total_seconds()

        # iterStartTimeStamp is the starting time of the iterator
        if iterStartTimeStamp is not None:
            self.iterStartTimeStamp = iterStartTimeStamp
        else:
            self.iterStartTimeStamp = time.mktime(datetime(2001,1,1).timetuple())

        self.startMovieID = 1
        self.endMovieID = 17770

        self.numMovies = 17770
        self.numRatings = 100480507
        self.numProbeMovies = 16938
        self.numProbeRatings = 1408395
        self.numCustomers = 480189

        outputDir = PathDefaults.getOutputDir() + "recommend/netflix/"

        # os.makedirs also creates a missing "recommend/" parent directory,
        # where os.mkdir would raise an OSError.
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        self.ratingFileName = outputDir + "data.npz"
        self.custDictFileName = outputDir + "custIdDict.pkl"
        self.probeFileName = PathDefaults.getDataDir() + "netflix/probe.txt"
        self.testRatingsFileName = outputDir + "test_data.npz"
        self.isTrainRatingsFileName = outputDir + "is_train.npz"

        self.maxIter = maxIter
        self.trainSplit = 4.0/5

        self.processRatings()
        # NOTE: self.processProbe() was disabled by the original author
        #self.processProbe()
        self.splitDataset()
        self.loadProcessedData()

        if self.maxIter is not None:
            logging.debug("Maximum number of iterations: " + str(self.maxIter))

예제 #16

0

파일 보기

파일: RecommendExp.py 프로젝트: malcolmreynolds/APGL

def recommend(learner): 
    """
    Estimate the cross validated test error of a learner on the "Toy" matrix.

    Reads the matrix market file "erasm/Toy" from the output directory,
    keeps its first 50 rows, preprocesses it and splits the non zero entries
    into 5 random folds. For every fold computeTestError is run in a worker
    process on the (train, test, learner) triple, and the mean of the
    resulting errors is saved to "erasm/results_<learner name>".

    According to the original docstring X_ij = k means author i has worked
    with author j, k times.

    :param learner: passed to computeTestError; learner.name() is used in the
        results file name.
    """
    outputDir = PathDefaults.getOutputDir() + "erasm/"
    matrixFileName = outputDir + "Toy"

    numExamples = 50
    numFolds = 5

    X = scipy.sparse.csr_matrix(scipy.io.mmread(matrixFileName))
    logging.debug("Loaded matrix " + str(X.shape) + " with " + str(X.getnnz()) + " non zeros")
    X = X[0:numExamples, :]
    X, maxS = preprocess(X)

    # Take out some ratings to form a training set
    rowInds, colInds = X.nonzero()
    randInds = numpy.random.permutation(rowInds.shape[0])
    indexList = Sampling.crossValidation(numFolds, rowInds.shape[0])

    paramList = []
    for trnIdx, tstIdx in indexList:
        trainInds = randInds[trnIdx]
        testInds = randInds[tstIdx]

        trainX = SparseUtils.selectMatrix(X, rowInds[trainInds], colInds[trainInds]).tocsr()
        testX = SparseUtils.selectMatrix(X, rowInds[testInds], colInds[testInds]).tocsr()

        paramList.append((trainX, testX, learner))

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    try:
        results = pool.map(computeTestError, paramList)
    finally:
        # Release the worker processes even when a fold fails
        pool.close()
        pool.join()

    testErrors = numpy.array(results)
    meanTestErrors = testErrors.mean()
    logging.debug("Test errors = " + str(meanTestErrors))

    errorFileName = outputDir + "results_" + learner.name()
    numpy.savez(errorFileName, meanTestErrors)
    logging.debug("Saved results as " + errorFileName)

예제 #17

0

파일 보기

파일: BenchmarkExp.py 프로젝트: malcolmreynolds/APGL

def computeLearningRates(datasetNames, numProcesses, fileNameSuffix, learnerName, sampleSizes, foldsSet): 
    """
    Compute learning rate grids for every dataset and save them as .npz files.

    A dataset is skipped when its output file is locked or already exists.
    Otherwise the file is locked, learner.learningRate is evaluated for every
    sample size and realisation on a fixed-seed random subsample of the
    training data, and the resulting array is saved.

    :param datasetNames: sequence of entries whose first element is the
        dataset name and whose second element is the number of realisations
    :param numProcesses: passed to getSetup
    :param fileNameSuffix: appended to the dataset name to form the output file name
    :param learnerName: passed to getSetup to select the learner
    :param sampleSizes: numpy array of sample sizes
    :param foldsSet: passed to learner.learningRate
    """
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    # getSetup returns the directories that are actually used below
    learner, loadMethod, dataDir, outputDir, paramDict = getSetup(learnerName, dataDir, outputDir, numProcesses)

    for datasetEntry in datasetNames:
        datasetName = datasetEntry[0]
        logging.debug("Learning using dataset " + datasetName)
        outfileName = outputDir + datasetName + fileNameSuffix

        fileLock = FileLock(outfileName + ".npz")
        if fileLock.isLocked() or fileLock.fileExists():
            continue

        fileLock.lock()
        try:
            numRealisations = datasetEntry[1]
            gridShape = (numRealisations, sampleSizes.shape[0]) + tuple(learner.gridShape(paramDict))
            betaGrids = numpy.zeros(gridShape)

            for k, sampleSize in enumerate(sampleSizes):
                logging.debug("Using sample size " + str(sampleSize))
                for j in range(numRealisations):
                    Util.printIteration(j, 1, numRealisations, "Realisation: ")
                    trainX, trainY, testX, testY = loadMethod(dataDir, datasetName, j)

                    # The seed is reset so that every realisation uses the same permutation
                    numpy.random.seed(21)
                    trainInds = numpy.random.permutation(trainX.shape[0])[0:sampleSize]
                    validX = trainX[trainInds,:]
                    validY = trainY[trainInds]

                    betaGrids[j, k, :] = learner.learningRate(validX, validY, foldsSet, paramDict)

            numpy.savez(outfileName, betaGrids)
            logging.debug("Saved results as file " + outfileName + ".npz")
        finally:
            # Release the lock even when the computation fails
            fileLock.unlock()

예제 #18

0

파일 보기

파일: PajekWriterTest.py 프로젝트: charanpald/APGL

    def testWriteToFile(self):
        """
        Write every fixture graph (dense, sparse and dictionary based, each
        undirected and directed) to a Pajek file under "test/". Nothing is
        asserted; the files have to be checked by hand.
        """
        writer = PajekWriter()
        testDir = PathDefaults.getOutputDir() + "test/"

        writer.writeToFile(testDir + "denseTestUndirected", self.dGraph1)
        writer.writeToFile(testDir + "denseTestDirected", self.dGraph2)
        writer.writeToFile(testDir + "sparseTestUndirected", self.sGraph1)
        writer.writeToFile(testDir + "sparseTestDirected", self.sGraph2)
        writer.writeToFile(testDir + "dictTestUndirected", self.dctGraph1)
        writer.writeToFile(testDir + "dictTestDirected", self.dctGraph2)

예제 #19

0

파일 보기

파일: PajekWriterTest.py 프로젝트: pombredanne/APGL

    def testWriteToFile(self):
        """
        Write the six fixture graphs to Pajek files in the "test/" output
        directory. The files have to be checked by hand.
        """
        pajekWriter = PajekWriter()
        outputPrefix = PathDefaults.getOutputDir() + "test/"

        # Dense graphs
        pajekWriter.writeToFile(outputPrefix + "denseTestUndirected", self.dGraph1)
        pajekWriter.writeToFile(outputPrefix + "denseTestDirected", self.dGraph2)

        # Sparse graphs
        pajekWriter.writeToFile(outputPrefix + "sparseTestUndirected", self.sGraph1)
        pajekWriter.writeToFile(outputPrefix + "sparseTestDirected", self.sGraph2)

        # Dictionary graphs
        pajekWriter.writeToFile(outputPrefix + "dictTestUndirected", self.dctGraph1)
        pajekWriter.writeToFile(outputPrefix + "dictTestDirected", self.dctGraph2)

예제 #20

0

파일 보기

파일: PajekWriterTest.py 프로젝트: pombredanne/APGL

    def testWriteToFile2(self):
        """
        Write self.dGraph1 four times, each time with exactly one custom
        vertex/edge colour or size function installed on the writer. Every
        function is reset to None after its file has been written.
        """
        pw = PajekWriter()
        directory = PathDefaults.getOutputDir() + "test/"

        colours = ["grey05", "grey10", "grey15", "grey20", "grey25"]

        def setVertexColour(vertexIndex, graph):
            return colours[vertexIndex]

        def setVertexSize(vertexIndex, graph):
            return vertexIndex

        def setEdgeColour(vertexIndex1, vertexIndex2, graph):
            return colours[vertexIndex1]

        def setEdgeSize(vertexIndex1, vertexIndex2, graph):
            return vertexIndex1 + vertexIndex2

        pw.setVertexColourFunction(setVertexColour)
        fileName1 = directory + "vertexColourTest"
        pw.writeToFile(fileName1, self.dGraph1)
        pw.setVertexColourFunction(None)

        pw.setVertexSizeFunction(setVertexSize)
        fileName1 = directory + "vertexSizeTest"
        pw.writeToFile(fileName1, self.dGraph1)
        pw.setVertexSizeFunction(None)

        pw.setEdgeColourFunction(setEdgeColour)
        fileName1 = directory + "edgeColourTest"
        pw.writeToFile(fileName1, self.dGraph1)
        pw.setEdgeColourFunction(None)

        pw.setEdgeSizeFunction(setEdgeSize)
        fileName1 = directory + "edgeSizeTest"
        pw.writeToFile(fileName1, self.dGraph1)
        # Reset the edge size function (the original reset the edge colour
        # function a second time and left the size function installed)
        pw.setEdgeSizeFunction(None)

예제 #21

0

파일 보기

파일: PajekWriterTest.py 프로젝트: charanpald/APGL

    def testWriteToFile2(self):
        """
        Write self.dGraph1 to four Pajek files, installing one custom
        vertex/edge colour or size function on the writer at a time and
        resetting it to None afterwards.
        """
        pw = PajekWriter()
        directory = PathDefaults.getOutputDir() + "test/"

        colours = ["grey05", "grey10", "grey15", "grey20", "grey25"]

        def setVertexColour(vertexIndex, graph):
            return colours[vertexIndex]

        def setVertexSize(vertexIndex, graph):
            return vertexIndex

        def setEdgeColour(vertexIndex1, vertexIndex2, graph):
            return colours[vertexIndex1]

        def setEdgeSize(vertexIndex1, vertexIndex2, graph):
            return vertexIndex1+vertexIndex2

        pw.setVertexColourFunction(setVertexColour)
        fileName1 = directory + "vertexColourTest"
        pw.writeToFile(fileName1, self.dGraph1)
        pw.setVertexColourFunction(None)

        pw.setVertexSizeFunction(setVertexSize)
        fileName1 = directory + "vertexSizeTest"
        pw.writeToFile(fileName1, self.dGraph1)
        pw.setVertexSizeFunction(None)

        pw.setEdgeColourFunction(setEdgeColour)
        fileName1 = directory + "edgeColourTest"
        pw.writeToFile(fileName1, self.dGraph1)
        pw.setEdgeColourFunction(None)

        pw.setEdgeSizeFunction(setEdgeSize)
        fileName1 = directory + "edgeSizeTest"
        pw.writeToFile(fileName1, self.dGraph1)
        # Reset the edge size function (the original reset the edge colour
        # function a second time and left the size function installed)
        pw.setEdgeSizeFunction(None)

예제 #22

0

파일 보기

파일: FlixsterDataset.py 프로젝트: pierrebo/wallhack

    def __init__(self, maxIter=None, iterStartTimeStamp=None): 
        """
        Return a training and test set based on the time each rating was
        made. Output files are placed under "recommend/Flixster/".

        :param maxIter: stored as ``self.maxIter`` and logged when not None.
        :param iterStartTimeStamp: starting timestamp of the iterator; when
            None, the local-time timestamp of 1 January 2009 is used.
        """ 
        # 30 days expressed in seconds
        self.timeStep = timedelta(30).total_seconds()

        # iterStartTimeStamp is the starting time of the iterator
        if iterStartTimeStamp is not None:
            self.iterStartTimeStamp = iterStartTimeStamp
        else:
            self.iterStartTimeStamp = time.mktime(datetime(2009,1,1).timetuple())

        self.numItems = 1560144
        # It says 13668319 on the site but that seems to be wrong
        self.numRatings = 8196072
        self.numCustomers = 71567

        outputDir = PathDefaults.getOutputDir() + "recommend/Flixster/"

        # os.makedirs also creates a missing "recommend/" parent directory,
        # where os.mkdir would raise an OSError.
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        self.ratingFileName = outputDir + "data.npz"
        self.custDictFileName = outputDir + "custIdDict.pkl"
        self.itemDictFileName = outputDir + "itemIdDict.pkl"
        self.isTrainRatingsFileName = outputDir + "is_train.npz"

        self.maxIter = maxIter
        self.trainSplit = 4.0/5

        self.processRatings()
        self.splitDataset()
        self.loadProcessedData()

        if self.maxIter is not None:
            logging.debug("Maximum number of iterations: " + str(self.maxIter))

예제 #23

0

파일 보기

파일: ProcessResults.py 프로젝트: pierrebo/wallhack

            plt.xlabel("log(t)")
            plt.ylabel('Error')
            plt.legend(loc="lower left")
    plt.show()  


# Flags selecting which result sets are processed. showSVR gates the block
# below; showCART is not used in this excerpt (presumably used further down).
showCART = True  
showSVR = False 

from itertools import cycle
# Endless cycle over six line-style format strings
lines = ["k-","k--","k-.","k:","k-x", "k-+"]
linecycler = cycle(lines)


if showSVR: 
    # NOTE(review): outputDir is not passed to summary/plotAlphas below;
    # presumably they read it as a module-level variable - confirm.
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/regression/SVR/"
    
    sampleSizes = numpy.array([50, 100, 200])
    sampleMethods = ["CV"]
    # Scalings 0.6, 0.8, ..., 1.6 and fold counts 2, 4, ..., 12
    cvScalings = numpy.arange(0.6, 1.61, 0.2)
    foldsSet = numpy.arange(2, 13, 2)
    datasetNames = ModelSelectUtils.getRegressionDatasets()
    fileNameSuffix = 'Results'
    summary(datasetNames, sampleSizes, foldsSet, cvScalings, sampleMethods, fileNameSuffix)
    
    # Only the dataset at index 7 is plotted
    plotDatasetNames = [datasetNames[7]]
    plotAlphas(plotDatasetNames, sampleSizes, foldsSet, cvScalings, sampleMethods, fileNameSuffix)    
    
    # Second configuration with smaller sample sizes (its use is not visible in this excerpt)
    sampleSizes = numpy.array([25, 50, 100])
    sampleMethods = ["CV"]
    cvScalings = numpy.arange(0.6, 1.61, 0.2)

예제 #24

0

파일 보기

파일: ModelRealExp.py 프로젝트: pierrebo/wallhack

else: 
    numProcesses = multiprocessing.cpu_count()

# The second command line argument, when given, is the epidemic period index
if len(sys.argv) > 2:
    i = int(sys.argv[2])
else: 
    i = 0 

# Log to stdout and include the process id in every message
FORMAT = "%(levelname)s:root:%(process)d:%(message)s"
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format=FORMAT)
logging.debug("Number of processes: " + str(numProcesses))
logging.debug("Epidemic period index " + str(i))
numpy.set_printoptions(suppress=True, precision=4, linewidth=150)
# Raise an error on invalid floating point operations instead of only warning
numpy.seterr(invalid='raise')

# Simulation and ABC parameters for the period with index i
resultsDir = PathDefaults.getOutputDir() + "viroscopy/real/" 
startDate, endDate, recordStep, M, targetGraph, numInds = HIVModelUtils.realSimulationParams(ind=i)
N, matchAlpha, breakScale, numEpsilons, epsilon, minEpsilon, matchAlg, abcMaxRuns, batchSize, pertScale = HIVModelUtils.realABCParams(i)

logging.debug("Posterior sample size " + str(N))
logging.debug("Matching algorithm " + str(matchAlg))

logging.debug("="*10 + "Starting new simulation batch with index " + str(i) + "="*10) 
logging.debug("Total time of simulation is " + str(endDate-startDate))    

# Difference in removed-subgraph size between endDate and startDate, scaled by breakScale
breakSize = (targetGraph.subgraph(targetGraph.removedIndsAt(endDate)).size - targetGraph.subgraph(targetGraph.removedIndsAt(startDate)).size)  * breakScale
logging.debug("Largest acceptable graph is " + str(breakSize))

def createModel(t):
    """
    The parameter t is the particle index.

예제 #25

0

파일 보기

파일: LaplacianExp2.py 프로젝트: pierrebo/wallhack

# Experiment configuration. k is passed as numClusters to the graph iterator below.
k = 4
numGraphs = 100 
#numGraphs = 20
# Parameter values for the compared methods
nystromNs = [900]
randSVDVecs = [100, 900]
IASCL = [k, 300] # more than k is mostly useless (except l=graphSize): a priori, all the remaining directions are equivalent for the noise. So to catch changes implied by noise we have to keep all the directions.
numClusterVertices = 250
# One column per listed parameter value plus 3 further methods (those are not visible in this excerpt)
numMethods = len(nystromNs) + len(randSVDVecs) + len(IASCL) + 3
errors = numpy.zeros((numGraphs, numMethods)) 

numRepetitions = 20 
#numRepetitions = 1

# The computation below only runs when saveResults is True
saveResults = False
resultsDir = PathDefaults.getOutputDir() + "cluster/"
fileName = resultsDir + "ErrorBoundNystrom.npy"

if saveResults: 
    for r in range(numRepetitions): 
        i = 0 
        iterator = BoundGraphIterator(changeEdges=50, numGraphs=numGraphs, numClusterVertices=numClusterVertices, numClusters=k, p=0.1)
        
        for W in iterator: 
            print("i="+str(i))
            L = GraphUtils.shiftLaplacian(W)
          
            if i == 0: 
                initialL = L
                initialOmega, initialQ = numpy.linalg.eigh(L.todense())
                inds = numpy.flipud(numpy.argsort(initialOmega))

예제 #26

0

파일 보기

파일: ProcessResults.py 프로젝트: pierrebo/wallhack

from matplotlib import rc
rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
rc('text', usetex=True)
from apgl.util.PathDefaults import PathDefaults 



# Send debug level log output to stdout
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

#For now just print some results for a particular dataset 
#dataset = "MovieLensDataset"
dataset = "NetflixDataset"
#dataset = "FlixsterDataset"
#dataset = "SyntheticDataset1"
#dataset = "EpinionsDataset"
# Results are read from the directory named after the selected dataset
outputDir = PathDefaults.getOutputDir() + "recommend/" + dataset + "/"

# Line-style format strings, method names and update algorithm names
# (how they are combined is not visible in this excerpt)
plotStyles = ['k-', 'k--', 'k-.', 'r--', 'r-', 'g-', 'b-', 'b--', 'b-.', 'g--', 'g--', 'g-.', 'r-', 'r--', 'r-.']
methods = ["propack", "arpack", "rsvd", "rsvdUpdate2"]
updateAlgs = ["initial", "zero"]

#pq = [(10, 2), (50, 2), (10, 5)]
# NOTE(review): presumably (p, q) parameter pairs of the methods above - confirm
pq = [(10, 3), (50, 2), (50, 3)]
#fileNames = [outputDir + "ResultsSgdMf.npz"]
#labels = ["SgdMf"]
fileNames = []
labels = []

# NOTE(review): the name is probably a misspelling of "concise"; it is left
# unchanged because code outside this excerpt may refer to it
consise = True

for method in methods:

예제 #27

0

파일 보기

파일: InfectGrowthStatistics.py 프로젝트: malcolmreynolds/APGL

from apgl.viroscopy.HIVGraphReader import HIVGraphReader

"""
This script computes some basic statistics on the growing infection graph.
"""

# Send debug level log output to stdout and shorten numpy's printed output
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
numpy.set_printoptions(suppress=True, linewidth=100, precision=3)

# Read the HIV graph with undirected=False and indicators=False
undirected = False 
hivReader = HIVGraphReader()
graph = hivReader.readHIVGraph(undirected, indicators=False)
fInds = hivReader.getNonIndicatorFeatureIndices()


figureDir = PathDefaults.getOutputDir() + "viroscopy/figures/infect/"
resultsDir = PathDefaults.getOutputDir() + "viroscopy/"

#The set of edges indexed by zeros is the contact graph
#The ones indexed by 1 is the infection graph
edgeTypeIndex1 = 0
edgeTypeIndex2 = 1
sGraphContact = graph.getSparseGraph(edgeTypeIndex1)
sGraphInfect = graph.getSparseGraph(edgeTypeIndex2)

# The statistics below are computed on the infection graph
sGraph = sGraphInfect
#sGraph = sGraph.subgraph(range(0, 500))

graphStats = GraphStatistics()
statsArray = graphStats.scalarStatistics(sGraph, False)
# NOTE(review): slowStats is not used in this excerpt; presumably it enables
# slower statistics further down - confirm
slowStats = True

예제 #28

0

파일 보기

파일: DatasetStats.py 프로젝트: malcolmreynolds/APGL

# Send debug level log output to stdout
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# Dataset selection flags. Each enabled branch below defines getIterator, so
# when several flags are True the last enabled branch wins.
plotHIV = False 
plotCitation = False
plotBemol = True 

# NOTE(review): saveResults and findEigs are not used in this excerpt
saveResults = False 
findEigs = False

if plotHIV: 
    def getIterator(): 
        # Return the iterator produced by the HIV generator
        generator = HIVIterGenerator()
        return generator.getIterator()
        
    resultsDir = PathDefaults.getOutputDir() + "cluster/HIV/Stats/"
    
if plotCitation: 
    
    def getIterator(): 
        # Return the iterator produced by the citation generator (maxGraphSize=None)
        maxGraphSize = None 
        generator = CitationIterGenerator(maxGraphSize=maxGraphSize)
        return generator.getIterator()
    
    resultsDir = PathDefaults.getOutputDir() + "cluster/Citation/Stats/"
if plotBemol: 
    def getIterator(): 
        dataDir = PathDefaults.getDataDir() + "cluster/"
        
        nbUser = 10000 # set to 'None' to have all users
        nbPurchasesPerIt = 500 # set to 'None' to take all the purchases per date

예제 #29

0

파일 보기

파일: ExtractAuthors.py 프로젝트: malcolmreynolds/APGL

def saveAuthors():
    """
    Extract the coauthor edge list from the article metadata dump.

    Every line of the input file is parsed as a JSON object. For each entry
    of its "authors" list the name "forename surname" (stripped) is built;
    non-empty names receive an integer id in order of first appearance. For
    every pair of authors of the same article one line "id1, id2" is written
    to "edges.txt" in the output directory. Duplicate edges are kept.

    Progress is printed every 1000 input lines, which is also when the
    buffered edges are written out. The edges collected after the last such
    checkpoint are written once the input is exhausted.
    """
    path = "/local/dhanjalc/dataDump-28-11-12/"
    fileName = path + "articleMetadata500000"

    # Fall back to the default data directory when the local dump is absent.
    if not os.path.exists(fileName):
        path = PathDefaults.getDataDir() + "erasm/"

    fileName = path + "articleMetadata1000000"

    logging.debug("Loading article metadata from " + fileName)

    vertexIdDict = {}
    vertexIdSet = set([])
    # The next three containers are never filled; they are kept only because
    # their sizes are reported at the end of the function.
    vertexIdList = []
    edgeSet = set([])
    edgeArray = []

    emptyAuthors = 0

    edgeFileName = PathDefaults.getOutputDir() + "edges.txt"
    # Edge lines are collected in a list and written in batches; this avoids
    # repeated string concatenation on a potentially large buffer.
    lineBuffer = []

    # The input is opened first so that a missing dump fails before the
    # edge file is created or truncated.
    with open(fileName, 'r') as fileObj, open(edgeFileName, "w") as edgesFile:
        for lineInd, line in enumerate(fileObj):
            if lineInd % 1000 == 0:
                print("Line " + str(lineInd) + " Author " + str(len(vertexIdSet)) + " empty author strings " + str(emptyAuthors))
                edgesFile.writelines(lineBuffer)
                lineBuffer = []

            articleMetaData = json.loads(line)

            if "authors" not in articleMetaData:
                continue

            coauthorList = []
            for author in articleMetaData["authors"]:
                authorString = "".join([author["forename"], " ", author["surname"]]).strip()

                if len(authorString) == 0:
                    emptyAuthors += 1
                    continue

                if authorString not in vertexIdSet:
                    vertexIdDict[authorString] = len(vertexIdSet)
                    vertexIdSet.add(authorString)

                coauthorList.append(authorString)

            # Note that we will have duplicate edges
            for vId1, vId2 in itertools.combinations(coauthorList, 2):
                lineBuffer.append(str(vertexIdDict[vId1]) + ", " + str(vertexIdDict[vId2]) + "\n")

        # Write the edges gathered since the last checkpoint; without this
        # the tail of the edge list would be lost.
        edgesFile.writelines(lineBuffer)

    # sys.getsizeof is shallow: it reports the container size only.
    print(sys.getsizeof(vertexIdDict))
    print(sys.getsizeof(vertexIdSet))
    print(sys.getsizeof(vertexIdList))
    print(sys.getsizeof(edgeSet))
    print(sys.getsizeof(edgeArray))

    logging.debug("Saved edges as " + edgeFileName)

예제 #30

0

파일 보기

파일: HIVEpidemicStatisticsReal.py 프로젝트: malcolmreynolds/APGL

        featureInds = numpy.arange(featureInds.shape[0])[featureInds]        
        
        matcher = GraphMatch("PATH", alpha=0.5, featureInds=featureInds, useWeightM=False)
        graphMetrics = HIVGraphMetrics2(targetGraph, 1.0, matcher, float(endDate))        
        
        times, infectedIndices, removedIndices, graph = HIVModelUtils.simulate(thetaArray[i], startDate, endDate, recordStep, M, graphMetrics)
        times, vertexArray, removedGraphStats = HIVModelUtils.generateStatistics(graph, startDate, endDate, recordStep)
    
        stats = times, vertexArray, removedGraphStats, graphMetrics.dists, graphMetrics.graphDists, graphMetrics.labelDists
        
        
        Util.savePickle(stats, resultsFileName)

# For each end date, locate the stored theta array to use for the
# corresponding "theta<j>" results directory.
if saveResults:
    for j, endDate in enumerate(endDates): 
        resultsDir = PathDefaults.getOutputDir() + "viroscopy/real/theta" + str(j) + "/"
        outputDir = resultsDir + "stats/"
        
        logging.debug(resultsDir)
        # Use 5 more record steps than numRecordSteps over the extended period.
        newNumRecordSteps = numRecordSteps + 5         
        # Extend the end date by the j-th entry of HIVModelUtils.realTestPeriods.
        endDate += HIVModelUtils.realTestPeriods[j]
        recordStep = (endDate-startDate)/float(newNumRecordSteps)
        
        # Keep the largest index i < maxT whose theta array has exactly N rows.
        # NOTE(review): if no index qualifies, t is unbound on the first pass
        # (NameError) or silently reused from the previous j - confirm intent.
        for i in range(maxT): 
            thetaArray, distArray = loadThetaArray(N, resultsDir, i)
            if thetaArray.shape[0] == N: 
                t = i       
        
        # Reload the theta array at the selected index; the second returned
        # value is discarded.
        thetaArray = loadThetaArray(N, resultsDir, t)[0]
        logging.debug(thetaArray)

예제 #31

0

파일 보기

파일: PathDefaultsTest.py 프로젝트: awj223/Insight-Data-Engineering-Code-Challenge

 def testGetOutputDir(self):
     """Print the output directory reported by PathDefaults (no assertion is made)."""
     outputDir = PathDefaults.getOutputDir()
     print(outputDir)

예제 #32

0

파일 보기

파일: GenerateToyData.py 프로젝트: malcolmreynolds/APGL

# Send all log output (DEBUG and above) to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

# Size of the square toy matrix and the number of factors used to build it.
numExamples = 500
rank = 50 
 
# Left factor: eigenvectors of a random symmetric matrix A*A^T.
# NOTE(review): numpy.linalg.eig does not order its eigenvalues, so the first
# `rank` columns are not necessarily the leading eigenvectors - confirm intent.
A = numpy.random.rand(numExamples, numExamples)
A = A.dot(A.T)
s, U = numpy.linalg.eig(A)
U = U[:, 0:rank] 
# Make sure the result is non-negative by taking the absolute value of the eigenvectors
U = numpy.abs(U)

# Right factor: built the same way from an independent random matrix.
B = numpy.random.rand(numExamples, numExamples)
B = B.dot(B.T)
s, V = numpy.linalg.eig(B)
V = V[:, 0:rank]
V = numpy.abs(V)

# The eigenvalues computed above are discarded; random weights are used instead.
s = numpy.random.rand(rank)

# X = U * diag(s) * V^T: a numExamples x numExamples matrix built from `rank` factors.
X = (U*s).dot(V.T)

# Save the matrix in Matrix Market format
outputDir = PathDefaults.getOutputDir() + "erasm/"
fileName = outputDir + "Toy" 
scipy.io.mmwrite(fileName, X)

logging.debug("Saved to file " + fileName + ".mtx")

예제 #33

0

파일 보기

파일: MetabolomicsExpHelper.py 프로젝트: pierrebo/wallhack

    def __init__(self, dataDict, YCortisol, YTesto, YIgf1, ages, numProcesses=1, runCortisol=True, runTestosterone=True, runIGF1=True):
        """
        Create a new object for running the metabolomics experiments.

        The constructor only stores its arguments and builds the learners
        together with their model-selection parameter grids; nothing is
        trained here. All "run<Learner>" switches start as False.

        :param dataDict: stored unchanged as self.dataDict.
        :param YCortisol: label vector registered under "Cortisol" when runCortisol is set.
        :param YTesto: label vector registered under "Testosterone" when runTestosterone is set.
        :param YIgf1: label vector registered under "IGF1" when runIGF1 is set.
        :param ages: stored unchanged as self.ages.
        :param numProcesses: passed as numProcesses to every TreeRank, TreeRankForest, RankBoost and RankSVM learner.
        :param runCortisol: whether to include the cortisol labels in self.hormoneDict.
        :param runTestosterone: whether to include the testosterone labels in self.hormoneDict.
        :param runIGF1: whether to include the IGF1 labels in self.hormoneDict.
        """
        self.dataDict = dataDict

        # Switches selecting which learners are run; all disabled by default.
        self.runCartTreeRank = False
        self.runRbfSvmTreeRank = False
        self.runL1SvmTreeRank = False
        self.runCartTreeRankForest = False
        self.runRbfSvmTreeRankForest = False
        self.runL1SvmTreeRankForest = False
        self.runRankBoost = False
        self.runRankSVM = False

        self.runCortisol = runCortisol
        self.runTestosterone = runTestosterone
        self.runIGF1 = runIGF1

        self.YCortisol = YCortisol
        self.YTesto = YTesto
        self.YIgf1 = YIgf1
        self.ages = ages

        self.outerFolds = 3
        self.innerFolds = 5
        self.leafRankFolds = 3
        self.resultsDir = PathDefaults.getOutputDir() + "metabolomics/"
        self.numProcesses = numProcesses

        #General params
        # The builtin float is used as dtype: the numpy.float alias (which was
        # the same type) was removed in NumPy 1.24.
        Cs = 2.0**numpy.arange(-5, 7, 2, dtype=float)
        gammas = 2.0**numpy.arange(-5, 3, 2, dtype=float)
        depths = numpy.array([2, 4, 8])
        numTrees = 20
        sampleSize = 1.0
        maxDepth = 10
        featureSize = 0.5

        #CART TreeRank
        leafRankParamDict = {}
        leafRankParamDict["setMaxDepth"] = depths
        leafRankLearner = DecisionTree(leafRankParamDict, self.leafRankFolds)

        self.cartTreeRank = TreeRank(leafRankLearner, numProcesses=numProcesses)
        self.cartTreeRankParams = {}
        self.cartTreeRankParams["setMaxDepth"] = depths

        #RBF SVM TreeRank
        leafRankParamDict = {}
        leafRankParamDict["setC"] = Cs
        leafRankParamDict["setGamma"] = gammas
        leafRankLearner = SVMLeafRank(leafRankParamDict, self.leafRankFolds)
        leafRankLearner.setKernel("rbf")
        leafRankLearner.processes = 1

        self.rbfSvmTreeRank = TreeRank(leafRankLearner, numProcesses=numProcesses)
        self.rbfSvmTreeRankParams = {}
        self.rbfSvmTreeRankParams["setMaxDepth"] = depths

        #Linear L1 SVM TreeRank
        leafRankParamDict = {}
        leafRankParamDict["setC"] = Cs
        leafRankLearner = SVMLeafRank(leafRankParamDict, self.leafRankFolds)
        leafRankLearner.setKernel("linear")
        leafRankLearner.setPenalty("l1")
        leafRankLearner.processes = 1

        self.l1SvmTreeRank = TreeRank(leafRankLearner, numProcesses=numProcesses)
        self.l1SvmTreeRankParams = {}
        self.l1SvmTreeRankParams["setMaxDepth"] = depths

        #CART TreeRankForest
        leafRankParamDict = {}
        leafRankParamDict["setMaxDepth"] = depths
        leafRankLearner = DecisionTree(leafRankParamDict, self.leafRankFolds)
        leafRankLearner.processes = 1

        self.cartTreeRankForest = TreeRankForest(leafRankLearner, numProcesses=numProcesses)
        self.cartTreeRankForest.setNumTrees(numTrees)
        self.cartTreeRankForest.setSampleSize(sampleSize)
        self.cartTreeRankForest.setFeatureSize(featureSize)
        self.cartTreeRankForestParams = {}
        self.cartTreeRankForestParams["setMaxDepth"] = numpy.array([maxDepth])
        self.cartTreeRankForestParams["setSampleSize"] = numpy.array([0.5, 0.75, 1.0])
        self.cartTreeRankForestParams["setFeatureSize"] = numpy.array([0.5, 0.75, 1.0])

        #RBF SVM TreeRankForest
        leafRankParamDict = {}
        leafRankParamDict["setC"] = Cs
        leafRankParamDict["setGamma"] = gammas
        leafRankLearner = SVMLeafRank(leafRankParamDict, self.leafRankFolds)
        leafRankLearner.setKernel("rbf")
        leafRankLearner.processes = 1

        self.rbfSvmTreeRankForest = TreeRankForest(leafRankLearner, numProcesses=numProcesses)
        self.rbfSvmTreeRankForest.setNumTrees(numTrees)
        self.rbfSvmTreeRankForest.setSampleSize(sampleSize)
        self.rbfSvmTreeRankForest.setFeatureSize(featureSize)
        self.rbfSvmTreeRankForestParams = {}
        self.rbfSvmTreeRankForestParams["setMaxDepth"] = numpy.array([maxDepth])
        self.rbfSvmTreeRankForestParams["setSampleSize"] = numpy.array([0.5, 0.75, 1.0])
        self.rbfSvmTreeRankForestParams["setFeatureSize"] = numpy.array([0.5, 0.75, 1.0])

        #L1 SVM TreeRankForest
        leafRankParamDict = {}
        leafRankParamDict["setC"] = Cs
        leafRankLearner = SVMLeafRank(leafRankParamDict, self.leafRankFolds)
        leafRankLearner.setKernel("linear")
        leafRankLearner.setPenalty("l1")
        leafRankLearner.processes = 1

        self.l1SvmTreeRankForest = TreeRankForest(leafRankLearner, numProcesses=numProcesses)
        self.l1SvmTreeRankForest.setNumTrees(numTrees)
        self.l1SvmTreeRankForest.setSampleSize(sampleSize)
        self.l1SvmTreeRankForest.setFeatureSize(featureSize)
        self.l1SvmTreeRankForestParams = {}
        self.l1SvmTreeRankForestParams["setMaxDepth"] = numpy.array([maxDepth])
        self.l1SvmTreeRankForestParams["setSampleSize"] = numpy.array([0.5, 0.75, 1.0])
        self.l1SvmTreeRankForestParams["setFeatureSize"] = numpy.array([0.5, 0.75, 1.0])

        #RankBoost
        self.rankBoost = RankBoost(numProcesses=numProcesses)
        self.rankBoostParams = {}
        self.rankBoostParams["setIterations"] = numpy.array([10, 50, 100])
        self.rankBoostParams["setLearners"] = numpy.array([5, 10, 20])

        #RankSVM
        self.rankSVM = RankSVM(numProcesses=numProcesses)
        self.rankSVM.setKernel("rbf")
        self.rankSVMParams = {}
        self.rankSVMParams["setC"] = 2.0**numpy.arange(0, 3, dtype=float)
        self.rankSVMParams["setGamma"] = 2.0**numpy.arange(-3, 0, dtype=float)

        #Store the label vectors of the hormones selected for this run
        self.hormoneDict = {}
        if self.runCortisol:
            self.hormoneDict["Cortisol"] = YCortisol
        if self.runTestosterone:
            self.hormoneDict["Testosterone"] = YTesto
        if self.runIGF1:
            self.hormoneDict["IGF1"] = YIgf1

예제 #34

0

파일 보기

파일: PathDefaultsTest.py 프로젝트: spencer-ortega/DynamicGraph-AnomalyDetection

 def testGetOutputDir(self):
     """Print the output directory reported by PathDefaults (no assertion is made)."""
     outputDir = PathDefaults.getOutputDir()
     print(outputDir)

예제 #35

0

파일 보기

파일: ProcessResults.py 프로젝트: malcolmreynolds/APGL

"""
Plot the ROC curves for the metabolomics experiment. 
"""
import sys
import numpy 
import logging
import matplotlib.pyplot as plt
from apgl.util.Util import Util
from apgl.util.PathDefaults import PathDefaults
from apgl.util.Latex import Latex 

# Only warnings and above are reported, on stdout.
logging.basicConfig(stream=sys.stdout, level=logging.WARN)

# Results are read from the metabolomics output directory; figures go in a
# "figures/" subdirectory of it.
resultsDir = PathDefaults.getOutputDir() + "metabolomics/"
figureDir = resultsDir + "figures/"

# Three indexed labels per hormone, grouped by hormone.
labelNames = (
    ["Testosterone.val_0", "Testosterone.val_1", "Testosterone.val_2"]
    + ["Cortisol.val_0", "Cortisol.val_1", "Cortisol.val_2"]
    + ["IGF1.val_0", "IGF1.val_1", "IGF1.val_2"]
)

# The same hormones without the index suffix.
labelNames2 = ["Testosterone.val", "Cortisol.val", "IGF1.val"]

# Alternative configuration for the forest variant:
#   algorithmNames = ["TreeRankForest"]
#   leafRankNames = ["CARTF", "SVMF", "RBF-SVMF"]
algorithmNames = ["TreeRank"]
leafRankNames = ["CART", "SVM", "RBF-SVM", "LinearSVM-PCA"]

Ns = [10, 25, 50, 75, 100]

# Base data types followed by the two "Db" types.
dataTypes = ["raw_std", "log", "opls", "Db4", "Db8"]

예제 #36

0

파일 보기

파일: ToyDataExp.py 프로젝트: pierrebo/wallhack

def runToyExp(datasetNames, sampleSizes, foldsSet, cvScalings, sampleMethods, numProcesses, fileNameSuffix):
    """
    Run the toy model-penalisation experiment for each dataset and sampling method.

    For every (dataset, sampling method) pair whose result file is neither
    locked nor already present, an RBF SVM is selected on random subsamples of
    the training data using V-fold cross validation and V-fold penalisation,
    and the errors, chosen parameters and error grids are saved to
    "<outputDir><datasetName><samplingLabel><fileNameSuffix>.npz".

    The method axis of the result arrays is laid out as: index 0 for cross
    validation, index 1 for the penalisation constant (folds-1)*log(n)/2, and
    indices 2 onwards for the entries of cvScalings multiplied by (folds-1).

    The file lock is released even when the computation raises, so that a
    failed run does not leave the result permanently marked as in progress.

    :param datasetNames: sequence of entries whose item 0 is the dataset name and item 1 the number of realisations.
    :param sampleSizes: sequence of training subsample sizes.
    :param foldsSet: numpy array of fold counts.
    :param cvScalings: numpy array of penalisation scalings.
    :param sampleMethods: sequence of entries whose item 0 is a label used in the output file name and item 1 a callable taking (folds, numExamples).
    :param numProcesses: number of processes given to LibSVM.
    :param fileNameSuffix: string appended to each output file name.
    """
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    # A default SVM is only needed here to size the parameter grids.
    svm = LibSVM()
    numCs = svm.getCs().shape[0]
    numGammas = svm.getGammas().shape[0]
    numMethods = 1 + (1 + cvScalings.shape[0])
    numParams = 2

    runIdeal = True
    runCv = True
    runVfpen = True

    for datasetInfo in datasetNames:
        datasetName = datasetInfo[0]
        numRealisations = datasetInfo[1]
        logging.debug("Learning using dataset " + datasetName)

        for sampleInfo in sampleMethods:
            sampleMethod = sampleInfo[1]
            outfileName = outputDir + datasetName + sampleInfo[0] + fileNameSuffix

            fileLock = FileLock(outfileName + ".npz")
            if fileLock.isLocked() or fileLock.fileExists():
                logging.debug("Results already computed")
                continue

            fileLock.lock()
            try:
                errors = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods))
                params = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods, numParams))
                errorGrids = numpy.zeros(
                    (numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods, numCs, numGammas)
                )
                approxGrids = numpy.zeros(
                    (numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods, numCs, numGammas)
                )
                idealGrids = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numCs, numGammas))

                data = numpy.load(dataDir + datasetName + ".npz")
                gridPoints, trainX, trainY, pdfX, pdfY1X, pdfYminus1X = (
                    data["arr_0"],
                    data["arr_1"],
                    data["arr_2"],
                    data["arr_3"],
                    data["arr_4"],
                    data["arr_5"],
                )

                # We form a test set from the grid points: the first column
                # cycles through all grid points, the second is constant
                # within each run of numGridPoints rows.
                numGridPoints = gridPoints.shape[0]
                testX = numpy.zeros((numGridPoints ** 2, 2))
                for m in range(numGridPoints):
                    testX[m * numGridPoints : (m + 1) * numGridPoints, 0] = gridPoints
                    testX[m * numGridPoints : (m + 1) * numGridPoints, 1] = gridPoints[m]

                # The bootstrap flag depends only on the sampling method.
                bootstrap = sampleMethod == Sampling.bootstrap

                for j in range(numRealisations):
                    Util.printIteration(j, 1, numRealisations, "Realisation: ")

                    for k in range(len(sampleSizes)):
                        sampleSize = sampleSizes[k]
                        for m in range(foldsSet.shape[0]):
                            folds = foldsSet[m]
                            logging.debug("Using sample size " + str(sampleSize) + " and " + str(folds) + " folds")
                            # Draw a fresh random subsample of the training set
                            perm = numpy.random.permutation(trainX.shape[0])
                            trainInds = perm[0:sampleSize]
                            validX = trainX[trainInds, :]
                            validY = trainY[trainInds]

                            svm = LibSVM(processes=numProcesses)
                            # Find ideal penalties
                            if runIdeal:
                                logging.debug("Finding ideal grid of penalties")
                                idealGrids[j, k, m, :, :] = parallelPenaltyGridRbf(
                                    svm, validX, validY, testX, gridPoints, pdfX, pdfY1X, pdfYminus1X
                                )

                            # Cross validation
                            if runCv:
                                logging.debug("Running V-fold cross validation")
                                methodInd = 0
                                idx = sampleMethod(folds, validY.shape[0])

                                bestSVM, cvGrid = svm.parallelVfcvRbf(validX, validY, idx, True, bootstrap)
                                predY, decisionsY = bestSVM.predict(testX, True)
                                decisionGrid = numpy.reshape(
                                    decisionsY, (numGridPoints, numGridPoints), order="F"
                                )
                                errors[j, k, m, methodInd] = ModelSelectUtils.bayesError(
                                    gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X
                                )
                                params[j, k, m, methodInd, :] = numpy.array([bestSVM.getC(), bestSVM.getKernelParams()])
                                errorGrids[j, k, m, methodInd, :, :] = cvGrid

                            # v fold penalisation
                            if runVfpen:
                                logging.debug("Running penalisation")
                                # BIC penalisation
                                Cv = float((folds - 1) * numpy.log(validX.shape[0]) / 2)
                                tempCvScalings = cvScalings * (folds - 1)
                                tempCvScalings = numpy.insert(tempCvScalings, 0, Cv)

                                # Use cross validation
                                idx = sampleMethod(folds, validY.shape[0])
                                svmGridResults = svm.parallelVfPenRbf(validX, validY, idx, tempCvScalings)

                                for n in range(len(tempCvScalings)):
                                    bestSVM, trainErrors, approxGrid = svmGridResults[n]
                                    # Method 0 is reserved for cross validation
                                    methodInd = n + 1
                                    predY, decisionsY = bestSVM.predict(testX, True)
                                    decisionGrid = numpy.reshape(
                                        decisionsY, (numGridPoints, numGridPoints), order="F"
                                    )
                                    errors[j, k, m, methodInd] = ModelSelectUtils.bayesError(
                                        gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X
                                    )
                                    params[j, k, m, methodInd, :] = numpy.array(
                                        [bestSVM.getC(), bestSVM.getKernelParams()]
                                    )
                                    errorGrids[j, k, m, methodInd, :, :] = trainErrors + approxGrid
                                    approxGrids[j, k, m, methodInd, :, :] = approxGrid

                # Aggregate over realisations (axis 0)
                meanErrors = numpy.mean(errors, 0)
                print(meanErrors)

                meanParams = numpy.mean(params, 0)
                print(meanParams)

                meanErrorGrids = numpy.mean(errorGrids, 0)
                stdErrorGrids = numpy.std(errorGrids, 0)

                meanIdealGrids = numpy.mean(idealGrids, 0)
                stdIdealGrids = numpy.std(idealGrids, 0)

                meanApproxGrids = numpy.mean(approxGrids, 0)
                stdApproxGrids = numpy.std(approxGrids, 0)

                numpy.savez(
                    outfileName,
                    errors,
                    params,
                    meanErrorGrids,
                    stdErrorGrids,
                    meanIdealGrids,
                    stdIdealGrids,
                    meanApproxGrids,
                    stdApproxGrids,
                )
                logging.debug("Saved results as file " + outfileName + ".npz")
            finally:
                # Always release the lock, including when a step above fails
                fileLock.unlock()

    logging.debug("All done!")