def __init__(self, field):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "dblp/"
    self.xmlFileName = dataDir + "dblp.xml"
    self.xmlCleanFilename = dataDir + "dblpClean.xml"

    resultsDir = PathDefaults.getDataDir() + "reputation/" + field + "/"
    self.expertsFileName = resultsDir + "experts.txt"
    self.expertMatchesFilename = resultsDir + "experts_matches.csv"
    self.trainExpertMatchesFilename = resultsDir + "experts_train_matches.csv"
    self.testExpertMatchesFilename = resultsDir + "experts_test_matches.csv"
    self.coauthorsFilename = resultsDir + "coauthors.csv"
    self.publicationsFilename = resultsDir + "publications.csv"

    self.stepSize = 100000
    self.numLines = 33532888
    self.publicationTypes = set(["article", "inproceedings", "proceedings", "book", "incollection", "phdthesis", "mastersthesis", "www"])
    self.p = 0.5
    self.matchCutoff = 0.95

    self.cleanXML()
    self.matchExperts()
    logging.warning("Now you must disambiguate the matched experts if not already done")
def processSimpleDataset(name, numRealisations, split, ext=".csv", delimiter=",", usecols=None, skiprows=1, converters=None):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = dataDir + name + ext

    print("Loading data from file " + fileName)
    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "/"

    XY = numpy.loadtxt(fileName, delimiter=delimiter, skiprows=skiprows, usecols=usecols, converters=converters)
    X = XY[:, :-1]
    y = XY[:, -1]
    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)
    preprocessSave(X, y, outputDir, idx)
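# A hypothetical invocation of processSimpleDataset (the dataset name and split
# fraction below are illustrative, not taken from the source): load
# "winequality.csv" from the modelPenalisation/regression directory and create
# 10 shuffled realisations with a 2/3 training fraction.
#
# processSimpleDataset("winequality", numRealisations=10, split=2.0/3)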
def testGenerateRandomGraph(self):
    egoFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoData.csv"
    alterFileName = PathDefaults.getDataDir() + "infoDiffusion/AlterData.csv"
    numVertices = 1000
    infoProb = 0.1

    p = 0.1
    neighbours = 10
    generator = SmallWorldGenerator(p, neighbours)
    graph = SparseGraph(VertexList(numVertices, 0))
    graph = generator.generate(graph)

    self.svmEgoSimulator.generateRandomGraph(egoFileName, alterFileName, infoProb, graph)
def __init__(self):
    self.labelNames = ["Cortisol.val", "Testosterone.val", "IGF1.val"]
    self.dataDir = PathDefaults.getDataDir() + "metabolomic/"
    self.boundsDict = {}
    self.boundsDict["Cortisol"] = numpy.array([0, 89, 225, 573])
    self.boundsDict["Testosterone"] = numpy.array([0, 3, 9, 13])
    self.boundsDict["IGF1"] = numpy.array([0, 200, 441, 782])
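# A minimal sketch, assuming createIndicatorLabel one-hot encodes each label
# value into the three intervals [b0, b1), [b1, b2), [b2, b3] defined by the
# four bounds above. This is not the library's implementation, but it satisfies
# the row-sum assertions made in testCreateIndicatorLabels elsewhere in this
# section.
import numpy

def createIndicatorLabelSketch(y, bounds):
    indicators = numpy.zeros((y.shape[0], bounds.shape[0] - 1))
    for i in range(bounds.shape[0] - 1):
        indicators[:, i] = (y >= bounds[i]) * (y < bounds[i + 1])
    indicators[y == bounds[-1], -1] = 1  # close the final interval on the right
    return indicators

# createIndicatorLabelSketch(numpy.array([50.0, 100.0, 300.0]), numpy.array([0, 89, 225, 573]))
# -> rows [1,0,0], [0,1,0], [0,0,1], each summing to one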
def testComputeIdealPenalty(self):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    sampleSize = 100
    trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
    testX, testY = X[sampleSize:, :], y[sampleSize:]

    # We form a test set from the grid points
    fullX = numpy.zeros((gridPoints.shape[0]**2, 2))
    for m in range(gridPoints.shape[0]):
        fullX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
        fullX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

    C = 1.0
    gamma = 1.0
    args = (trainX, trainY, fullX, C, gamma, gridPoints, pdfX, pdfY1X, pdfYminus1X)
    penalty = computeIdealPenalty(args)

    # Now compute the penalty using the data
    args = (trainX, trainY, testX, testY, C, gamma)
    penalty2 = computeIdealPenalty2(args)

    self.assertAlmostEquals(penalty2, penalty, 2)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        # read options
        try:
            opts, args = getopt.getopt(argv[1:], "hd:n:D", ["help", "dir=", "nb_user=", "debug"])
        except getopt.error as msg:
            raise RGUsage(msg)
        # apply options
        dir = PathDefaults.getDataDir() + "cluster/"
        nb_user = None
        log_level = logging.INFO
        for o, a in opts:
            if o in ("-h", "--help"):
                print(__doc__)
                return 0
            elif o in ("-d", "--dir"):
                dir = a
            elif o in ("-n", "--nb_user"):
                nb_user = int(a)
            elif o in ("-D", "--debug"):
                log_level = logging.DEBUG
        logging.basicConfig(stream=sys.stdout, level=log_level, format='%(levelname)s (%(asctime)s):%(message)s')
        # process: generate data files
        BemolData.generate_data_file(dir, nb_user)
    except RGUsage as err:
        logging.error(err.msg)
        logging.error("for help use --help")
        return 2
def testToyData(self):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    pxSum = 0
    pY1XSum = 0
    pYminus1XSum = 0
    px2Sum = 0
    squareArea = (gridPoints[1] - gridPoints[0])**2

    # Integrate each density over the grid by averaging the four corners of
    # every square (2D trapezoidal rule)
    for i in range(gridPoints.shape[0]-1):
        for j in range(gridPoints.shape[0]-1):
            px = (pdfX[i, j] + pdfX[i+1, j] + pdfX[i, j+1] + pdfX[i+1, j+1])/4
            pxSum += px*squareArea

            pY1X = (pdfY1X[i, j] + pdfY1X[i+1, j] + pdfY1X[i, j+1] + pdfY1X[i+1, j+1])/4
            pY1XSum += pY1X*squareArea

            pYminus1X = (pdfYminus1X[i, j] + pdfYminus1X[i+1, j] + pdfYminus1X[i, j+1] + pdfYminus1X[i+1, j+1])/4
            pYminus1XSum += pYminus1X*squareArea

            px2Sum += px*pY1X*squareArea + px*pYminus1X*squareArea

    self.assertAlmostEquals(pxSum, 1)
    print(pY1XSum)
    print(pYminus1XSum)
    self.assertAlmostEquals(px2Sum, 1)
def testPredict2(self):
    # Test on the Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]

    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    maxDepths = range(3, 10)
    trainAucs = numpy.array([0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508])
    testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])
    i = 0

    # The results are approximately the same, but not exactly
    for maxDepth in maxDepths:
        treeRank = TreeRank(self.leafRanklearner)
        treeRank.setMaxDepth(maxDepth)
        treeRank.learnModel(X, y)

        trainScores = treeRank.predict(X)
        testScores = treeRank.predict(testX)

        self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 2)
        self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
        i += 1
def testEdgeFile(self):
    """
    Figure out the problem with the edge file
    """
    dataDir = PathDefaults.getDataDir() + "cluster/"
    edgesFilename = dataDir + "Cit-HepTh.txt"

    edges = {}
    file = open(edgesFilename, 'r')
    # Skip the four header lines
    file.readline()
    file.readline()
    file.readline()
    file.readline()

    vertices = {}

    for line in file:
        (vertex1, sep, vertex2) = line.partition("\t")
        vertex1 = vertex1.strip()
        vertex2 = vertex2.strip()
        edges[(vertex1, vertex2)] = 0
        vertices[vertex1] = 0
        vertices[vertex2] = 0

    # The paper says there are 352807 edges and 27770 vertices
    self.assertEquals(len(edges), 352807)
    self.assertEquals(len(vertices), 27770)
def testGraphFromMatFile(self):
    matFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoAlterTransmissions1000.mat"
    sGraph = EgoUtils.graphFromMatFile(matFileName)

    examplesList = ExamplesList.readFromMatFile(matFileName)
    numFeatures = examplesList.getDataFieldSize("X", 1)

    self.assertEquals(examplesList.getNumExamples(), sGraph.getNumEdges())
    self.assertEquals(examplesList.getNumExamples()*2, sGraph.getNumVertices())
    self.assertEquals(numFeatures/2+1, sGraph.getVertexList().getNumFeatures())

    # Every even vertex has information, odd does not
    for i in range(0, sGraph.getNumVertices()):
        vertex = sGraph.getVertex(i)
        if i % 2 == 0:
            self.assertEquals(vertex[sGraph.getVertexList().getNumFeatures()-1], 1)
        else:
            self.assertEquals(vertex[sGraph.getVertexList().getNumFeatures()-1], 0)

    # Test the first few vertices are the same
    for i in range(0, 10):
        vertex1 = sGraph.getVertex(i*2)[0:numFeatures/2]
        vertex2 = sGraph.getVertex(i*2+1)[0:numFeatures/2]
        vertexEx1 = examplesList.getSubDataField("X", numpy.array([i])).ravel()[0:numFeatures/2]
        vertexEx2 = examplesList.getSubDataField("X", numpy.array([i])).ravel()[numFeatures/2:numFeatures]
        self.assertTrue((vertex1 == vertexEx1).all())
        self.assertTrue((vertex2 == vertexEx2).all())
def __init__(self, maxIter=None, iterStartTimeStamp=None): outputDir = PathDefaults.getOutputDir() + "recommend/erasm/" if not os.path.exists(outputDir): os.mkdir(outputDir) #iterStartDate is the starting date of the iterator if iterStartTimeStamp != None: self.iterStartTimeStamp = iterStartTimeStamp else: self.iterStartTimeStamp = 1286229600 self.timeStep = timedelta(30).total_seconds() self.ratingFileName = outputDir + "data.npz" self.userDictFileName = outputDir + "userIdDict.pkl" self.groupDictFileName = outputDir + "groupIdDict.pkl" self.isTrainRatingsFileName = outputDir + "is_train.npz" self.dataDir = PathDefaults.getDataDir() + "erasm/" self.dataFileName = self.dataDir + "groupMembers-29-11-12" self.maxIter = maxIter self.trainSplit = 4.0/5 self.processRatings() self.splitDataset() self.loadProcessedData()
def processRatings(self):
    """
    Convert the dataset into a matrix and save the results for faster access.
    """
    if not os.path.exists(self.ratingFileName) or not os.path.exists(self.custDictFileName):
        dataDir = PathDefaults.getDataDir() + "netflix/training_set/"
        logging.debug("Processing ratings given in " + dataDir)

        custIdDict = {}
        custIdSet = set([])

        movieIds = array.array("I")
        custIds = array.array("I")
        ratings = array.array("B")
        dates = array.array("L")
        j = 0

        for i in range(self.startMovieID, self.endMovieID+1):
            Util.printIteration(i-1, 1, self.endMovieID-1)
            ratingsFile = open(dataDir + "mv_" + str(i).zfill(7) + ".txt")
            ratingsFile.readline()

            for line in ratingsFile:
                vals = line.split(",")

                custId = int(vals[0])

                if custId not in custIdSet:
                    custIdSet.add(custId)
                    custIdDict[custId] = j
                    custInd = j
                    j += 1
                else:
                    custInd = custIdDict[custId]

                rating = int(vals[1])
                t = datetime.strptime(vals[2].strip(), "%Y-%m-%d")

                movieIds.append(i-1)
                custIds.append(custInd)
                ratings.append(rating)
                dates.append(int(time.mktime(t.timetuple())))

        movieIds = numpy.array(movieIds, numpy.uint32)
        custIds = numpy.array(custIds, numpy.uint32)
        ratings = numpy.array(ratings, numpy.uint8)
        dates = numpy.array(dates, numpy.uint32)

        assert ratings.shape[0] == self.numRatings

        numpy.savez(self.ratingFileName, movieIds, custIds, ratings, dates)
        logging.debug("Saved ratings file as " + self.ratingFileName)

        pickle.dump(custIdDict, open(self.custDictFileName, 'wb'))
        logging.debug("Saved custIdDict as " + self.custDictFileName)
    else:
        logging.debug("Ratings file " + str(self.ratingFileName) + " already processed")
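# An assumed counterpart (not the project's loadProcessedData) showing how the
# arrays saved above would be read back. numpy.savez stores unnamed arrays as
# "arr_0", "arr_1", ... in the order they were passed, the same convention the
# toy-data tests in this section rely on.
import numpy, pickle

def loadRatingsSketch(ratingFileName, custDictFileName):
    data = numpy.load(ratingFileName)
    movieIds, custIds, ratings, dates = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"]
    custIdDict = pickle.load(open(custDictFileName, "rb"))
    return movieIds, custIds, ratings, dates, custIdDict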
def loadData():
    """
    Return the raw spectra and the MDS transformed data as well as the DataFrame
    for the MDS data.
    """
    utilsLib = importr('utils')

    dataDir = PathDefaults.getDataDir() + "metabolomic/"
    fileName = dataDir + "data.RMN.total.6.txt"
    df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
    maxNMRIndex = 951
    X = df.rx(robjects.IntVector(range(1, maxNMRIndex)))
    X = numpy.array(X).T

    # Load age and normalise (missing values are assigned the mean)
    ages = numpy.array(df.rx(robjects.StrVector(["Age"]))).ravel()
    meanAge = numpy.mean(ages[numpy.logical_not(numpy.isnan(ages))])
    ages[numpy.isnan(ages)] = meanAge
    ages = Standardiser().standardiseArray(ages)

    Xs = X.copy()
    standardiser = Standardiser()
    Xs = standardiser.standardiseArray(X)

    fileName = dataDir + "data.sportsmen.log.AP.1.txt"
    df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
    maxNMRIndex = 419
    X2 = df.rx(robjects.IntVector(range(1, maxNMRIndex)))
    X2 = numpy.array(X2).T

    # Load the OPLS corrected files
    fileName = dataDir + "IGF1.log.OSC.1.txt"
    df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
    minNMRIndex = 22
    maxNMRIndex = 441
    Xopls1 = df.rx(robjects.IntVector(range(minNMRIndex, maxNMRIndex)))
    Xopls1 = numpy.array(Xopls1).T

    fileName = dataDir + "cort.log.OSC.1.txt"
    df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
    minNMRIndex = 20
    maxNMRIndex = 439
    Xopls2 = df.rx(robjects.IntVector(range(minNMRIndex, maxNMRIndex)))
    Xopls2 = numpy.array(Xopls2).T

    fileName = dataDir + "testo.log.OSC.1.txt"
    df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
    minNMRIndex = 22
    maxNMRIndex = 441
    Xopls3 = df.rx(robjects.IntVector(range(minNMRIndex, maxNMRIndex)))
    Xopls3 = numpy.array(Xopls3).T

    # Load all the label data here
    labelNames = MetabolomicsUtils.getLabelNames()
    YList = MetabolomicsUtils.createLabelList(df, labelNames)

    return X, X2, Xs, (Xopls1, Xopls2, Xopls3), YList, ages, df
def getIterator():
    dataDir = PathDefaults.getDataDir() + "cluster/"
    nbUser = 10000           # set to 'None' to have all users
    nbPurchasesPerIt = 500   # set to 'None' to take all the purchases per date
    startingIteration = 300
    endingIteration = 600    # set to 'None' to have all iterations
    stepSize = 1
    return itertools.islice(BemolData.getGraphIterator(dataDir, nbUser, nbPurchasesPerIt), startingIteration, endingIteration, stepSize)
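# A minimal usage sketch, assuming each item yielded by the iterator is a graph
# exposing getNumVertices() and getNumEdges() as the graphs elsewhere in this
# section do:
#
# for graph in getIterator():
#     print(graph.getNumVertices(), graph.getNumEdges())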
def profileClusterFromIterator(self):
    iterator = IncreasingSubgraphListIterator(self.graph, self.subgraphIndicesList)
    dataDir = PathDefaults.getDataDir() + "cluster/"
    #iterator = getBemolGraphIterator(dataDir)

    def run():
        clusterList, timeList, boundList = self.clusterer.clusterFromIterator(iterator, verbose=True)
        print(timeList.cumsum(0))

    ProfileUtils.profile('run()', globals(), locals())
def testReadGraph(self):
    fileName = PathDefaults.getDataDir() + "test/simpleGraph.txt"
    graphReader = SimpleGraphReader()
    graph = graphReader.readFromFile(fileName)
    logging.debug((graph.getAllEdges()))

    self.assertEquals(graph.isUndirected(), True)
    self.assertEquals(graph.getNumVertices(), 5)
    self.assertEquals(graph.getNumEdges(), 4)

    self.assertEquals(graph.getEdge(0, 1), 1)
    self.assertEquals(graph.getEdge(2, 4), 1)
    self.assertEquals(graph.getEdge(2, 2), 1)
    self.assertEquals(graph.getEdge(4, 0), 1)

    # Now test reading a file with the same graph but vertices indexed differently
    fileName = PathDefaults.getDataDir() + "test/simpleGraph2.txt"
    graph = graphReader.readFromFile(fileName)

    self.assertEquals(graph.isUndirected(), True)
    self.assertEquals(graph.getNumVertices(), 5)
    self.assertEquals(graph.getNumEdges(), 4)

    self.assertEquals(graph.getEdge(0, 1), 1.1)
    self.assertEquals(graph.getEdge(2, 4), 1)
    self.assertEquals(graph.getEdge(2, 2), 1.6)
    self.assertEquals(graph.getEdge(4, 0), 1)

    # Now test a file with directed edges
    fileName = PathDefaults.getDataDir() + "test/simpleGraph3.txt"
    graph = graphReader.readFromFile(fileName)

    self.assertEquals(graph.isUndirected(), False)
    self.assertEquals(graph.getNumVertices(), 5)
    self.assertEquals(graph.getNumEdges(), 4)

    self.assertEquals(graph.getEdge(0, 1), 1)
    self.assertEquals(graph.getEdge(2, 4), 1)
    self.assertEquals(graph.getEdge(2, 2), 1)
    self.assertEquals(graph.getEdge(4, 0), 1)
def __init__(self):
    dataDir = PathDefaults.getDataDir() + "cluster/"
    nbUser = 2000           # set to 'None' to have all users
    nbPurchasesPerIt = 50   # set to 'None' to take all the purchases per date
    startingIteration = 20
    endingIteration = None  # set to 'None' to have all iterations
    stepSize = 10
    iterator = itertools.islice(BemolData.getGraphIterator(dataDir, nbUser, nbPurchasesPerIt), startingIteration, endingIteration, stepSize)
    self.iterator = iterator
def testGetTrainIteratorFunc(self):
    dataFilename = PathDefaults.getDataDir() + "reference/author_document_count"
    dataset = Static2IdValDataset(dataFilename)

    trainIterator = dataset.getTrainIteratorFunc()()
    testIterator = dataset.getTestIteratorFunc()()

    for trainX in trainIterator:
        testX = testIterator.next()
        print(trainX.shape, trainX.nnz, testX.nnz)
        self.assertEquals(trainX.shape, testX.shape)
def testCreateIndicatorLabels(self):
    metaUtils = MetabolomicsUtils()
    X, XStd, X2, (XoplsCortisol, XoplsTesto, XoplsIgf1), YCortisol, YTesto, YIgf1, ages = metaUtils.loadData()

    YCortisol = YCortisol[numpy.logical_not(numpy.isnan(YCortisol))]
    YCortisolIndicators = metaUtils.createIndicatorLabel(YCortisol, metaUtils.boundsDict["Cortisol"])

    YTesto = YTesto[numpy.logical_not(numpy.isnan(YTesto))]
    YTestoIndicators = metaUtils.createIndicatorLabel(YTesto, metaUtils.boundsDict["Testosterone"])

    YIgf1 = YIgf1[numpy.logical_not(numpy.isnan(YIgf1))]
    YIgf1Indicators = metaUtils.createIndicatorLabel(YIgf1, metaUtils.boundsDict["IGF1"])

    s = numpy.sum(YCortisolIndicators, 1)
    nptst.assert_array_equal(s, numpy.ones(s.shape[0]))

    s = numpy.sum(YTestoIndicators, 1)
    nptst.assert_array_equal(s, numpy.ones(s.shape[0]))

    s = numpy.sum(YIgf1Indicators, 1)
    nptst.assert_array_equal(s, numpy.ones(s.shape[0]))

    # Now compare to those labels in the file
    X, X2, (XoplsCortisol, XoplsTesto, XoplsIgf1), YCortisol, YTesto, YIgf1, ages = metaUtils.loadData()

    dataDir = PathDefaults.getDataDir() + "metabolomic/"
    fileName = dataDir + "data.RMN.total.6.txt"
    data = pandas.read_csv(fileName, delimiter=",")

    YCortisolIndicators = metaUtils.createIndicatorLabel(YCortisol, metaUtils.boundsDict["Cortisol"])
    YCortisolIndicators2 = numpy.array(data[["Ind.Cortisol.1", "Ind.Cortisol.2", "Ind.Cortisol.3"]])

    for i in range(YCortisolIndicators.shape[0]):
        if not numpy.isnan(YCortisol[i]) and not numpy.isnan(YCortisolIndicators2[i, :]).any():
            #nptst.assert_almost_equal(YCortisolIndicators2[i, :], YCortisolIndicators[i, :])
            pass

    YTestoIndicators = metaUtils.createIndicatorLabel(YTesto, metaUtils.boundsDict["Testosterone"])
    YTestoIndicators2 = numpy.array(data[["Ind.Testo.1", "Ind.Testo.2", "Ind.Testo.3"]])

    for i in range(YTestoIndicators.shape[0]):
        if not numpy.isnan(YTesto[i]) and not numpy.isnan(YTestoIndicators2[i, :]).any():
            #print(i, YTesto[i])
            nptst.assert_almost_equal(YTestoIndicators2[i, :], YTestoIndicators[i, :])

    YIgf1Indicators = metaUtils.createIndicatorLabel(YIgf1, metaUtils.boundsDict["IGF1"])
    YIgf1Indicators2 = numpy.array(data[["Ind.IGF1.1", "Ind.IGF1.2", "Ind.IGF1.3"]])

    for i in range(YIgf1Indicators.shape[0]):
        if not numpy.isnan(YIgf1[i]) and not numpy.isnan(YIgf1Indicators2[i, :]).any():
            #print(i, YIgf1[i])
            #nptst.assert_almost_equal(YIgf1Indicators2[i, :], YIgf1Indicators[i, :])
            pass
def processParkinsonsDataset(name, numRealisations):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = dataDir + name + ".data"

    XY = numpy.loadtxt(fileName, delimiter=",", skiprows=1)
    inds = list(set(range(XY.shape[1])) - set([5, 6]))
    X = XY[:, inds]

    y1 = XY[:, 5]
    y2 = XY[:, 6]
    # We don't keep whole collections of patients
    split = 0.5

    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)

    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-motor/"
    preprocessSave(X, y1, outputDir, idx)

    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-total/"
    preprocessSave(X, y2, outputDir, idx)
def testBayesError(self):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    sampleSize = 100
    trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
    testX, testY = X[sampleSize:, :], y[sampleSize:]

    # We form a test set from the grid points
    gridX = numpy.zeros((gridPoints.shape[0]**2, 2))
    for m in range(gridPoints.shape[0]):
        gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
        gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

    Cs = 2**numpy.arange(-5, 5, dtype=numpy.float)
    gammas = 2**numpy.arange(-5, 5, dtype=numpy.float)

    bestError = 1

    for C in Cs:
        for gamma in gammas:
            svm = LibSVM(kernel="gaussian", C=C, kernelParam=gamma)
            svm.learnModel(trainX, trainY)
            predY, decisionsY = svm.predict(gridX, True)
            decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
            error = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)

            predY, decisionsY = svm.predict(testX, True)
            error2 = Evaluator.binaryError(testY, predY)
            print(error, error2)

            if error < bestError:
                bestError = error
                bestC = C
                bestGamma = gamma

    # Retrain with the best parameters and plot the corresponding decision grid
    svm = LibSVM(kernel="gaussian", C=bestC, kernelParam=bestGamma)
    svm.learnModel(trainX, trainY)
    predY, decisionsY = svm.predict(gridX, True)
    decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")

    plt.figure(0)
    plt.contourf(gridPoints, gridPoints, decisionGrid, 100)
    plt.colorbar()

    plt.figure(1)
    plt.scatter(X[y==1, 0], X[y==1, 1], c='r', label="+1")
    plt.scatter(X[y==-1, 0], X[y==-1, 1], c='b', label="-1")
    plt.legend()
    plt.show()
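# A sketch of the quantity ModelSelectUtils.bayesError presumably approximates
# (an assumed reconstruction, not the library code): the expected error of the
# decision grid under the toy densities, integrated with the same four-corner
# averaging used in testToyData above. Predicting +1 at x errs with probability
# p(y=-1|x), and predicting -1 errs with probability p(y=1|x).
def bayesErrorSketch(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X):
    squareArea = (gridPoints[1] - gridPoints[0])**2
    error = 0
    for i in range(gridPoints.shape[0] - 1):
        for j in range(gridPoints.shape[0] - 1):
            px = (pdfX[i, j] + pdfX[i+1, j] + pdfX[i, j+1] + pdfX[i+1, j+1])/4
            pY1X = (pdfY1X[i, j] + pdfY1X[i+1, j] + pdfY1X[i, j+1] + pdfY1X[i+1, j+1])/4
            pYminus1X = (pdfYminus1X[i, j] + pdfYminus1X[i+1, j] + pdfYminus1X[i, j+1] + pdfYminus1X[i+1, j+1])/4
            if decisionGrid[i, j] > 0:
                error += pYminus1X * px * squareArea
            else:
                error += pY1X * px * squareArea
    return error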
def testReadFromCsvFile(self):
    dir = PathDefaults.getDataDir() + "test/"
    fileName = dir + "examplesList1.csv"
    examplesList = ExamplesList.readFromCsvFile(fileName)

    X = examplesList.getDataField(examplesList.getDefaultExamplesName())
    y = examplesList.getDataField(examplesList.getLabelsName())

    X2 = numpy.array([[10, 2], [4, -6], [24, 6]])
    y2 = numpy.array([[-1], [1], [-1]])

    self.assertTrue((X == X2).all())
    self.assertTrue((y == y2).all())
def cluster():
    k1 = 20   # number of clusters to learn
    k2 = 40   # number of eigenvectors kept

    dir = PathDefaults.getDataDir() + "cluster/"
    graphIterator = getBemolGraphIterator(dir)

    #===========================================
    # cluster
    print("compute clusters")
    clusterer = IterativeSpectralClustering(k1, k2)
    clustersList = clusterer.clusterFromIterator(graphIterator, True)

    for i in range(len(clustersList)):
        clusters = clustersList[i]
        print(clusters)
def testRunSimulation(self):
    egoFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoData.csv"
    alterFileName = PathDefaults.getDataDir() + "infoDiffusion/AlterData.csv"
    numVertices = 1000
    infoProb = 0.1

    p = 0.1
    neighbours = 10
    generator = SmallWorldGenerator(p, neighbours)
    graph = SparseGraph(VertexList(numVertices, 0))
    graph = generator.generate(graph)

    CVal = 1.0
    kernel = "linear"
    kernelParamVal = 0.0
    errorCost = 0.5
    folds = 6
    sampleSize = 1000
    maxIterations = 5

    self.svmEgoSimulator.trainClassifier(CVal, kernel, kernelParamVal, errorCost, sampleSize)
    self.svmEgoSimulator.generateRandomGraph(egoFileName, alterFileName, infoProb, graph)
    self.svmEgoSimulator.runSimulation(maxIterations)
def readHIVGraph(self, undirected=True, indicators=True):
    """
    We use pacdate5389.csv, which contains the dates of infection. The
    undirected parameter instructs whether to create an undirected graph. If
    indicators is true then categorical variables are turned into collections
    of indicator ones.
    """
    converters = {1: CsvConverters.dateConv, 3: CsvConverters.dateConv, 5: CsvConverters.detectionConv, 6: CsvConverters.provConv, 8: CsvConverters.dateConv}
    converters[9] = CsvConverters.genderConv
    converters[10] = CsvConverters.orientConv
    converters[11] = CsvConverters.numContactsConv
    converters[12] = CsvConverters.numContactsConv
    converters[13] = CsvConverters.numContactsConv

    def nanProcessor(X):
        # Impute missing values in each column with the column mean
        means = numpy.zeros(X.shape[1])
        for i in range(X.shape[1]):
            if numpy.sum(numpy.isnan(X[:, i])) > 0:
                logging.info("No. missing values in " + str(i) + "th column: " + str(numpy.sum(numpy.isnan(X[:, i]))))
                means[i] = numpy.mean(X[:, i][numpy.isnan(X[:, i]) == False])
                X[numpy.isnan(X[:, i]), i] = means[i]
        return X

    idIndex = 0
    featureIndices = converters.keys()
    multiGraphCsvReader = MultiGraphCsvReader(idIndex, featureIndices, converters, nanProcessor)

    dataDir = PathDefaults.getDataDir()
    vertexFileName = dataDir + "HIV/alldata.csv"
    edgeFileNames = [dataDir + "HIV/grafdet2.csv", dataDir + "HIV/infect2.csv"]

    sparseMultiGraph = multiGraphCsvReader.readGraph(vertexFileName, edgeFileNames, undirected, delimiter="\t")

    # For learning purposes we convert categorical variables into a set of
    # indicator features
    if indicators:
        logging.info("Converting categorical features")
        vList = sparseMultiGraph.getVertexList()
        V = vList.getVertices(list(range(vList.getNumVertices())))
        catInds = [2, 3]
        generator = FeatureGenerator()
        V = generator.categoricalToIndicator(V, catInds)
        vList.replaceVertices(V)

    logging.info("Created " + str(sparseMultiGraph.getNumVertices()) + " examples with " + str(sparseMultiGraph.getVertexList().getNumFeatures()) + " features")

    return sparseMultiGraph
def __init__(self, maxIter=None, iterStartTimeStamp=None):
    """
    Return a training and test set for netflix based on the time each rating
    was made. There are 62 iterations.
    """
    self.timeStep = timedelta(30).total_seconds()

    # startDate is used to convert dates into ints
    #self.startDate = datetime(1998,1,1)
    #self.endDate = datetime(2005,12,31)

    # iterStartTimeStamp is the starting date of the iterator
    if iterStartTimeStamp != None:
        self.iterStartTimeStamp = iterStartTimeStamp
    else:
        self.iterStartTimeStamp = time.mktime(datetime(2001,1,1).timetuple())

    self.startMovieID = 1
    self.endMovieID = 17770

    self.numMovies = 17770
    self.numRatings = 100480507
    self.numProbeMovies = 16938
    self.numProbeRatings = 1408395
    self.numCustomers = 480189

    outputDir = PathDefaults.getOutputDir() + "recommend/netflix/"

    if not os.path.exists(outputDir):
        os.mkdir(outputDir)

    self.ratingFileName = outputDir + "data.npz"
    self.custDictFileName = outputDir + "custIdDict.pkl"
    self.probeFileName = PathDefaults.getDataDir() + "netflix/probe.txt"
    self.testRatingsFileName = outputDir + "test_data.npz"
    self.isTrainRatingsFileName = outputDir + "is_train.npz"

    self.maxIter = maxIter
    self.trainSplit = 4.0/5

    self.processRatings()
    #self.processProbe()
    self.splitDataset()
    self.loadProcessedData()

    if self.maxIter != None:
        logging.debug("Maximum number of iterations: " + str(self.maxIter))
def testPredict2(self):
    # Test on the Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]
    y = y*2 - 1

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]
    testY = testY*2 - 1

    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    numTrees = 5
    minSplit = 50
    maxDepths = range(3, 10)

    trainAucs = numpy.array([0.7252582, 0.7323278, 0.7350289, 0.7372529, 0.7399985, 0.7382176, 0.7395104, 0.7386347])
    testAucs = numpy.array([0.6806122, 0.6851614, 0.6886183, 0.6904147, 0.6897266, 0.6874600, 0.6875980, 0.6878801])

    i = 0

    # The results are approximately the same, but not exactly
    for maxDepth in maxDepths:
        treeRankForest = TreeRankForest(self.leafRanklearner)
        treeRankForest.setMaxDepth(maxDepth)
        treeRankForest.setMinSplit(minSplit)
        treeRankForest.setNumTrees(numTrees)
        treeRankForest.learnModel(X, y)

        trainScores = treeRankForest.predict(X)
        testScores = treeRankForest.predict(testX)
        print(Evaluator.auc(trainScores, y), Evaluator.auc(testScores, testY))

        self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 1)
        self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
        i += 1
def testPredict2(self):
    # We tune the parameters to maximise AUC on the IGF1_0-Haar data
    dataDir = PathDefaults.getDataDir()
    fileName = dataDir + "IGF1_0-Haar.npy"

    XY = numpy.load(fileName)
    X = XY[:, 0:XY.shape[1]-1]
    y = XY[:, XY.shape[1]-1].ravel()

    # The weight is the fraction of class-0 examples in y
    weight = numpy.bincount(numpy.array(y, numpy.int))[0]/float(y.shape[0])
    #weight = 0.5
    #weight = 0.9

    folds = 3

    randomForest = RandomForest()
    randomForest.setWeight(weight)
    randomForest.setMaxDepth(50)
    #randomForest.setMinSplit(100)

    mean, var = randomForest.evaluateCv(X, y, folds, Evaluator.auc)
    logging.debug("AUC = " + str(mean))
    logging.debug("Var = " + str(var))
def computeLearningRates(datasetNames, numProcesses, fileNameSuffix, learnerName, sampleSizes, foldsSet):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    learner, loadMethod, dataDir, outputDir, paramDict = getSetup(learnerName, dataDir, outputDir, numProcesses)

    for i in range(len(datasetNames)):
        logging.debug("Learning using dataset " + datasetNames[i][0])
        outfileName = outputDir + datasetNames[i][0] + fileNameSuffix

        fileLock = FileLock(outfileName + ".npz")
        if not fileLock.isLocked() and not fileLock.fileExists():
            fileLock.lock()

            numRealisations = datasetNames[i][1]
            gridShape = [numRealisations, sampleSizes.shape[0]]
            gridShape.extend(list(learner.gridShape(paramDict)))
            gridShape = tuple(gridShape)

            betaGrids = numpy.zeros(gridShape)

            for k in range(sampleSizes.shape[0]):
                sampleSize = sampleSizes[k]
                logging.debug("Using sample size " + str(sampleSize))

                for j in range(numRealisations):
                    Util.printIteration(j, 1, numRealisations, "Realisation: ")
                    trainX, trainY, testX, testY = loadMethod(dataDir, datasetNames[i][0], j)

                    numpy.random.seed(21)
                    trainInds = numpy.random.permutation(trainX.shape[0])[0:sampleSize]
                    validX = trainX[trainInds, :]
                    validY = trainY[trainInds]

                    betaGrids[j, k, :] = learner.learningRate(validX, validY, foldsSet, paramDict)

            numpy.savez(outfileName, betaGrids)
            logging.debug("Saved results as file " + outfileName + ".npz")
            fileLock.unlock()
def testReadFile(self):
    eCsvReader = EgoCsvReader()
    #logging.debug(os.getcwd())
    dir = PathDefaults.getDataDir()
    fileName = dir + "test/TestData.csv"
    questionIds = [("Q14", 0), ("Q12", 1), ("Q2", 0)]
    missing = 1

    (X, titles) = eCsvReader.readFile(fileName, questionIds, missing)

    X2 = numpy.zeros((10, 3))
    X2[0, :] = [0.621903386, 0.608560354, 0.33290608]
    X2[1, :] = [0.318548924, 0.402390713, 0.129956291]
    X2[2, :] = [0.956658404, 0.344317772, 0.680386616]
    X2[3, :] = [0.267607668, 0.119647983, 0.116893619]
    X2[4, :] = [0.686589498, 0.402390713, 0.426789174]
    X2[5, :] = [0.373575769, 0.025846789, 0.797125005]
    X2[6, :] = [0.493793948, 0.402390713, 0.990507109]
    X2[7, :] = [0.524534585, 0.525169385, 0.772917183]
    X2[8, :] = [0.339055395, 0.402390713, 0.684788001]
    X2[9, :] = [0.997774183, 0.790801992, 0.643252009]

    self.assertAlmostEquals(numpy.linalg.norm(X - X2), 0, places=6)
def testReadGraph(self):
    dir = PathDefaults.getDataDir()
    vertexFileName = dir + "test/deggraf10.csv"
    edgeFileNames = [dir + "test/testEdges1.csv", dir + "test/testEdges2.csv"]

    def genderConv(x):
        genderDict = {'"M"': 0, '"F"': 1}
        return genderDict[x]

    def orientConv(x):
        orientDict = {'"HT"': 0, '"HB"': 1}
        return orientDict[x]

    def fteConv(x):
        fteDict = {'"INTER"': 0, '"CONTA"': 1}
        return fteDict[x]

    def provConv(x):
        provDict = {'"CH"': 0, '"SC"': 1, '"SS"': 2, '"LH"': 3, '"GM"': 4}
        return provDict[x]

    converters = {3: genderConv, 4: orientConv, 5: fteConv, 6: provConv}
    idIndex = 0
    featureIndices = list(range(1, 11))

    multiGraphCsvReader = MultiGraphCsvReader(idIndex, featureIndices, converters)
    sparseMultiGraph = multiGraphCsvReader.readGraph(vertexFileName, edgeFileNames)

    vertexValues = numpy.zeros((10, 10))
    vertexValues[0, :] = numpy.array([1986, 32, 0, 0, 0, 0, 0, 3, 3, 1])
    vertexValues[1, :] = numpy.array([1986, 27, 0, 0, 0, 1, 0, 4, 4, 1])
    vertexValues[2, :] = numpy.array([1986, 20, 0, 0, 0, 1, 0, 1, 1, 0])
    vertexValues[3, :] = numpy.array([1986, 20, 0, 0, 0, 1, 0, 2, 2, 0])
    vertexValues[4, :] = numpy.array([1986, 20, 0, 0, 0, 2, 0, 5, 5, 0])
    vertexValues[5, :] = numpy.array([1986, 28, 0, 0, 0, 3, 0, 1, 1, 1])
    vertexValues[6, :] = numpy.array([1986, 26, 1, 0, 1, 3, 6, 1, 1, 1])
    vertexValues[7, :] = numpy.array([1986, 35, 0, 0, 0, 2, 0, 0, 0, 0])
    vertexValues[8, :] = numpy.array([1986, 37, 0, 1, 0, 3, 0, 5, 3, 0])
    vertexValues[9, :] = numpy.array([1986, 40, 0, 1, 0, 4, 0, 3, 3, 0])

    # Check if the values of the vertices are correct
    for i in range(sparseMultiGraph.getNumVertices()):
        self.assertTrue((sparseMultiGraph.getVertex(i) == vertexValues[i]).all())

    # Now check edges
    edges = numpy.zeros((10, 3))
    edges[0, :] = numpy.array([4, 0, 0])
    edges[1, :] = numpy.array([4, 1, 0])
    edges[2, :] = numpy.array([5, 3, 0])
    edges[3, :] = numpy.array([7, 1, 0])
    edges[4, :] = numpy.array([8, 0, 0])
    edges[5, :] = numpy.array([4, 1, 1])
    edges[6, :] = numpy.array([8, 1, 1])
    edges[7, :] = numpy.array([8, 2, 1])
    edges[8, :] = numpy.array([8, 4, 1])
    edges[9, :] = numpy.array([9, 0, 1])

    self.assertTrue((sparseMultiGraph.getAllEdges() == edges).all())

    # Now test directed graphs
    sparseMultiGraph = multiGraphCsvReader.readGraph(vertexFileName, edgeFileNames, False)

    for i in range(sparseMultiGraph.getNumVertices()):
        self.assertTrue((sparseMultiGraph.getVertex(i) == vertexValues[i]).all())

    edges = numpy.zeros((10, 3))
    edges[0, :] = numpy.array([0, 4, 0])
    edges[1, :] = numpy.array([1, 7, 0])
    edges[2, :] = numpy.array([3, 5, 0])
    edges[3, :] = numpy.array([4, 1, 0])
    edges[4, :] = numpy.array([8, 0, 0])
    edges[5, :] = numpy.array([0, 9, 1])
    edges[6, :] = numpy.array([1, 8, 1])
    edges[7, :] = numpy.array([2, 8, 1])
    edges[8, :] = numpy.array([4, 1, 1])
    edges[9, :] = numpy.array([8, 4, 1])

    self.assertTrue((sparseMultiGraph.getAllEdges() == edges).all())
def testMDLGraphsReader(self):
    reader = MDLGraphsReader()
    dir = PathDefaults.getDataDir()
    fileName = dir + "test/testGraphs.mdl"

    graphs = reader.readFromFile(fileName)
    self.assertEquals(len(graphs), 2)

    # Check the first graph
    self.assertEquals(graphs[0].getNumVertices(), 26)
    self.assertEquals(graphs[0].getNumEdges(), 28)

    # MDL files index atoms from 1, so shift to the graphs' 0-based vertices
    def getEdge(graph, i, j):
        return graph.getEdge(i - 1, j - 1)

    self.assertEquals(getEdge(graphs[0], 1, 6), 1)
    self.assertEquals(getEdge(graphs[0], 1, 2), 1)
    self.assertEquals(getEdge(graphs[0], 1, 18), 1)
    self.assertEquals(getEdge(graphs[0], 2, 3), 1)
    self.assertEquals(getEdge(graphs[0], 2, 19), 1)
    self.assertEquals(getEdge(graphs[0], 3, 4), 1)
    self.assertEquals(getEdge(graphs[0], 3, 20), 1)
    self.assertEquals(getEdge(graphs[0], 4, 10), 1)
    self.assertEquals(getEdge(graphs[0], 4, 5), 1)
    self.assertEquals(getEdge(graphs[0], 5, 6), 1)
    self.assertEquals(getEdge(graphs[0], 5, 7), 1)
    self.assertEquals(getEdge(graphs[0], 6, 21), 1)
    self.assertEquals(getEdge(graphs[0], 7, 8), 1)
    self.assertEquals(getEdge(graphs[0], 7, 22), 1)
    self.assertEquals(getEdge(graphs[0], 8, 9), 1)
    self.assertEquals(getEdge(graphs[0], 8, 23), 1)
    self.assertEquals(getEdge(graphs[0], 9, 14), 1)
    self.assertEquals(getEdge(graphs[0], 9, 10), 1)
    self.assertEquals(getEdge(graphs[0], 10, 11), 1)
    self.assertEquals(getEdge(graphs[0], 11, 12), 1)
    self.assertEquals(getEdge(graphs[0], 11, 24), 1)
    self.assertEquals(getEdge(graphs[0], 12, 13), 1)
    self.assertEquals(getEdge(graphs[0], 12, 25), 1)
    self.assertEquals(getEdge(graphs[0], 13, 14), 1)
    self.assertEquals(getEdge(graphs[0], 13, 15), 1)
    self.assertEquals(getEdge(graphs[0], 14, 26), 1)
    self.assertEquals(getEdge(graphs[0], 15, 16), 1)
    self.assertEquals(getEdge(graphs[0], 15, 17), 1)

    # Check the second graph
    self.assertEquals(graphs[1].getNumVertices(), 19)
    self.assertEquals(graphs[1].getNumEdges(), 20)

    self.assertEquals(getEdge(graphs[1], 1, 10), 1)
    self.assertEquals(getEdge(graphs[1], 1, 2), 1)
    self.assertEquals(getEdge(graphs[1], 1, 14), 1)
    self.assertEquals(getEdge(graphs[1], 2, 3), 1)
    self.assertEquals(getEdge(graphs[1], 2, 15), 1)
    self.assertEquals(getEdge(graphs[1], 3, 8), 1)
    self.assertEquals(getEdge(graphs[1], 3, 4), 1)
    self.assertEquals(getEdge(graphs[1], 4, 5), 1)
    self.assertEquals(getEdge(graphs[1], 4, 16), 1)
    self.assertEquals(getEdge(graphs[1], 5, 6), 1)
    self.assertEquals(getEdge(graphs[1], 5, 17), 1)
    self.assertEquals(getEdge(graphs[1], 6, 7), 1)
    self.assertEquals(getEdge(graphs[1], 6, 18), 1)
    self.assertEquals(getEdge(graphs[1], 7, 8), 1)
    self.assertEquals(getEdge(graphs[1], 8, 9), 1)
    self.assertEquals(getEdge(graphs[1], 9, 10), 1)
    self.assertEquals(getEdge(graphs[1], 9, 11), 1)
    self.assertEquals(getEdge(graphs[1], 10, 19), 1)
    self.assertEquals(getEdge(graphs[1], 11, 12), 1)
    self.assertEquals(getEdge(graphs[1], 11, 13), 1)
def testGetDataDir(self):
    print(PathDefaults.getDataDir())
def testReadFromFile(self):
    vertex1Indices = [0, 2, 3, 4, 5]
    vertex2Indices = [1, 6, 7, 8, 9]

    def genderConv(x):
        genderDict = {'"M"': 0, '"F"': 1}
        return genderDict[x]

    def orientConv(x):
        orientDict = {'"HT"': 0, '"HB"': 1}
        return orientDict[x]

    converters = {2: genderConv, 6: genderConv, 3: orientConv, 7: orientConv}

    csvGraphReader = CsvGraphReader(vertex1Indices, vertex2Indices, converters)
    dir = PathDefaults.getDataDir()
    fileName = dir + "test/infect5.csv"

    graph = csvGraphReader.readFromFile(fileName)

    self.assertTrue((graph.getVertex(0) == numpy.array([0, 0, 28, 1])).all())
    self.assertTrue((graph.getVertex(1) == numpy.array([1, 0, 26, 1])).all())
    self.assertTrue((graph.getVertex(2) == numpy.array([0, 1, 42, 2])).all())
    self.assertTrue((graph.getVertex(3) == numpy.array([1, 0, 33, 1])).all())
    self.assertTrue((graph.getVertex(4) == numpy.array([0, 1, 35, 37])).all())

    self.assertTrue(graph.getEdge(0, 1) == 1)
    self.assertTrue(graph.getEdge(2, 3) == 1)
    self.assertTrue(graph.getEdge(4, 6) == 1)
    self.assertTrue(graph.getEdge(6, 7) == 1)
    self.assertTrue(graph.getEdge(5, 8) == 1)
    self.assertEquals(graph.getNumEdges(), 5)
    self.assertTrue(graph.isUndirected())

    # Test a directed graph
    csvGraphReader = CsvGraphReader(vertex1Indices, vertex2Indices, converters, undirected=False)
    graph = csvGraphReader.readFromFile(fileName)

    self.assertTrue(graph.getEdge(1, 0) == None)
    self.assertTrue(graph.getEdge(3, 2) == None)
    self.assertTrue(graph.getEdge(6, 4) == None)
    self.assertTrue(graph.getEdge(7, 6) == None)
    self.assertTrue(graph.getEdge(8, 5) == None)
    self.assertEquals(graph.getNumEdges(), 5)
    self.assertFalse(graph.isUndirected())

    # Test a graph with no vertex information
    vertex1Indices = [0]
    vertex2Indices = [1]
    fileName = dir + "test/infect5-0.csv"
    csvGraphReader = CsvGraphReader(vertex1Indices, vertex2Indices, {})
    graph = csvGraphReader.readFromFile(fileName)

    self.assertTrue(graph.getEdge(0, 1) == 1)
    self.assertTrue(graph.getEdge(2, 3) == 1)
    self.assertTrue(graph.getEdge(4, 6) == 1)
    self.assertTrue(graph.getEdge(6, 7) == 1)
    self.assertTrue(graph.getEdge(5, 8) == 1)
    self.assertEquals(graph.getNumEdges(), 5)
    self.assertTrue(graph.isUndirected())
    self.assertEquals(graph.getVertexList().getNumFeatures(), 0)