def recommend(learner):
    """
    Cross-validate ``learner`` on the "Toy" coauthor matrix and save the mean
    test error.

    The matrix is read (Matrix Market format) from the "erasm/" folder under
    ``PathDefaults.getOutputDir()``. It is a sparse matrix X such that
    X_ij = k means author i has worked with author j, k times. Only the first
    ``numExamples`` rows are kept and the result is passed through
    ``preprocess``. The non-zero entries are then split into ``numFolds``
    cross validation folds; for each fold a (train, test, learner) tuple is
    evaluated by ``computeTestError`` in a pool of worker processes. The mean
    of the returned errors is logged and saved with ``numpy.savez``.

    :param learner: the model to evaluate. It is handed unchanged to
        ``computeTestError`` and must provide a ``name()`` method, which is
        used to build the results file name.
    """
    outputDir = PathDefaults.getOutputDir() + "erasm/"
    matrixFileName = outputDir + "Toy"

    numExamples = 50
    numFolds = 5

    # csr_matrix already yields CSR format, so no further conversion is needed
    X = scipy.sparse.csr_matrix(scipy.io.mmread(matrixFileName))
    logging.debug("Loaded matrix " + str(X.shape) + " with " + str(X.getnnz()) + " non zeros")
    X = X[0:numExamples, :]
    X, _ = preprocess(X)

    # Take out some ratings to form a training set: shuffle the positions of
    # the non-zero entries, then split those positions into folds
    rowInds, colInds = X.nonzero()
    randInds = numpy.random.permutation(rowInds.shape[0])
    indexList = Sampling.crossValidation(numFolds, rowInds.shape[0])

    paramList = []
    for trnIdx, tstIdx in indexList:
        trainInds = randInds[trnIdx]
        testInds = randInds[tstIdx]

        trainX = SparseUtils.selectMatrix(X, rowInds[trainInds], colInds[trainInds]).tocsr()
        testX = SparseUtils.selectMatrix(X, rowInds[testInds], colInds[testInds]).tocsr()
        paramList.append((trainX, testX, learner))

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    try:
        results = pool.map(computeTestError, paramList)
    finally:
        # Always release the worker processes, even when a fold raises
        pool.close()
        pool.join()

    meanTestErrors = numpy.array(results).mean()
    logging.debug("Test errors = " + str(meanTestErrors))

    # numpy.savez appends ".npz" to the file name when it is not present
    errorFileName = outputDir + "results_" + learner.name()
    numpy.savez(errorFileName, meanTestErrors)
    logging.debug("Saved results as " + errorFileName)
def testSelectMatrix(self):
    """
    Check SparseUtils.selectMatrix when every entry of the first row of a
    random sparse matrix is selected: the first row of the result must equal
    the first row of the input, and every other entry must be zero.
    """
    numRows = 10
    numCols = 10

    A = scipy.sparse.rand(numRows, numCols, 0.5, "csr")

    # Select first row. Row indices are created as integers so that they have
    # the same dtype as the column indices (numpy.zeros defaults to float).
    rowInds = numpy.zeros(numCols, dtype=int)
    colInds = numpy.arange(numCols)

    newA = SparseUtils.selectMatrix(A, rowInds, colInds)

    # The selected row is copied over unchanged
    for i in range(numCols):
        self.assertEqual(A[0, i], newA[0, i])

    # Nothing outside the selected row is present in the result
    for i in range(1, numRows):
        for j in range(numCols):
            self.assertEqual(newA[i, j], 0)