Example #1
0
    def centerRows(X, mu=None, inds=None):
        """
        Simply subtract the mean value of a row from each non-zero element.
        The matrix X is modified in place and also returned.

        :param X: The matrix to center. It must provide nonzero(), sum(1),
            shape, dtype and indexing with index arrays (presumably a
            scipy.sparse matrix - confirm against callers).
        :param mu: An array with one value per row of X to subtract. If None,
            it is computed as the row sums divided by the number of indexed
            elements in each row (rows without indexed elements get 0).
        :param inds: A tuple (rowInds, colInds) of the elements to consider.
            If None, X.nonzero() is used.
        :return: The tuple (X, mu).
        """
        #Use identity tests: "== None" on a numpy array is evaluated elementwise
        #and its truth value is ambiguous, which raises a ValueError
        if inds is None:
            rowInds, colInds = X.nonzero()
        else:
            rowInds, colInds = inds

        rowInds = numpy.array(rowInds, numpy.int32)
        colInds = numpy.array(colInds, numpy.int32)

        if mu is None:
            #This is the mean of the nonzero values in each row
            nonZeroCounts = numpy.bincount(rowInds, minlength=X.shape[0])
            #Rows with no elements get a count of 1 to avoid dividing by zero,
            #their mean is reset to 0 below
            emptyRows = nonZeroCounts == 0
            nonZeroCounts += emptyRows
            #This is required because when we do X.sum(1) for centering it uses the same
            #dtype as X to store the sum, and this can result in overflow for e.g. uint8
            if X.dtype == numpy.uint8:
                sumCol = SparseUtilsCython.sumCols(
                    rowInds,
                    numpy.array(X[rowInds, colInds]).flatten(), X.shape[0])
            else:
                sumCol = numpy.array(X.sum(1)).flatten()
            mu = sumCol / nonZeroCounts
            mu[emptyRows] = 0

        #Presumably this gives mu[rowInds[i]] for each indexed element, since the
        #second vector is all ones - confirm against SparseUtilsCython
        #The builtin float replaces the numpy.float alias removed in NumPy 1.24
        vals = SparseUtilsCython.partialOuterProduct(
            rowInds, colInds, numpy.array(mu, float),
            numpy.ones(X.shape[1]))
        #NOTE(review): the update is applied at X.nonzero() rather than at
        #(rowInds, colInds), so vals only lines up with the updated elements
        #when inds matches X.nonzero() - confirm this is intended
        nonZeroInds = X.nonzero()
        X[nonZeroInds] = numpy.array(X[nonZeroInds] - vals, float)

        return X, mu
Example #2
0
    def centerRows(X, mu=None, inds=None):
        """
        Simply subtract the mean value of a row from each non-zero element.
        The matrix X is modified in place and also returned.

        :param X: The matrix to center. It must provide nonzero(), sum(1),
            shape, dtype and indexing with index arrays (presumably a
            scipy.sparse matrix - confirm against callers).
        :param mu: An array with one value per row of X to subtract. If None,
            it is computed as the row sums divided by the number of indexed
            elements in each row (rows without indexed elements get 0).
        :param inds: A tuple (rowInds, colInds) of the elements to consider.
            If None, X.nonzero() is used.
        :return: The tuple (X, mu).
        """
        #Identity tests are needed here: comparing a numpy array with "== None"
        #is elementwise and using the result in an if statement raises a ValueError
        if inds is None:
            rowInds, colInds = X.nonzero()
        else:
            rowInds, colInds = inds

        rowInds = numpy.array(rowInds, numpy.int32)
        colInds = numpy.array(colInds, numpy.int32)

        if mu is None:
            #This is the mean of the nonzero values in each row
            nonZeroCounts = numpy.bincount(rowInds, minlength=X.shape[0])
            #Empty rows are given a count of 1 so the division below is safe,
            #and their mean is set back to 0 afterwards
            emptyRows = nonZeroCounts==0
            nonZeroCounts += emptyRows
            #This is required because when we do X.sum(1) for centering it uses the same
            #dtype as X to store the sum, and this can result in overflow for e.g. uint8
            if X.dtype == numpy.uint8:
                sumCol = SparseUtilsCython.sumCols(rowInds, numpy.array(X[rowInds, colInds]).flatten(), X.shape[0])
            else:
                sumCol = numpy.array(X.sum(1)).flatten()
            mu = sumCol/nonZeroCounts
            mu[emptyRows] = 0

        #Presumably this gives mu[rowInds[i]] for each indexed element, since the
        #second vector is all ones - confirm against SparseUtilsCython
        #numpy.float was removed in NumPy 1.24, the builtin float is equivalent
        vals = SparseUtilsCython.partialOuterProduct(rowInds, colInds, numpy.array(mu, float), numpy.ones(X.shape[1]))
        #NOTE(review): the update is applied at X.nonzero() rather than at
        #(rowInds, colInds), so vals only lines up with the updated elements
        #when inds matches X.nonzero() - confirm this is intended
        nonZeroInds = X.nonzero()
        X[nonZeroInds] = numpy.array(X[nonZeroInds] - vals, float)

        return X, mu
Example #3
0
 def testSumCols(self):
     """
     Check that SparseUtilsCython.sumCols gives the same result as A.sum(1)
     on a random sparse matrix stored with dtype uint8.
     """
     #Random 10 by 15 matrix with density 0.5, scaled by 10 and cast to uint8
     randomMatrix = scipy.sparse.rand(10, 15, 0.5)*10
     A = scipy.sparse.csc_matrix(randomMatrix, dtype=numpy.uint8)

     nonZeroInds = A.nonzero()
     rowInds = numpy.array(nonZeroInds[0], numpy.int32)
     colInds = numpy.array(nonZeroInds[1], numpy.int32)

     #Values of the nonzero elements, in the same order as the indices
     nonZeroVals = numpy.array(A[rowInds, colInds]).flatten()
     numRows = A.shape[0]

     sumCol = SparseUtilsCython.sumCols(rowInds, nonZeroVals, numRows)
     expectedSums = numpy.array(A.sum(1)).flatten()
     nptst.assert_array_equal(expectedSums, sumCol)