Code example #1
from numpy import ndarray

from bolt.spark.array import BoltArraySpark
from thunder.series import fromarray, fromrdd


def toseries(y):
    """Coerce a local ndarray or a distributed bolt array into a thunder Series."""
    if type(y) is ndarray:
        y = fromarray(y)
    elif type(y) is BoltArraySpark:
        y = fromrdd(y.tordd())
    return y
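
For orientation, a minimal usage sketch; it assumes only that numpy and thunder are installed, and the sample data is made up:

import numpy as np

# a made-up 10-series, 5-timepoint target matrix
y = np.random.rand(10, 5)

series = toseries(y)   # ndarray -> Series via fromarray
print(series.mode)     # "local"
print(series.shape)    # (10, 5)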
Code example #2
    def score(self, X, y):

        y = toseries(y)

        if y.mode == "spark":
            if not self.models.mode == "spark":
                raise ValueError("model is spark mode, input y must also be spark mode")
            # join models and targets on their shared keys, then score each
            # fitted model (v[0][0]) against its own target record (v[1])
            joined = self.models.tordd().join(y.tordd())
            result = joined.mapValues(lambda v: array([v[0][0].score(X, v[1])]))
            return fromrdd(result, shape=self.models.shape)

        if y.mode == "local":
            if not self.models.mode == "local":
                raise ValueError("model is local mode, input y must also be local mode")
            # look up each model's target record by key and score locally
            return self.models.map(lambda kv: kv[1][0].score(X, y.values[kv[0]]), with_keys=True)
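
The spark branch leans on how RDD.join shapes its values: each joined value is a pair (model_record, y_record), so v[0][0] is the fitted estimator and v[1] is its matching target. Here is a hypothetical local sketch of the same per-key pairing using scikit-learn estimators; the names and data are illustrative, not from the source:

import numpy as np
from sklearn.linear_model import LinearRegression

X = np.random.rand(20, 3)                      # shared design matrix
targets = {0: X @ np.array([1.0, 2.0, 3.0]),   # one target series per key
           1: X @ np.array([-1.0, 0.5, 2.0])}

# one fitted model per key, mirroring self.models
models = {k: LinearRegression().fit(X, t) for k, t in targets.items()}

# per-key scoring: the same pairing the RDD join produces
scores = {k: models[k].score(X, targets[k]) for k in models}
print(scores)  # ~1.0 for both keys, since the targets are exactly linear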
Code example #3
    def predict_and_score(self, X, y):

        y = toseries(y)

        def get_both(model, X, y):
            # concatenate the prediction vector and the scalar score into one record
            return r_[model.predict(X), model.score(X, y)]

        if y.mode == "spark":
            if not self.models.mode == "spark":
                raise ValueError("model is spark mode, input y must also be spark mode")
            joined = self.models.tordd().join(y.tordd())
            both = fromrdd(joined.mapValues(lambda v: get_both(v[0][0], X, v[1])))

        if y.mode == "local":
            if not self.models.mode == "local":
                raise ValueError("model is local mode, input y must also be local mode")
            both = self.models.map(lambda kv: get_both(kv[1][0], X, y.values[kv[0]]), with_keys=True)

        return both
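
A quick numpy check of what get_both returns: r_ concatenates the prediction vector and the scalar score into one flat array. The values below are made up:

from numpy import r_, array

predictions = array([0.1, 0.9, 0.4])   # stand-in for model.predict(X)
score = 0.87                           # stand-in for model.score(X, y)
print(r_[predictions, score])          # [0.1  0.9  0.4  0.87]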
Code example #4
File: NMF.py Project: soul-an/thunder-factorization
    def _fit_spark(self, data):

        from numpy import add, diag, dot, maximum, outer, random, sqrt, apply_along_axis
        from numpy.linalg import inv, norm, pinv
        from thunder.series import fromrdd

        mat = data.tordd()

        # a helper to take the Frobenius norm of the difference of two zippable RDDs
        def rddFrobeniusNorm(A, B):
            return sqrt(A.zip(B).map(lambda kv: ((kv[0][1] - kv[1][1]) ** 2).sum()).reduce(add))

        # input checking
        k = self.k
        if k < 1:
            raise ValueError("Supplied k must be at least 1.")

        # initialize NMF and begin the ALS algorithm
        m = mat.values().first().size
        als_iter = 0
        h_conv_curr = 100  # start above tol so the loop runs at least once

        random.seed(self.seed)
        h = random.rand(k, m)
        w = None

        # goal is to solve R = WH subject to all entries of W, H >= 0
        # by iteratively updating W and H with least squares and clipping negative values
        while (als_iter < self.max_iter) and (h_conv_curr > self.tol):
            # keep the previous H to estimate convergence
            h_old = h

            # precompute pinv(H) = H' * inv(H * H') (easy here because h is a local array)
            # the rows of H should form a basis of dimension k, so H * H' is invertible
            p_inv_h = pinv(h)

            # update W row-wise by least squares: each row of W is R_row * pinv(H)
            w = mat.mapValues(lambda x: dot(x, p_inv_h))

            # clip negative values of W
            w = w.mapValues(lambda x: maximum(x, 0))

            # precompute inv(W' * W) as a local k x k array
            # k was chosen small, so rank(W) = k and W' * W is invertible
            gramian_w = w.values().map(lambda x: outer(x, x)).reduce(add)
            inv_gramian_w = inv(gramian_w)

            # rows of W * inv(W' * W), i.e. pinv(W) transposed, distributed row-wise
            p_inv_w = w.mapValues(lambda x: dot(inv_gramian_w, x))

            # update H by least squares: inv(W' * W) * W' * R (same as pinv(W) * R)
            h = p_inv_w.values().zip(mat.values()).map(lambda v: outer(v[0], v[1])).reduce(add)

            # clip negative values of H
            h = maximum(h, 0)

            # normalize the rows of H, flooring the norms at 0.001 to avoid dividing by zero
            h = dot(diag(1 / maximum(apply_along_axis(norm, 1, h), 0.001)), h)

            # estimate convergence from the change in H
            h_conv_curr = norm(h - h_old)

            # increment count
            als_iter += 1

        shape = (data.shape[0], self.k)
        w = fromrdd(w, nrecords=data.shape[0], shape=shape, dtype=h.dtype)
        return w.values, h
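
To make the algebra easier to follow, here is a purely local NumPy sketch of the same ALS loop, with the distributed row-wise operations collapsed into dense matrix products; the function name and test data are illustrative, not part of the source:

import numpy as np
from numpy.linalg import norm, pinv

def nmf_als_local(R, k, max_iter=100, tol=1e-4, seed=0):
    """Local sketch of the same ALS scheme: solve R ~ W H with W, H >= 0."""
    rng = np.random.RandomState(seed)
    H = rng.rand(k, R.shape[1])
    for _ in range(max_iter):
        H_old = H
        W = np.maximum(R @ pinv(H), 0)   # least-squares update for W, then clip
        H = np.maximum(pinv(W) @ R, 0)   # least-squares update for H, then clip
        H = np.diag(1 / np.maximum(norm(H, axis=1), 0.001)) @ H  # normalize rows
        if norm(H - H_old) <= tol:
            break
    return W, H

# quick check on a small random nonnegative matrix
R = np.random.RandomState(1).rand(50, 8)
W, H = nmf_als_local(R, k=3)
print(norm(R - W @ H) / norm(R))  # relative reconstruction error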