Example #1
0
    def corr(x, y=None, method=None):
        """
        Compute the correlation (matrix) for the input RDD(s) using the
        specified method.
        Methods currently supported: I{pearson (default), spearman}.

        If a single RDD of Vectors is passed in, a correlation matrix
        comparing the columns in the input RDD is returned. Use C{method=}
        to specify the method to be used for single RDD input.
        If two RDDs of floats are passed in, a single float is returned.

        A C{TypeError} is raised when a string is passed as C{y} (the method
        name must be given with C{method=}), and when the single input RDD
        cannot be converted to an RDD of double vectors.

        >>> x = sc.parallelize([1.0, 0.0, -2.0], 2)
        >>> y = sc.parallelize([4.0, 5.0, 3.0], 2)
        >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2)
        >>> abs(Statistics.corr(x, y) - 0.6546537) < 1e-7
        True
        >>> Statistics.corr(x, y) == Statistics.corr(x, y, "pearson")
        True
        >>> Statistics.corr(x, y, "spearman")
        0.5
        >>> from math import isnan
        >>> isnan(Statistics.corr(x, zeros))
        True
        >>> from linalg import Vectors
        >>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
        ...                       Vectors.dense([6, 7, 0,  8]), Vectors.dense([9, 0, 0, 1])])
        >>> Statistics.corr(rdd)
        array([[ 1.        ,  0.05564149,         nan,  0.40047142],
               [ 0.05564149,  1.        ,         nan,  0.91359586],
               [        nan,         nan,  1.        ,         nan],
               [ 0.40047142,  0.91359586,         nan,  1.        ]])
        >>> Statistics.corr(rdd, method="spearman")
        array([[ 1.        ,  0.10540926,         nan,  0.4       ],
               [ 0.10540926,  1.        ,         nan,  0.9486833 ],
               [        nan,         nan,  1.        ,         nan],
               [ 0.4       ,  0.9486833 ,         nan,  1.        ]])
        >>> try:
        ...     Statistics.corr(rdd, "spearman")
        ...     print "Method name as second argument without 'method=' shouldn't be allowed."
        ... except TypeError:
        ...     pass
        """
        sc = x.ctx
        # A method name passed positionally after a single RDD ends up in y.
        # That form is not allowed, so reject any string here (isinstance also
        # catches str subclasses, which an exact type comparison would miss).
        if isinstance(y, str):
            raise TypeError("Use 'method=' to specify method name.")
        # Only an omitted y selects the matrix form. Testing identity with None
        # keeps a falsy non-None y from being silently ignored.
        if y is None:
            try:
                Xser = _get_unmangled_double_vector_rdd(x)
            except TypeError:
                raise TypeError("corr called on a single RDD not consisted of Vectors.")
            resultMat = sc._jvm.PythonMLLibAPI().corr(Xser._jrdd, method)
            return _deserialize_double_matrix(resultMat)
        else:
            # Two-RDD form: both inputs are serialized as doubles and the
            # JVM-side result is returned as-is.
            xSer = _get_unmangled_rdd(x, _serialize_double)
            ySer = _get_unmangled_rdd(y, _serialize_double)
            result = sc._jvm.PythonMLLibAPI().corr(xSer._jrdd, ySer._jrdd, method)
            return result
Example #2
0
    def corr(x, y=None, method=None):
        """
        Compute the correlation (matrix) for the input RDD(s) using the
        specified method.
        Methods currently supported: I{pearson (default), spearman}.

        If a single RDD of Vectors is passed in, a correlation matrix
        comparing the columns in the input RDD is returned. Use C{method=}
        to specify the method to be used for single RDD input.
        If two RDDs of floats are passed in, a single float is returned.

        A C{TypeError} is raised when a string is passed as C{y} (the method
        name must be given with C{method=}), and when the single input RDD
        cannot be converted to an RDD of double vectors.

        >>> x = sc.parallelize([1.0, 0.0, -2.0], 2)
        >>> y = sc.parallelize([4.0, 5.0, 3.0], 2)
        >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2)
        >>> abs(Statistics.corr(x, y) - 0.6546537) < 1e-7
        True
        >>> Statistics.corr(x, y) == Statistics.corr(x, y, "pearson")
        True
        >>> Statistics.corr(x, y, "spearman")
        0.5
        >>> from math import isnan
        >>> isnan(Statistics.corr(x, zeros))
        True
        >>> from linalg import Vectors
        >>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
        ...                       Vectors.dense([6, 7, 0,  8]), Vectors.dense([9, 0, 0, 1])])
        >>> Statistics.corr(rdd)
        array([[ 1.        ,  0.05564149,         nan,  0.40047142],
               [ 0.05564149,  1.        ,         nan,  0.91359586],
               [        nan,         nan,  1.        ,         nan],
               [ 0.40047142,  0.91359586,         nan,  1.        ]])
        >>> Statistics.corr(rdd, method="spearman")
        array([[ 1.        ,  0.10540926,         nan,  0.4       ],
               [ 0.10540926,  1.        ,         nan,  0.9486833 ],
               [        nan,         nan,  1.        ,         nan],
               [ 0.4       ,  0.9486833 ,         nan,  1.        ]])
        >>> try:
        ...     Statistics.corr(rdd, "spearman")
        ...     print "Method name as second argument without 'method=' shouldn't be allowed."
        ... except TypeError:
        ...     pass
        """
        sc = x.ctx
        # A method name passed positionally after a single RDD ends up in y.
        # That form is not allowed, so reject any string here (isinstance also
        # catches str subclasses, which an exact type comparison would miss).
        if isinstance(y, str):
            raise TypeError("Use 'method=' to specify method name.")
        # Only an omitted y selects the matrix form. Testing identity with None
        # keeps a falsy non-None y from being silently ignored.
        if y is None:
            try:
                Xser = _get_unmangled_double_vector_rdd(x)
            except TypeError:
                raise TypeError("corr called on a single RDD not consisted of Vectors.")
            resultMat = sc._jvm.PythonMLLibAPI().corr(Xser._jrdd, method)
            return _deserialize_double_matrix(resultMat)
        else:
            # Two-RDD form: both inputs are serialized as doubles and the
            # JVM-side result is returned as-is.
            xSer = _get_unmangled_rdd(x, _serialize_double)
            ySer = _get_unmangled_rdd(y, _serialize_double)
            result = sc._jvm.PythonMLLibAPI().corr(xSer._jrdd, ySer._jrdd, method)
            return result
Example #3
0
 def train(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1):
     """
     Build a MatrixFactorizationModel by calling C{trainALSModel} on the
     JVM-side PythonMLLibAPI.

     C{ratings} is serialized with C{_serialize_rating}; C{rank},
     C{iterations}, C{lambda_} and C{blocks} are forwarded to the JVM call
     unchanged. The context is taken from C{ratings.context}.
     """
     context = ratings.context
     serialized = _get_unmangled_rdd(ratings, _serialize_rating)
     api = context._jvm.PythonMLLibAPI()
     java_model = api.trainALSModel(serialized._jrdd, rank, iterations, lambda_, blocks)
     return MatrixFactorizationModel(context, java_model)
Example #4
0
 def trainImplicit(cls, ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01):
     """
     Build a MatrixFactorizationModel by calling C{trainImplicitALSModel}
     on the JVM-side PythonMLLibAPI.

     C{ratings} is serialized with C{_serialize_rating}; C{rank},
     C{iterations}, C{lambda_}, C{blocks} and C{alpha} are forwarded to the
     JVM call unchanged. The context is taken from C{ratings.context}.
     """
     context = ratings.context
     serialized = _get_unmangled_rdd(ratings, _serialize_rating)
     api = context._jvm.PythonMLLibAPI()
     java_model = api.trainImplicitALSModel(serialized._jrdd, rank, iterations,
                                            lambda_, blocks, alpha)
     return MatrixFactorizationModel(context, java_model)
Example #5
0
 def trainImplicit(cls, sc, ratings, rank, iterations=5, lambda_=0.01,
                   blocks=-1, alpha=0.01):
     """
     Build a MatrixFactorizationModel by calling C{trainImplicitALSModel}
     on the JVM-side PythonMLLibAPI reached through the given C{sc}.

     C{ratings} is serialized with C{_serialize_rating}; the remaining
     arguments are forwarded to the JVM call unchanged.
     """
     serialized = _get_unmangled_rdd(ratings, _serialize_rating)
     api = sc._jvm.PythonMLLibAPI()
     java_model = api.trainImplicitALSModel(serialized._jrdd, rank, iterations,
                                            lambda_, blocks, alpha)
     return MatrixFactorizationModel(sc, java_model)
 def train(cls, sc, ratings, rank, iterations=5, lambda_=0.01, blocks=-1):
     """
     Build a MatrixFactorizationModel by calling C{trainALSModel} on the
     JVM-side PythonMLLibAPI reached through the given C{sc}.

     C{ratings} is serialized with C{_serialize_rating}; the remaining
     arguments are forwarded to the JVM call unchanged.
     """
     serialized = _get_unmangled_rdd(ratings, _serialize_rating)
     api = sc._jvm.PythonMLLibAPI()
     java_model = api.trainALSModel(serialized._jrdd, rank, iterations, lambda_, blocks)
     return MatrixFactorizationModel(sc, java_model)
 def predictAll(self, usersProducts):
     """
     Serialize C{usersProducts} with C{_serialize_tuple}, pass it to the
     wrapped Java model's C{predict}, and return the result as an RDD
     that uses C{RatingDeserializer}.
     """
     serialized_pairs = _get_unmangled_rdd(usersProducts, _serialize_tuple)
     java_predictions = self._java_model.predict(serialized_pairs._jrdd)
     return RDD(java_predictions, self._context, RatingDeserializer())
Example #8
0
 def predictAll(self, usersProducts):
     """
     Serialize C{usersProducts} with C{_serialize_tuple}, pass it to the
     wrapped Java model's C{predict}, and return the result as an RDD
     that uses C{RatingDeserializer}.
     """
     serialized_pairs = _get_unmangled_rdd(usersProducts, _serialize_tuple)
     java_predictions = self._java_model.predict(serialized_pairs._jrdd)
     return RDD(java_predictions, self._context, RatingDeserializer())