Example #1
    def corr(x, y=None, method=None):
        """
        Compute the correlation (matrix) for the input RDD(s) using the
        specified method.
        Methods currently supported: I{pearson (default), spearman}.

        If a single RDD of Vectors is passed in, a correlation matrix
        comparing the columns in the input RDD is returned. Use C{method=}
        to specify the method to be used for single RDD input.
        If two RDDs of floats are passed in, a single float is returned.

        >>> x = sc.parallelize([1.0, 0.0, -2.0], 2)
        >>> y = sc.parallelize([4.0, 5.0, 3.0], 2)
        >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2)
        >>> abs(Statistics.corr(x, y) - 0.6546537) < 1e-7
        True
        >>> Statistics.corr(x, y) == Statistics.corr(x, y, "pearson")
        True
        >>> Statistics.corr(x, y, "spearman")
        0.5
        >>> from math import isnan
        >>> isnan(Statistics.corr(x, zeros))
        True
        >>> from pyspark.mllib.linalg import Vectors
        >>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
        ...                       Vectors.dense([6, 7, 0,  8]), Vectors.dense([9, 0, 0, 1])])
        >>> pearsonCorr = Statistics.corr(rdd)
        >>> print str(pearsonCorr).replace('nan', 'NaN')
        [[ 1.          0.05564149         NaN  0.40047142]
         [ 0.05564149  1.                 NaN  0.91359586]
         [        NaN         NaN  1.                 NaN]
         [ 0.40047142  0.91359586         NaN  1.        ]]
        >>> spearmanCorr = Statistics.corr(rdd, method="spearman")
        >>> print str(spearmanCorr).replace('nan', 'NaN')
        [[ 1.          0.10540926         NaN  0.4       ]
         [ 0.10540926  1.                 NaN  0.9486833 ]
         [        NaN         NaN  1.                 NaN]
         [ 0.4         0.9486833          NaN  1.        ]]
        >>> try:
        ...     Statistics.corr(rdd, "spearman")
        ...     print "Method name as second argument without 'method=' shouldn't be allowed."
        ... except TypeError:
        ...     pass
        """
        sc = x.ctx
        # Check inputs to determine whether a single value or a matrix is needed for output.
        # Since it's legal for users to use the method name as the second argument, we need to
        # check if y is used to specify the method name instead.
        if type(y) == str:
            raise TypeError("Use 'method=' to specify method name.")

        jx = _to_java_object_rdd(x)
        if not y:
            resultMat = sc._jvm.PythonMLLibAPI().corr(jx, method)
            bytes = sc._jvm.SerDe.dumps(resultMat)
            ser = PickleSerializer()
            return ser.loads(str(bytes)).toArray()
        else:
            jy = _to_java_object_rdd(y)
            return sc._jvm.PythonMLLibAPI().corr(jx, jy, method)
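Every example on this page funnels data through the private helper _to_java_object_rdd. For orientation, here is a minimal sketch of what that helper typically looked like in Spark 1.x; the module it lives in and the JVM entry point (SerDe.pythonToJava below) shift between releases, so treat the details as assumptions rather than the canonical implementation.

    from pyspark.serializers import AutoBatchedSerializer, PickleSerializer

    def _to_java_object_rdd(rdd):
        """Sketch: turn a Python RDD into a JavaRDD of plain Java objects."""
        # Re-serialize in auto-batched pickle form so the JVM can unpickle the
        # elements efficiently, then hand the raw JavaRDD to the JVM-side
        # SerDe helper (exact entry point assumed).
        rdd = rdd._reserialize(AutoBatchedSerializer(PickleSerializer()))
        return rdd.ctx._jvm.SerDe.pythonToJava(rdd._jrdd, True)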
Example #2
    def predict(self, x):
        """
        Predict the label of one or more examples.

        :param x:  Data point (feature vector),
                   or an RDD of data points (feature vectors).
        """
        SerDe = self._sc._jvm.SerDe
        ser = PickleSerializer()
        if isinstance(x, RDD):
            # Bulk prediction
            first = x.take(1)
            if not first:
                return self._sc.parallelize([])
            if not isinstance(first[0], Vector):
                x = x.map(_convert_to_vector)
            jPred = self._java_model.predict(_to_java_object_rdd(x)).toJavaRDD()
            jpyrdd = SerDe.javaToPython(jPred)
            return RDD(jpyrdd, self._sc, BatchedSerializer(ser, 1024))

        else:
            # Assume x is a single data point.
            bytes = bytearray(ser.dumps(_convert_to_vector(x)))
            vec = SerDe.loads(bytes)
            return self._java_model.predict(vec)
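A hedged usage sketch for this predict; model and the data below are placeholders (e.g. a tree model returned by DecisionTree.trainClassifier), and a live SparkContext sc is assumed.

    label = model.predict([0.0, 1.0])                  # single feature vector -> single label
    points = sc.parallelize([[0.0, 1.0], [1.0, 0.0]])  # bulk path: RDD in, RDD of labels out
    labels = model.predict(points).collect()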
Example #3
    def colStats(rdd):
        """
        Computes column-wise summary statistics for the input RDD[Vector].

        >>> from pyspark.mllib.linalg import Vectors
        >>> rdd = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
        ...                       Vectors.dense([4, 5, 0,  3]),
        ...                       Vectors.dense([6, 7, 0,  8])])
        >>> cStats = Statistics.colStats(rdd)
        >>> cStats.mean()
        array([ 4.,  4.,  0.,  3.])
        >>> cStats.variance()
        array([  4.,  13.,   0.,  25.])
        >>> cStats.count()
        3L
        >>> cStats.numNonzeros()
        array([ 3.,  2.,  0.,  3.])
        >>> cStats.max()
        array([ 6.,  7.,  0.,  8.])
        >>> cStats.min()
        array([ 2.,  0.,  0., -2.])
        """
        sc = rdd.ctx
        jrdd = _to_java_object_rdd(rdd)
        cStats = sc._jvm.PythonMLLibAPI().colStats(jrdd)
        return MultivariateStatisticalSummary(sc, cStats)
Example #4
def _py2java(sc, a):
    """ Convert Python object into Java """
    if isinstance(a, RDD):
        a = _to_java_object_rdd(a)
    elif not isinstance(a, (int, long, float, bool, basestring)):
        bytes = bytearray(PickleSerializer().dumps(a))
        a = sc._jvm.SerDe.loads(bytes)
    return a
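A short usage sketch covering the three branches above, assuming a live SparkContext sc:

    from pyspark.mllib.linalg import Vectors

    _py2java(sc, 3.14)                       # primitives pass through unchanged
    _py2java(sc, sc.parallelize([1, 2, 3]))  # RDDs are converted via _to_java_object_rdd
    _py2java(sc, Vectors.dense([1.0, 2.0]))  # anything else is pickled, then unpickled on the JVM side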
Example #5
def _regression_train_wrapper(sc, train_func, modelClass, data, initial_weights):
    initial_weights = initial_weights or [0.0] * len(data.first().features)
    ser = PickleSerializer()
    initial_bytes = bytearray(ser.dumps(_convert_to_vector(initial_weights)))
    # use AutoBatchedSerializer before cache to reduce the memory
    # overhead in JVM
    cached = data._reserialize(AutoBatchedSerializer(ser)).cache()
    ans = train_func(_to_java_object_rdd(cached), initial_bytes)
    assert len(ans) == 2, "JVM call result had unexpected length"
    weights = ser.loads(str(ans[0]))
    return modelClass(weights, ans[1])
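For context, a hedged sketch of how a caller might supply train_func. The JVM method name and argument order are assumptions about this era's PythonMLLibAPI and vary by Spark version; only the shape of the callback matters here.

    def train(cls, data, iterations=100, step=1.0, initialWeights=None):
        sc = data.context
        # train_func receives the cached Java RDD and the pickled initial weights
        train_func = lambda jrdd, i: sc._jvm.PythonMLLibAPI().trainLinearRegressionModelWithSGD(
            jrdd, iterations, step, 1.0, i)  # argument order is an assumption
        return _regression_train_wrapper(sc, train_func, LinearRegressionModel,
                                         data, initialWeights)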
Example #6
 def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||"):
     """Train a k-means clustering model."""
     sc = rdd.context
     ser = PickleSerializer()
     # cache serialized data to avoid object overhead in the JVM
     cached = rdd.map(_convert_to_vector)._reserialize(AutoBatchedSerializer(ser)).cache()
     model = sc._jvm.PythonMLLibAPI().trainKMeansModel(
         _to_java_object_rdd(cached), k, maxIterations, runs, initializationMode
     )
     bytes = sc._jvm.SerDe.dumps(model.clusterCenters())
     centers = ser.loads(str(bytes))
     return KMeansModel([c.toArray() for c in centers])
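A hedged usage sketch, assuming a live SparkContext sc; the NumPy-array input mirrors this era's doctests:

    from numpy import array

    data = sc.parallelize([array([0.0, 0.0]), array([1.0, 1.0]),
                           array([9.0, 8.0]), array([8.0, 9.0])])
    model = KMeans.train(data, 2, maxIterations=10, runs=1, initializationMode="random")
    model.predict(array([0.5, 0.5]))  # index of the nearest cluster center
    model.clusterCenters              # list of cluster centers, one per cluster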
Example #7
 def _prepare(cls, ratings):
     assert isinstance(ratings, RDD), "ratings should be RDD"
     first = ratings.first()
     if not isinstance(first, Rating):
         if isinstance(first, (tuple, list)):
             ratings = ratings.map(lambda x: Rating(*x))
         else:
             raise ValueError("rating should be RDD of Rating or tuple/list")
     # serialize with AutoBatchedSerializer before caching to reduce
     # object overhead in the JVM
     cached = ratings._reserialize(AutoBatchedSerializer(PickleSerializer())).cache()
     return _to_java_object_rdd(cached)
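A small sketch of what this helper accepts (hedged; it assumes the classmethod lives on ALS as in this era's code, with sc a live SparkContext):

    from pyspark.mllib.recommendation import Rating

    raw = sc.parallelize([(1, 1, 5.0), (1, 2, 1.0), (2, 1, 4.0)])
    jrdd = ALS._prepare(raw)  # tuples are promoted to Rating, cached, then handed to the JVM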
Example #8
 def _train(data, type, numClasses, categoricalFeaturesInfo,
            impurity="gini", maxDepth=5, maxBins=32, minInstancesPerNode=1,
            minInfoGain=0.0):
     first = data.first()
     assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
     sc = data.context
     jrdd = _to_java_object_rdd(data)
     cfiMap = MapConverter().convert(categoricalFeaturesInfo,
                                     sc._gateway._gateway_client)
     model = sc._jvm.PythonMLLibAPI().trainDecisionTreeModel(
         jrdd, type, numClasses, cfiMap,
         impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
     return DecisionTreeModel(sc, model)
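In practice this private _train is reached through the public wrappers; a hedged usage sketch (the exact keyword set varies by Spark version, and a live SparkContext sc is assumed):

    from pyspark.mllib.regression import LabeledPoint

    data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(1.0, [1.0])])
    model = DecisionTree.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={})
    model.predict([1.0])  # single-vector prediction on the trained tree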
Example #9
 def _prepare(cls, ratings):
     assert isinstance(ratings, RDD), "ratings should be RDD"
     first = ratings.first()
     if not isinstance(first, Rating):
         if isinstance(first, (tuple, list)):
             ratings = ratings.map(lambda x: Rating(*x))
         else:
             raise ValueError(
                 "rating should be RDD of Rating or tuple/list")
     # serialize with AutoBatchedSerializer before caching to reduce
     # object overhead in the JVM
     cached = ratings._reserialize(AutoBatchedSerializer(
         PickleSerializer())).cache()
     return _to_java_object_rdd(cached)
Example #10
 def predictAll(self, user_product):
     assert isinstance(user_product, RDD), "user_product should be RDD of (user, product)"
     first = user_product.first()
     if isinstance(first, list):
         user_product = user_product.map(tuple)
         first = tuple(first)
     assert type(first) is tuple and len(first) == 2, \
         "user_product should be RDD of (user, product)"
     if any(isinstance(x, str) for x in first):
         user_product = user_product.map(lambda (u, p): (int(u), int(p)))
         first = tuple(map(int, first))
     assert all(type(x) is int for x in first), "user and product in user_product shoul be int"
     sc = self._context
     tuplerdd = sc._jvm.SerDe.asTupleRDD(_to_java_object_rdd(user_product).rdd())
     jresult = self._java_model.predict(tuplerdd).toJavaRDD()
     return RDD(sc._jvm.SerDe.javaToPython(jresult), sc,
                AutoBatchedSerializer(PickleSerializer()))
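A hedged usage sketch, where model stands in for a MatrixFactorizationModel returned by ALS.train and sc is a live SparkContext:

    pairs = sc.parallelize([(1, 1), (1, 2), (2, 1)])
    predictions = model.predictAll(pairs).collect()  # one Rating(user, product, rating) per pair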
Example #11
 def train(cls,
           rdd,
           k,
           maxIterations=100,
           runs=1,
           initializationMode="k-means||"):
     """Train a k-means clustering model."""
     sc = rdd.context
     ser = PickleSerializer()
     # cache serialized data to avoid object overhead in the JVM
     cached = rdd.map(_convert_to_vector)._reserialize(
         AutoBatchedSerializer(ser)).cache()
     model = sc._jvm.PythonMLLibAPI().trainKMeansModel(
         _to_java_object_rdd(cached), k, maxIterations, runs,
         initializationMode)
     bytes = sc._jvm.SerDe.dumps(model.clusterCenters())
     centers = ser.loads(str(bytes))
     return KMeansModel([c.toArray() for c in centers])
Example #12
    def train(cls, data, lambda_=1.0):
        """
        Train a Naive Bayes model given an RDD of (label, features) vectors.

        This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which can
        handle all kinds of discrete data.  For example, by converting
        documents into TF-IDF vectors, it can be used for document
        classification.  By making every vector a 0-1 vector, it can also be
        used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).

        :param data: RDD of NumPy vectors, one per element, where the first
               coordinate is the label and the rest is the feature vector
               (e.g. a count vector).
        :param lambda_: The smoothing parameter
        """
        sc = data.context
        jlist = sc._jvm.PythonMLLibAPI().trainNaiveBayes(_to_java_object_rdd(data), lambda_)
        labels, pi, theta = PickleSerializer().loads(str(sc._jvm.SerDe.dumps(jlist)))
        return NaiveBayesModel(labels.toArray(), pi.toArray(), numpy.array(theta))
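A hedged usage sketch. Note that this era's doctests build the input from LabeledPoint rows rather than label-prefixed raw vectors, so that form is assumed here:

    from numpy import array
    from pyspark.mllib.regression import LabeledPoint

    data = sc.parallelize([LabeledPoint(0.0, [1.0, 0.0]),
                           LabeledPoint(1.0, [0.0, 1.0])])
    model = NaiveBayes.train(data, lambda_=1.0)
    model.predict(array([1.0, 0.0]))  # expected to come back as 0.0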
Example #13
    def fit(self, data):
        """
        Computes the vector representation of each word in vocabulary.

        :param data: training data. RDD of subtype of Iterable[String]
        :return: python Word2VecModel instance
        """
        sc = data.context
        ser = PickleSerializer()
        vectorSize = self.vectorSize
        learningRate = self.learningRate
        numPartitions = self.numPartitions
        numIterations = self.numIterations
        seed = self.seed

        model = sc._jvm.PythonMLLibAPI().trainWord2Vec(
            _to_java_object_rdd(data), vectorSize, learningRate, numPartitions,
            numIterations, seed)
        return Word2VecModel(sc, model)
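A hedged usage sketch, assuming a live SparkContext sc; the corpus is repeated so the words clear the model's default minimum-count threshold:

    sentence = "a b " * 100 + "a c " * 10
    doc = sc.parallelize([sentence, sentence]).map(lambda line: line.split(" "))
    model = Word2Vec().setVectorSize(10).fit(doc)
    model.findSynonyms("a", 2)  # list of (word, similarity) pairs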
Example #14
    def fit(self, data):
        """
        Computes the vector representation of each word in vocabulary.

        :param data: training data. RDD of subtype of Iterable[String]
        :return: python Word2VecModel instance
        """
        sc = data.context
        ser = PickleSerializer()
        vectorSize = self.vectorSize
        learningRate = self.learningRate
        numPartitions = self.numPartitions
        numIterations = self.numIterations
        seed = self.seed

        model = sc._jvm.PythonMLLibAPI().trainWord2Vec(
            _to_java_object_rdd(data), vectorSize,
            learningRate, numPartitions, numIterations, seed)
        return Word2VecModel(sc, model)
Example #15
 def predictAll(self, user_product):
     assert isinstance(user_product,
                       RDD), "user_product should be RDD of (user, product)"
     first = user_product.first()
     if isinstance(first, list):
         user_product = user_product.map(tuple)
         first = tuple(first)
     assert type(first) is tuple and len(first) == 2, \
         "user_product should be RDD of (user, product)"
     if any(isinstance(x, str) for x in first):
         user_product = user_product.map(lambda (u, p): (int(u), int(p)))
         first = tuple(map(int, first))
     assert all(
         type(x) is int
         for x in first), "user and product in user_product shoul be int"
     sc = self._context
     tuplerdd = sc._jvm.SerDe.asTupleRDD(
         _to_java_object_rdd(user_product).rdd())
     jresult = self._java_model.predict(tuplerdd).toJavaRDD()
     return RDD(sc._jvm.SerDe.javaToPython(jresult), sc,
                AutoBatchedSerializer(PickleSerializer()))
Example #16
    def train(cls, data, lambda_=1.0):
        """
        Train a Naive Bayes model given an RDD of (label, features) vectors.

        This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which can
        handle all kinds of discrete data.  For example, by converting
        documents into TF-IDF vectors, it can be used for document
        classification.  By making every vector a 0-1 vector, it can also be
        used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).

        :param data: RDD of NumPy vectors, one per element, where the first
               coordinate is the label and the rest is the feature vector
               (e.g. a count vector).
        :param lambda_: The smoothing parameter
        """
        sc = data.context
        jlist = sc._jvm.PythonMLLibAPI().trainNaiveBayes(
            _to_java_object_rdd(data), lambda_)
        labels, pi, theta = PickleSerializer().loads(
            str(sc._jvm.SerDe.dumps(jlist)))
        return NaiveBayesModel(labels.toArray(), pi.toArray(),
                               numpy.array(theta))