Exemple #1
0
 def test_serialize(self):
     """Check vector conversion and the serialize/deserialize round trip.

     ``_convert_vector`` must hand back SparseVector and NumPy array inputs
     unchanged (the very same object) and turn a plain list into an equal
     array.  Each of the three input kinds must also survive a round trip
     through ``_serialize_double_vector`` / ``_deserialize_double_vector``.
     """
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = array([1., 2., 3., 4.])
     lst = [1, 2, 3, 4]
     # Identity checks: already-supported vector types are not copied.
     self.assertTrue(sv is _convert_vector(sv))
     self.assertTrue(dv is _convert_vector(dv))
     self.assertTrue(array_equal(dv, _convert_vector(lst)))
     # assertEqual is used instead of the deprecated assertEquals alias.
     self.assertEqual(sv, _deserialize_double_vector(_serialize_double_vector(sv)))
     self.assertTrue(array_equal(dv, _deserialize_double_vector(_serialize_double_vector(dv))))
     self.assertTrue(array_equal(dv, _deserialize_double_vector(_serialize_double_vector(lst))))
Exemple #2
0
 def test_serialize(self):
     """Verify ``_convert_vector`` and the double-vector serialization helpers.

     A SparseVector and a NumPy array must pass through ``_convert_vector``
     as the same object, while a list must come back as an equal array.
     Serializing and then deserializing any of the three inputs must yield
     a value equal to the original vector.
     """
     sv = SparseVector(4, {1: 1, 3: 2})
     dv = array([1., 2., 3., 4.])
     lst = [1, 2, 3, 4]
     # Conversion: identity for vector types, value equality for lists.
     self.assertTrue(sv is _convert_vector(sv))
     self.assertTrue(dv is _convert_vector(dv))
     self.assertTrue(array_equal(dv, _convert_vector(lst)))
     # Round trips; assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(sv, _deserialize_double_vector(_serialize_double_vector(sv)))
     self.assertTrue(array_equal(dv, _deserialize_double_vector(_serialize_double_vector(dv))))
     self.assertTrue(array_equal(dv, _deserialize_double_vector(_serialize_double_vector(lst))))
Exemple #3
0
 def test_serialize(self):
     """Check that SciPy sparse column matrices behave like SparseVectors.

     A 4x1 ``lil_matrix`` with entries at rows 1 and 3 is converted (in its
     LIL, CSC, COO, CSR and DOK forms) with ``_convert_vector`` and compared
     against the equivalent SparseVector.  The LIL, CSC, CSR and DOK forms
     are additionally pushed through a serialize/deserialize round trip.
     """
     from scipy.sparse import lil_matrix
     lil = lil_matrix((4, 1))
     lil[1, 0] = 1
     lil[3, 0] = 2
     sv = SparseVector(4, {1: 1, 3: 2})
     # assertEqual is used throughout instead of the deprecated
     # assertEquals alias.
     self.assertEqual(sv, _convert_vector(lil))
     self.assertEqual(sv, _convert_vector(lil.tocsc()))
     self.assertEqual(sv, _convert_vector(lil.tocoo()))
     self.assertEqual(sv, _convert_vector(lil.tocsr()))
     self.assertEqual(sv, _convert_vector(lil.todok()))
     self.assertEqual(sv, _deserialize_double_vector(_serialize_double_vector(lil)))
     self.assertEqual(sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsc())))
     self.assertEqual(sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsr())))
     self.assertEqual(sv, _deserialize_double_vector(_serialize_double_vector(lil.todok())))
Exemple #4
0
 def test_serialize(self):
     """Verify conversion and serialization of SciPy sparse column matrices.

     Builds a 4x1 ``lil_matrix`` holding 1 at row 1 and 2 at row 3, then
     asserts that ``_convert_vector`` maps it and its CSC, COO, CSR and DOK
     variants to the matching SparseVector.  The LIL, CSC, CSR and DOK
     variants must also equal that SparseVector after being serialized and
     deserialized.
     """
     from scipy.sparse import lil_matrix
     lil = lil_matrix((4, 1))
     lil[1, 0] = 1
     lil[3, 0] = 2
     sv = SparseVector(4, {1: 1, 3: 2})
     # Direct conversion of each sparse format (assertEqual replaces the
     # deprecated assertEquals alias).
     self.assertEqual(sv, _convert_vector(lil))
     self.assertEqual(sv, _convert_vector(lil.tocsc()))
     self.assertEqual(sv, _convert_vector(lil.tocoo()))
     self.assertEqual(sv, _convert_vector(lil.tocsr()))
     self.assertEqual(sv, _convert_vector(lil.todok()))
     # Serialize/deserialize round trips.
     self.assertEqual(sv, _deserialize_double_vector(_serialize_double_vector(lil)))
     self.assertEqual(sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsc())))
     self.assertEqual(sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsr())))
     self.assertEqual(sv, _deserialize_double_vector(_serialize_double_vector(lil.todok())))
Exemple #5
0
    def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None):
        """
        Build an RDD of vectors whose entries are i.i.d. samples from the
        Poisson distribution with the given mean.

        The data is produced by the JVM-side PythonMLLibAPI; each element of
        the resulting RDD is wrapped in a bytearray and decoded with
        _deserialize_double_vector.

        @param sc: context supplying the JVM gateway (sc._jvm) and the Java
               context (sc._jsc)
        @param mean: mean of the Poisson distribution
        @param numRows: number of vectors in the RDD (the example below
               collects a numRows x numCols matrix)
        @param numCols: number of elements in each vector
        @param numPartitions: forwarded unchanged to the JVM generator
        @param seed: forwarded unchanged to the JVM generator

        >>> import numpy as np
        >>> mean = 100.0
        >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L)
        >>> mat = np.mat(rdd.collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - mean) < 0.5
        True
        >>> from math import sqrt
        >>> abs(mat.std() - sqrt(mean)) < 0.5
        True
        """
        mllib_api = sc._jvm.PythonMLLibAPI()
        java_rdd = mllib_api.poissonVectorRDD(
            sc._jsc, mean, numRows, numCols, numPartitions, seed)
        raw_rdd = RDD(java_rdd, sc, NoOpSerializer())
        return raw_rdd.map(lambda raw: _deserialize_double_vector(bytearray(raw)))
    def train(cls, data, lambda_=1.0):
        """
        Fit a Naive Bayes model from an RDD of (label, features) vectors.

        The algorithm is Multinomial NB (U{http://tinyurl.com/lsdw6p}), which
        copes with any kind of discrete data.  Turning documents into TF-IDF
        vectors, for instance, makes it usable for document classification,
        and restricting every vector to 0-1 entries makes it usable as
        Bernoulli NB (U{http://tinyurl.com/p7c96j6}).

        @param data: RDD of NumPy vectors, one per element, where the first
               coordinate is the label and the rest is the feature vector
               (e.g. a count vector).
        @param lambda_: The smoothing parameter
        @return: a NaiveBayesModel built from the two vectors and the matrix
               returned by the JVM-side trainer
        """
        context = data.context
        serialized_data = _get_unmangled_double_vector_rdd(data)
        mllib_api = context._jvm.PythonMLLibAPI()
        result = mllib_api.trainNaiveBayes(serialized_data._jrdd, lambda_)
        # The trainer's result is indexed positionally: two serialized
        # vectors followed by one serialized matrix.
        first_vector = _deserialize_double_vector(result[0])
        second_vector = _deserialize_double_vector(result[1])
        matrix = _deserialize_double_matrix(result[2])
        return NaiveBayesModel(first_vector, second_vector, matrix)
Exemple #7
0
    def train(cls, data, lambda_=1.0):
        """
        Train a Naive Bayes classifier on an RDD of (label, features) vectors.

        This implements Multinomial NB (U{http://tinyurl.com/lsdw6p}) and so
        handles all kinds of discrete data: documents converted to TF-IDF
        vectors can be classified with it, and when every vector is a 0-1
        vector it doubles as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).

        @param data: RDD of NumPy vectors, one per element, where the first
               coordinate is the label and the rest is the feature vector
               (e.g. a count vector).
        @param lambda_: The smoothing parameter
        @return: a NaiveBayesModel constructed from the deserialized output
               of the JVM-side trainNaiveBayes call
        """
        spark_context = data.context
        vector_rdd = _get_unmangled_double_vector_rdd(data)
        trained = spark_context._jvm.PythonMLLibAPI().trainNaiveBayes(
            vector_rdd._jrdd, lambda_)
        # Entries 0 and 1 are serialized vectors; entry 2 is a serialized
        # matrix.
        vector_a = _deserialize_double_vector(trained[0])
        vector_b = _deserialize_double_vector(trained[1])
        weights = _deserialize_double_matrix(trained[2])
        return NaiveBayesModel(vector_a, vector_b, weights)
Exemple #8
0
    def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
        """
        Build an RDD of vectors whose entries are i.i.d. samples from the
        standard normal distribution.

        The data is produced by the JVM-side PythonMLLibAPI; each element of
        the resulting RDD is wrapped in a bytearray and decoded with
        _deserialize_double_vector.

        @param sc: context supplying the JVM gateway (sc._jvm) and the Java
               context (sc._jsc)
        @param numRows: number of vectors in the RDD (the example below
               collects a numRows x numCols matrix)
        @param numCols: number of elements in each vector
        @param numPartitions: forwarded unchanged to the JVM generator
        @param seed: forwarded unchanged to the JVM generator

        >>> import numpy as np
        >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1L).collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - 0.0) < 0.1
        True
        >>> abs(mat.std() - 1.0) < 0.1
        True
        """
        mllib_api = sc._jvm.PythonMLLibAPI()
        java_rdd = mllib_api.normalVectorRDD(
            sc._jsc, numRows, numCols, numPartitions, seed)
        raw_rdd = RDD(java_rdd, sc, NoOpSerializer())
        return raw_rdd.map(
            lambda raw: _deserialize_double_vector(bytearray(raw)))
Exemple #9
0
    def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
        """
        Build an RDD of vectors whose entries are i.i.d. samples from the
        uniform distribution U(0.0, 1.0).

        The data is produced by the JVM-side PythonMLLibAPI; each element of
        the resulting RDD is wrapped in a bytearray and decoded with
        _deserialize_double_vector.

        @param sc: context supplying the JVM gateway (sc._jvm) and the Java
               context (sc._jsc)
        @param numRows: number of vectors in the RDD (the example below
               collects a numRows x numCols matrix)
        @param numCols: number of elements in each vector
        @param numPartitions: number of partitions of the generated RDD, as
               shown by the getNumPartitions() example below
        @param seed: forwarded unchanged to the JVM generator

        >>> import numpy as np
        >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
        >>> mat.shape
        (10, 10)
        >>> mat.max() <= 1.0 and mat.min() >= 0.0
        True
        >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
        4
        """
        mllib_api = sc._jvm.PythonMLLibAPI()
        java_rdd = mllib_api.uniformVectorRDD(
            sc._jsc, numRows, numCols, numPartitions, seed)
        raw_rdd = RDD(java_rdd, sc, NoOpSerializer())
        return raw_rdd.map(
            lambda raw: _deserialize_double_vector(bytearray(raw)))
Exemple #10
0
    def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
        """
        Create an RDD of vectors filled with i.i.d. samples drawn from the
        uniform distribution U(0.0, 1.0).

        Generation is delegated to PythonMLLibAPI on the JVM; the elements
        that come back are converted to bytearrays and passed through
        _deserialize_double_vector.

        @param sc: context whose _jvm and _jsc attributes are used to reach
               the JVM
        @param numRows: number of vectors in the RDD (the example below
               collects a numRows x numCols matrix)
        @param numCols: number of elements in each vector
        @param numPartitions: number of partitions of the generated RDD, as
               shown by the getNumPartitions() example below
        @param seed: handed to the JVM generator as-is

        >>> import numpy as np
        >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
        >>> mat.shape
        (10, 10)
        >>> mat.max() <= 1.0 and mat.min() >= 0.0
        True
        >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
        4
        """
        generator_args = (sc._jsc, numRows, numCols, numPartitions, seed)
        jvm_rdd = sc._jvm.PythonMLLibAPI().uniformVectorRDD(*generator_args)
        serialized_rdd = RDD(jvm_rdd, sc, NoOpSerializer())
        return serialized_rdd.map(
            lambda payload: _deserialize_double_vector(bytearray(payload)))
Exemple #11
0
    def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None):
        """
        Create an RDD of vectors filled with i.i.d. samples drawn from the
        standard normal distribution.

        Generation is delegated to PythonMLLibAPI on the JVM; the elements
        that come back are converted to bytearrays and passed through
        _deserialize_double_vector.

        @param sc: context whose _jvm and _jsc attributes are used to reach
               the JVM
        @param numRows: number of vectors in the RDD (the example below
               collects a numRows x numCols matrix)
        @param numCols: number of elements in each vector
        @param numPartitions: handed to the JVM generator as-is
        @param seed: handed to the JVM generator as-is

        >>> import numpy as np
        >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1L).collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - 0.0) < 0.1
        True
        >>> abs(mat.std() - 1.0) < 0.1
        True
        """
        generator_args = (sc._jsc, numRows, numCols, numPartitions, seed)
        jvm_rdd = sc._jvm.PythonMLLibAPI().normalVectorRDD(*generator_args)
        serialized_rdd = RDD(jvm_rdd, sc, NoOpSerializer())
        return serialized_rdd.map(
            lambda payload: _deserialize_double_vector(bytearray(payload)))
Exemple #12
0
    def poissonVectorRDD(sc,
                         mean,
                         numRows,
                         numCols,
                         numPartitions=None,
                         seed=None):
        """
        Create an RDD of vectors filled with i.i.d. samples drawn from the
        Poisson distribution with the input mean.

        Generation is delegated to PythonMLLibAPI on the JVM; the elements
        that come back are converted to bytearrays and passed through
        _deserialize_double_vector.

        @param sc: context whose _jvm and _jsc attributes are used to reach
               the JVM
        @param mean: mean of the Poisson distribution
        @param numRows: number of vectors in the RDD (the example below
               collects a numRows x numCols matrix)
        @param numCols: number of elements in each vector
        @param numPartitions: handed to the JVM generator as-is
        @param seed: handed to the JVM generator as-is

        >>> import numpy as np
        >>> mean = 100.0
        >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L)
        >>> mat = np.mat(rdd.collect())
        >>> mat.shape
        (100, 100)
        >>> abs(mat.mean() - mean) < 0.5
        True
        >>> from math import sqrt
        >>> abs(mat.std() - sqrt(mean)) < 0.5
        True
        """
        generator_args = (sc._jsc, mean, numRows, numCols, numPartitions, seed)
        jvm_rdd = sc._jvm.PythonMLLibAPI().poissonVectorRDD(*generator_args)
        serialized_rdd = RDD(jvm_rdd, sc, NoOpSerializer())
        return serialized_rdd.map(
            lambda payload: _deserialize_double_vector(bytearray(payload)))
Exemple #13
0
 def min(self):
     """Return the Java summary's ``min()`` result as a deserialized double vector."""
     serialized = self._java_summary.min()
     return _deserialize_double_vector(serialized)
Exemple #14
0
 def numNonzeros(self):
     """Return the Java summary's ``numNonzeros()`` result as a deserialized double vector."""
     serialized = self._java_summary.numNonzeros()
     return _deserialize_double_vector(serialized)
Exemple #15
0
 def variance(self):
     """Return the Java summary's ``variance()`` result as a deserialized double vector."""
     serialized = self._java_summary.variance()
     return _deserialize_double_vector(serialized)
 def loads(self, obj):
     """Decode ``obj`` into a double vector.

     ``obj`` is first copied into a ``bytearray`` and then handed to
     ``_deserialize_double_vector``.
     """
     buffer = bytearray(obj)
     return _deserialize_double_vector(buffer)
Exemple #17
0
 def min(self):
     """Deserialize and return what the wrapped Java summary reports for ``min()``."""
     java_result = self._java_summary.min()
     return _deserialize_double_vector(java_result)
Exemple #18
0
 def numNonzeros(self):
     """Deserialize and return what the wrapped Java summary reports for ``numNonzeros()``."""
     java_result = self._java_summary.numNonzeros()
     return _deserialize_double_vector(java_result)
Exemple #19
0
 def variance(self):
     """Deserialize and return what the wrapped Java summary reports for ``variance()``."""
     java_result = self._java_summary.variance()
     return _deserialize_double_vector(java_result)