Example #1
    def corr(x, y=None, method=None):
        """
        Compute the correlation (matrix) for the input RDD(s) using the
        specified method.
        Methods currently supported: I{pearson (default), spearman}.

        If a single RDD of Vectors is passed in, a correlation matrix
        comparing the columns in the input RDD is returned. Use C{method=}
        to specify the method to be used for single RDD input.
        If two RDDs of floats are passed in, a single float is returned.

        >>> x = sc.parallelize([1.0, 0.0, -2.0], 2)
        >>> y = sc.parallelize([4.0, 5.0, 3.0], 2)
        >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2)
        >>> abs(Statistics.corr(x, y) - 0.6546537) < 1e-7
        True
        >>> Statistics.corr(x, y) == Statistics.corr(x, y, "pearson")
        True
        >>> Statistics.corr(x, y, "spearman")
        0.5
        >>> from math import isnan
        >>> isnan(Statistics.corr(x, zeros))
        True
        >>> from pyspark.mllib.linalg import Vectors
        >>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
        ...                       Vectors.dense([6, 7, 0,  8]), Vectors.dense([9, 0, 0, 1])])
        >>> pearsonCorr = Statistics.corr(rdd)
        >>> print str(pearsonCorr).replace('nan', 'NaN')
        [[ 1.          0.05564149         NaN  0.40047142]
         [ 0.05564149  1.                 NaN  0.91359586]
         [        NaN         NaN  1.                 NaN]
         [ 0.40047142  0.91359586         NaN  1.        ]]
        >>> spearmanCorr = Statistics.corr(rdd, method="spearman")
        >>> print str(spearmanCorr).replace('nan', 'NaN')
        [[ 1.          0.10540926         NaN  0.4       ]
         [ 0.10540926  1.                 NaN  0.9486833 ]
         [        NaN         NaN  1.                 NaN]
         [ 0.4         0.9486833          NaN  1.        ]]
        >>> try:
        ...     Statistics.corr(rdd, "spearman")
        ...     print "Method name as second argument without 'method=' shouldn't be allowed."
        ... except TypeError:
        ...     pass
        """
        sc = x.ctx
        # Check inputs to determine whether a single value or a matrix is needed for output.
        # Since it's legal for users to use the method name as the second argument, we need to
        # check if y is used to specify the method name instead.
        if type(y) == str:
            raise TypeError("Use 'method=' to specify method name.")

        jx = _to_java_object_rdd(x)
        if not y:
            resultMat = sc._jvm.PythonMLLibAPI().corr(jx, method)
            bytes = sc._jvm.SerDe.dumps(resultMat)
            ser = PickleSerializer()
            return ser.loads(str(bytes)).toArray()
        else:
            jy = _to_java_object_rdd(y)
            return sc._jvm.PythonMLLibAPI().corr(jx, jy, method)
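On the Python side, corr() gets its matrix result back from the JVM as pickled bytes and turns it into a NumPy array with PickleSerializer.loads(...).toArray(). Below is a rough, JVM-free sketch of that unpickling step (a DenseVector stands in for the pickled MLlib object; this is illustrative only, not part of the method above):

from pyspark.serializers import PickleSerializer
from pyspark.mllib.linalg import Vectors

ser = PickleSerializer()
pickled = ser.dumps(Vectors.dense([1.0, 0.0, -2.0]))  # stand-in for sc._jvm.SerDe.dumps(...)
print(ser.loads(pickled).toArray())                   # NumPy array [ 1.  0. -2.]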
Example #2
    def test_group_by_key(self):
        def gen_data(N, step):
            for i in range(1, N + 1, step):
                for j in range(i):
                    yield (i, [j])

        def gen_gs(N, step=1):
            return shuffle.GroupByKey(gen_data(N, step))

        self.assertEqual(1, len(list(gen_gs(1))))
        self.assertEqual(2, len(list(gen_gs(2))))
        self.assertEqual(100, len(list(gen_gs(100))))
        self.assertEqual(list(range(1, 101)), [k for k, _ in gen_gs(100)])
        self.assertTrue(
            all(list(range(k)) == list(vs) for k, vs in gen_gs(100)))

        for k, vs in gen_gs(50002, 10000):
            self.assertEqual(k, len(vs))
            self.assertEqual(list(range(k)), list(vs))

        ser = PickleSerializer()
        l = ser.loads(ser.dumps(list(gen_gs(50002, 30000))))
        for k, vs in l:
            self.assertEqual(k, len(vs))
            self.assertEqual(list(range(k)), list(vs))
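The assertions above all reduce to one invariant: after grouping, every key k maps to the values range(k). As a self-contained illustration of that expected shape (using itertools.groupby as an illustrative stand-in, not shuffle.GroupByKey itself), a small input can be grouped like this:

from itertools import groupby

def gen_data(N, step):
    for i in range(1, N + 1, step):
        for j in range(i):
            yield (i, [j])

# Group the (key, [value]) pairs and flatten the singleton value lists.
grouped = [(k, [v for _, vs in g for v in vs])
           for k, g in groupby(gen_data(4, 1), key=lambda kv: kv[0])]
print(grouped)  # [(1, [0]), (2, [0, 1]), (3, [0, 1, 2]), (4, [0, 1, 2, 3])]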
Example #5
 def __init__(self, vertex_jrdd, edge_jrdd,
              partition_strategy=PartitionStrategy.EdgePartition1D):
     self._vertex_jrdd = VertexRDD(vertex_jrdd, vertex_jrdd.context,
                                   BatchedSerializer(PickleSerializer()))
     self._edge_jrdd = EdgeRDD(edge_jrdd, edge_jrdd.context,
                               BatchedSerializer(PickleSerializer()))
     self._partition_strategy = partition_strategy
     self._jsc = vertex_jrdd.context
Example #6
def _svm_to_java_object_rdd(rdd):
    """ Return a JavaRDD of Object by unpickling

    It will convert each Python object into Java object by Pyrolite, whenever the
    RDD is serialized in batch or not.
    """
    rdd = rdd._reserialize(AutoBatchedSerializer(PickleSerializer()))
    return rdd.ctx._jvm.org.apache.spark.mllib.api.python.LTRSerDe.pythonToJava(
        rdd._jrdd, True)
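The reserialization step above only changes how the RDD's elements are framed before they cross into the JVM: AutoBatchedSerializer(PickleSerializer()) pickles elements in size-adaptive batches. A rough local sketch of that framing, using an in-memory stream instead of an RDD (illustrative only, no JVM involved):

from io import BytesIO
from pyspark.serializers import AutoBatchedSerializer, PickleSerializer

ser = AutoBatchedSerializer(PickleSerializer())
buf = BytesIO()
ser.dump_stream([{"a": 1}, {"b": 2}, {"c": 3}], buf)  # pickled in adaptive batches
buf.seek(0)
print(list(ser.load_stream(buf)))  # [{'a': 1}, {'b': 2}, {'c': 3}]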
Example #7
def serialize(f):
    ser = PickleSerializer()

    @wraps(f)
    def func(self):
        jvec = f(self)
        bytes = self._sc._jvm.SerDe.dumps(jvec)
        return ser.loads(str(bytes)).toArray()

    return func
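A hedged usage sketch of the same wrap/unwrap pattern, with local pickling standing in for the sc._jvm.SerDe.dumps(...) call (the Summary class and serialize_local name are made up for this illustration):

from functools import wraps
from pyspark.serializers import PickleSerializer
from pyspark.mllib.linalg import DenseVector

def serialize_local(f):
    ser = PickleSerializer()

    @wraps(f)
    def func(self):
        vec = f(self)
        data = ser.dumps(vec)  # stand-in for self._sc._jvm.SerDe.dumps(vec)
        return ser.loads(data).toArray()

    return func

class Summary(object):
    @serialize_local
    def mean(self):
        return DenseVector([1.0, 2.0, 3.0])

print(Summary().mean())  # NumPy array [ 1.  2.  3.]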
Example #8
    def train(cls, data, lambda_=1.0):
        """
        Train a Naive Bayes model given an RDD of (label, features) vectors.

        This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which can
        handle all kinds of discrete data.  For example, by converting
        documents into TF-IDF vectors, it can be used for document
        classification.  By making every vector a 0-1 vector, it can also be
        used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).

        @param data: RDD of NumPy vectors, one per element, where the first
               coordinate is the label and the rest is the feature vector
               (e.g. a count vector).
        @param lambda_: The smoothing parameter
        """
        sc = data.context
        jlist = sc._jvm.PythonMLLibAPI().trainNaiveBayes(data._to_java_object_rdd(), lambda_)
        labels, pi, theta = PickleSerializer().loads(str(sc._jvm.SerDe.dumps(jlist)))
        return NaiveBayesModel(labels.toArray(), pi.toArray(), numpy.array(theta))
Example #9
    def train(cls, data, lambda_=1.0):
        """
        Train a Naive Bayes model given an RDD of (label, features) vectors.

        This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which can
        handle all kinds of discrete data.  For example, by converting
        documents into TF-IDF vectors, it can be used for document
        classification.  By making every vector a 0-1 vector, it can also be
        used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).

        :param data: RDD of NumPy vectors, one per element, where the first
               coordinate is the label and the rest is the feature vector
               (e.g. a count vector).
        :param lambda_: The smoothing parameter
        """
        sc = data.context
        jlist = sc._jvm.PythonMLLibAPI().trainNaiveBayes(
            data._to_java_object_rdd(), lambda_)
        labels, pi, theta = PickleSerializer().loads(
            str(sc._jvm.SerDe.dumps(jlist)))
        return NaiveBayesModel(labels.toArray(), pi.toArray(),
                               numpy.array(theta))
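A minimal, hedged usage sketch for train() above, assuming an active SparkContext sc (as in the other examples), that the enclosing class is pyspark.mllib.classification.NaiveBayes, and the input layout described in the docstring (first coordinate is the label, the rest are count features); exact behavior may differ between Spark versions:

import numpy
from pyspark.mllib.classification import NaiveBayes

data = sc.parallelize([
    numpy.array([0.0, 1.0, 0.0]),  # label 0.0, features [1.0, 0.0]
    numpy.array([0.0, 2.0, 0.0]),  # label 0.0, features [2.0, 0.0]
    numpy.array([1.0, 0.0, 1.0]),  # label 1.0, features [0.0, 1.0]
])
model = NaiveBayes.train(data, lambda_=1.0)
print(model.predict(numpy.array([1.0, 0.0])))  # expected to be 0.0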
Example #10
def _svm_py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _svm_to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = [_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.mllib.api.python.LTRSerDe.loads(data)
    return obj
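The final branch above pickles arbitrary Python objects into a bytearray before handing them to the JVM-side SerDe. A local illustration of just that pickling round trip (no JVM; the dict is arbitrary sample data):

from pyspark.serializers import PickleSerializer

ser = PickleSerializer()
payload = bytearray(ser.dumps({"weights": [0.1, 0.2], "label": 1}))
print(ser.loads(bytes(payload)))  # round-trips to the original dict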
Example #11
 def set_sc(cls,
            master=None,
            appName=None,
            sparkHome=None,
            pyFiles=None,
            environment=None,
            batchSize=0,
            serializer=PickleSerializer(),
            conf=None,
            gateway=None,
            jsc=None,
            profiler_cls=BasicProfiler):
     """Creates and initializes a new `SparkContext` (the old one will be stopped).
     Argument signature is copied from `pyspark.SparkContext
     <https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.SparkContext>`_.
     """
     if cls.sc is not None:
         cls.sc.stop()
     cls.sc = SparkContext(master, appName, sparkHome, pyFiles, environment,
                           batchSize, serializer, conf, gateway, jsc,
                           profiler_cls)
     cls.__init_spark()
Example #12
    def __init__(self,
                 jrdd,
                 jrdd_deserializer=AutoBatchedSerializer(PickleSerializer())):
        """
        Constructor
        :param jrdd:               A JavaRDD reference passed from the parent
                                   RDD object
        :param jrdd_deserializer:  The deserializer used in Python workers
                                   created from PythonRDD to execute a
                                   serialized Python function and RDD
        """

        self.name = "VertexRDD"
        self.jrdd = jrdd
        self.is_cached = False
        self.is_checkpointed = False
        self.ctx = SparkContext._active_spark_context
        self.jvertex_rdd_deserializer = jrdd_deserializer
        self.id = jrdd.id()
        self.partitionFunc = None
        self.bypass_serializer = False
        self.preserve_partitioning = False

        self.jvertex_rdd = self.getJavaVertexRDD(jrdd, jrdd_deserializer)
Example #13
myAccum = sc.accumulator(3, MyAccumulatorParam())

myAccum.add(25)

print(myAccum.value)  # 28

print("------------MarshalSerializer---PickleSerializer------StatusTracker--"
      "---SparkJobInfo-----SparkStageInfo----")

from pyspark.serializers import (FramedSerializer, CloudPickleSerializer,
                                 MarshalSerializer, PickleSerializer)
# MarshalSerializer and PickleSerializer also have shortcut imports in pyspark;
# pyspark.serializers contains all of the serializers.
# MarshalSerializer serializes objects with Python's marshal module: it is
# faster than PickleSerializer but supports fewer data types.
x = MarshalSerializer()
# PickleSerializer supports almost any Python object, but may not be as fast
# as the more specialized serializers.
x2 = PickleSerializer()

# sc = SparkContext("local", "serialization app", serializer = MarshalSerializer())
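
# A quick local round trip (illustrative only) with the two serializers created
# above: both expose dumps()/loads(); marshal handles plain built-in types,
# while pickle also copes with custom classes and a wider range of objects.
sample = {"ids": [1, 2, 3], "name": "spark"}
assert x.loads(x.dumps(sample)) == sample    # MarshalSerializer
assert x2.loads(x2.dumps(sample)) == sample  # PickleSerializer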

# Low-level status reporting API
sta = sc.statusTracker()
print(sta)
print("------")
print(sta.getActiveJobsIds())   # returns an array with the ids of all active jobs
print(sta.getActiveStageIds())  # returns an array with the ids of all active stages
# Returns a list of all known jobs in a particular job group; if jobGroup is
# None, returns all known jobs that are not associated with a job group.
print(sta.getJobIdsForGroup())
# print(sta.getJobInfo(jobId))      # returns a SparkJobInfo object, or None if not found or garbage collected
# print(sta.getStageInfo(stageId))  # returns a SparkStageInfo object, or None if not found or garbage collected

print("--Profiler---BasicProfiler----------")