def corr(x, y=None, method=None):
    """
    Compute the correlation (matrix) for the input RDD(s) using the
    specified method.
    Methods currently supported: I{pearson (default), spearman}.

    If a single RDD of Vectors is passed in, a correlation matrix
    comparing the columns in the input RDD is returned. Use C{method=}
    to specify the method to be used for single RDD input.
    If two RDDs of floats are passed in, a single float is returned.

    >>> x = sc.parallelize([1.0, 0.0, -2.0], 2)
    >>> y = sc.parallelize([4.0, 5.0, 3.0], 2)
    >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2)
    >>> abs(Statistics.corr(x, y) - 0.6546537) < 1e-7
    True
    >>> Statistics.corr(x, y) == Statistics.corr(x, y, "pearson")
    True
    >>> Statistics.corr(x, y, "spearman")
    0.5
    >>> from math import isnan
    >>> isnan(Statistics.corr(x, zeros))
    True
    >>> from pyspark.mllib.linalg import Vectors
    >>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
    ...                       Vectors.dense([6, 7, 0, 8]), Vectors.dense([9, 0, 0, 1])])
    >>> pearsonCorr = Statistics.corr(rdd)
    >>> print str(pearsonCorr).replace('nan', 'NaN')
    [[ 1.          0.05564149         NaN  0.40047142]
     [ 0.05564149  1.                 NaN  0.91359586]
     [        NaN         NaN  1.                 NaN]
     [ 0.40047142  0.91359586         NaN  1.        ]]
    >>> spearmanCorr = Statistics.corr(rdd, method="spearman")
    >>> print str(spearmanCorr).replace('nan', 'NaN')
    [[ 1.          0.10540926         NaN  0.4       ]
     [ 0.10540926  1.                 NaN  0.9486833 ]
     [        NaN         NaN  1.                 NaN]
     [ 0.4         0.9486833          NaN  1.        ]]
    >>> try:
    ...     Statistics.corr(rdd, "spearman")
    ...     print "Method name as second argument without 'method=' shouldn't be allowed."
    ... except TypeError:
    ...     pass
    """
    sc = x.ctx
    # Check inputs to determine whether a single value or a matrix is needed for output.
    # Since it's legal for users to use the method name as the second argument, we need to
    # check if y is used to specify the method name instead.
    if type(y) == str:
        raise TypeError("Use 'method=' to specify method name.")

    jx = _to_java_object_rdd(x)
    if not y:
        resultMat = sc._jvm.PythonMLLibAPI().corr(jx, method)
        bytes = sc._jvm.SerDe.dumps(resultMat)
        ser = PickleSerializer()
        return ser.loads(str(bytes)).toArray()
    else:
        jy = _to_java_object_rdd(y)
        return sc._jvm.PythonMLLibAPI().corr(jx, jy, method)
def test_group_by_key(self):

    def gen_data(N, step):
        for i in range(1, N + 1, step):
            for j in range(i):
                yield (i, [j])

    def gen_gs(N, step=1):
        return shuffle.GroupByKey(gen_data(N, step))

    self.assertEqual(1, len(list(gen_gs(1))))
    self.assertEqual(2, len(list(gen_gs(2))))
    self.assertEqual(100, len(list(gen_gs(100))))
    self.assertEqual(list(range(1, 101)), [k for k, _ in gen_gs(100)])
    self.assertTrue(all(list(range(k)) == list(vs) for k, vs in gen_gs(100)))

    for k, vs in gen_gs(50002, 10000):
        self.assertEqual(k, len(vs))
        self.assertEqual(list(range(k)), list(vs))

    ser = PickleSerializer()
    l = ser.loads(ser.dumps(list(gen_gs(50002, 30000))))
    for k, vs in l:
        self.assertEqual(k, len(vs))
        self.assertEqual(list(range(k)), list(vs))
def __init__(self, vertex_jrdd, edge_jrdd,
             partition_strategy=PartitionStrategy.EdgePartition1D):
    self._vertex_jrdd = VertexRDD(vertex_jrdd, vertex_jrdd.context,
                                  BatchedSerializer(PickleSerializer()))
    self._edge_jrdd = EdgeRDD(edge_jrdd, edge_jrdd.context,
                              BatchedSerializer(PickleSerializer()))
    self._partition_strategy = partition_strategy
    self._jsc = vertex_jrdd.context
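# A hypothetical construction sketch for the constructor above, assuming it
# belongs to a class named Graph, that a SparkContext `sc` is running, and
# that plain RDDs of vertex/edge tuples are acceptable inputs (the wrapper
# only needs `.context` on them). The tuple layout is illustrative only.
vertices = sc.parallelize([(1, "alice"), (2, "bob")])
edges = sc.parallelize([(1, 2, "follows")])
graph = Graph(vertices, edges)  # defaults to PartitionStrategy.EdgePartition1D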
def _svm_to_java_object_rdd(rdd):
    """
    Return a JavaRDD of Object by unpickling.

    It will convert each Python object into a Java object via Pyrolite,
    whether or not the RDD is serialized in batch.
    """
    rdd = rdd._reserialize(AutoBatchedSerializer(PickleSerializer()))
    return rdd.ctx._jvm.org.apache.spark.mllib.api.python.LTRSerDe.pythonToJava(
        rdd._jrdd, True)
def serialize(f):
    ser = PickleSerializer()

    @wraps(f)
    def func(self):
        jvec = f(self)
        bytes = self._sc._jvm.SerDe.dumps(jvec)
        return ser.loads(str(bytes)).toArray()

    return func
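# A minimal usage sketch for the `serialize` decorator above. It assumes the
# decorated method returns a JVM vector handle and that the instance exposes
# `_sc` (a SparkContext); the wrapper round-trips the handle through SerDe and
# PickleSerializer and hands back a NumPy array. `ModelWrapper` and
# `_java_model` are illustrative names, not part of the original code.
class ModelWrapper(object):
    def __init__(self, sc, java_model):
        self._sc = sc
        self._java_model = java_model

    @serialize
    def weights(self):
        # Returns a JVM-side vector; `serialize` converts it to a NumPy array.
        return self._java_model.weights()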
def train(cls, data, lambda_=1.0):
    """
    Train a Naive Bayes model given an RDD of (label, features) vectors.

    This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which can
    handle all kinds of discrete data. For example, by converting documents
    into TF-IDF vectors, it can be used for document classification.
    By making every vector a 0-1 vector, it can also be used as
    Bernoulli NB (U{http://tinyurl.com/p7c96j6}).

    :param data: RDD of NumPy vectors, one per element, where the first
                 coordinate is the label and the rest is the feature vector
                 (e.g. a count vector).
    :param lambda_: The smoothing parameter
    """
    sc = data.context
    jlist = sc._jvm.PythonMLLibAPI().trainNaiveBayes(
        data._to_java_object_rdd(), lambda_)
    labels, pi, theta = PickleSerializer().loads(
        str(sc._jvm.SerDe.dumps(jlist)))
    return NaiveBayesModel(labels.toArray(), pi.toArray(), numpy.array(theta))
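# A hedged usage sketch for the training method above, following its
# docstring: each element is a NumPy vector whose first coordinate is the
# label and whose remaining coordinates are count-like features. Assumes a
# running SparkContext `sc` and that the method is exposed as a classmethod
# NaiveBayes.train in this codebase.
import numpy

data = sc.parallelize([
    numpy.array([0.0, 1.0, 0.0, 2.0]),  # label 0.0, features [1, 0, 2]
    numpy.array([1.0, 0.0, 3.0, 0.0]),  # label 1.0, features [0, 3, 0]
])
model = NaiveBayes.train(data, lambda_=1.0)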
def _svm_py2java(sc, obj):
    """ Convert Python object into Java """
    if isinstance(obj, RDD):
        obj = _svm_to_java_object_rdd(obj)
    elif isinstance(obj, DataFrame):
        obj = obj._jdf
    elif isinstance(obj, SparkContext):
        obj = obj._jsc
    elif isinstance(obj, list):
        obj = [_py2java(sc, x) for x in obj]
    elif isinstance(obj, JavaObject):
        pass
    elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
        pass
    else:
        data = bytearray(PickleSerializer().dumps(obj))
        obj = sc._jvm.org.apache.spark.mllib.api.python.LTRSerDe.loads(data)
    return obj
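# Illustrative dispatch sketch for _svm_py2java above (assumes `sc` and the
# project's JVM-side LTRSerDe class are available): primitives pass through
# unchanged, RDDs are converted via _svm_to_java_object_rdd, and any other
# Python object falls through to the pickle branch. `Doc` is a throwaway
# class defined here only to exercise that last branch.
class Doc(object):
    def __init__(self, text):
        self.text = text

same = _svm_py2java(sc, 42)                         # primitive: returned unchanged
jrdd = _svm_py2java(sc, sc.parallelize([1.0, 2.0])) # RDD: converted to a JavaRDD
jdoc = _svm_py2java(sc, Doc("example"))             # other object: pickled via LTRSerDe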
def set_sc(cls, master=None, appName=None, sparkHome=None, pyFiles=None,
           environment=None, batchSize=0, serializer=PickleSerializer(),
           conf=None, gateway=None, jsc=None, profiler_cls=BasicProfiler):
    """Creates and initializes a new `SparkContext` (the old one will be stopped).

    Argument signature is copied from `pyspark.SparkContext
    <https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.SparkContext>`_.
    """
    if cls.sc is not None:
        cls.sc.stop()

    cls.sc = SparkContext(master, appName, sparkHome, pyFiles, environment,
                          batchSize, serializer, conf, gateway, jsc,
                          profiler_cls)
    cls.__init_spark()
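# Hypothetical call sketch: the enclosing class is not shown above, so
# `Engine` is a placeholder, and set_sc is assumed to be registered as a
# classmethod on it. The call stops any existing SparkContext and creates a
# fresh local one; the serializer defaults to PickleSerializer() as in the
# signature.
Engine.set_sc(master="local[2]", appName="example",
              serializer=PickleSerializer())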
def __init__(self, jrdd,
             jrdd_deserializer=AutoBatchedSerializer(PickleSerializer())):
    """
    Constructor

    :param jrdd: A JavaRDD reference passed from the parent RDD object
    :param jrdd_deserializer: The deserializer used in Python workers
        created from PythonRDD to execute a serialized Python function
        and RDD
    """
    self.name = "VertexRDD"
    self.jrdd = jrdd
    self.is_cached = False
    self.is_checkpointed = False
    self.ctx = SparkContext._active_spark_context
    self.jvertex_rdd_deserializer = jrdd_deserializer
    self.id = jrdd.id()
    self.partitionFunc = None
    self.bypass_serializer = False
    self.preserve_partitioning = False
    self.jvertex_rdd = self.getJavaVertexRDD(jrdd, jrdd_deserializer)
myAccum = sc.accumulator(3, MyAccumulatorParam())
myAccum.add(25)
print(myAccum.value)  # 28

print("------------MarshalSerializer---PickleSerializer------StatusTracker--"
      "---SparkJobInfo-----SparkStageInfo----")
from pyspark.serializers import FramedSerializer, CloudPickleSerializer
# MarshalSerializer and PickleSerializer have shortcut imports in pyspark;
# pyspark.serializers contains all of the serializers.
# MarshalSerializer serializes objects with Python's marshal module; it is
# faster than PickleSerializer but supports fewer data types.
x = MarshalSerializer()
# This serializer supports almost any Python object, but may not be as fast
# as more specialized serializers.
x2 = PickleSerializer()
# sc = SparkContext("local", "serialization app", serializer=MarshalSerializer())

# Low-level status reporting API
sta = sc.statusTracker()
print(sta)
print("------")
print(sta.getActiveJobsIds())   # Returns an array of the ids of all active jobs.
print(sta.getActiveStageIds())  # Returns an array of the ids of all active stages.
print(sta.getJobIdsForGroup())  # Returns a list of all known jobs in a particular job group.
                                # If jobGroup is None, returns all known jobs that are not
                                # associated with a job group.
# print(sta.getJobInfo(JobsId))     # Returns a SparkJobInfo object, or None if the job info
#                                   # cannot be found or has been garbage collected.
# print(sta.getStageInfo(stageId))  # Returns a SparkStageInfo object, or None if the stage
#                                   # info cannot be found or has been garbage collected.
print("--Profiler---BasicProfiler----------")
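# A small round-trip sketch contrasting the two serializers discussed above:
# MarshalSerializer supports fewer types but is typically faster, while
# PickleSerializer handles almost any Python object. Only pyspark needs to be
# importable; no SparkContext is required for dumps/loads.
from pyspark.serializers import MarshalSerializer, PickleSerializer

payload = {"ids": list(range(5)), "name": "demo"}
for ser in (MarshalSerializer(), PickleSerializer()):
    assert ser.loads(ser.dumps(payload)) == payload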