def _test_serialize(self, v):
    """Round-trip ``v`` through pickling and the JVM SerDe and assert equality.

    Checks three paths: Python pickle round trip, a single value sent
    Python -> JVM -> Python, and a batch of 100 copies through the same
    JVM round trip.
    """
    ser = PickleSerializer()
    serde = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe
    # Pure Python pickle round trip.
    self.assertEqual(v, ser.loads(ser.dumps(v)))
    # Single value: Python -> JVM -> Python.
    jvec = serde.loads(bytearray(ser.dumps(v)))
    restored = ser.loads(bytes(serde.dumps(jvec)))
    self.assertEqual(v, restored)
    # Batch of values through the same JVM round trip.
    batch = [v] * 100
    jvecs = serde.loads(bytearray(ser.dumps(batch)))
    restored_batch = ser.loads(bytes(serde.dumps(jvecs)))
    self.assertEqual(batch, restored_batch)
def _fn_and_type(udf_column):
    """Get the python function and sql date type from a spark udf column

    :return: A tuple of (function, dataType)
    """
    # The JVM-side UDF expression carries the pickled (function, dataType)
    # pair as its command payload; unpickle it on the Python side.
    pickled_command = udf_column._jc.expr().func().command()
    return PickleSerializer().loads(pickled_command)
def test_als_ratings_serialize(self):
    """Check that a Rating survives a Python -> JVM -> Python round trip."""
    ser = PickleSerializer()
    serde = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe
    rating = Rating(7, 1123, 3.14)
    jrating = serde.loads(bytearray(ser.dumps(rating)))
    restored = ser.loads(bytes(serde.dumps(jrating)))
    self.assertEqual(rating.user, restored.user)
    self.assertEqual(rating.product, restored.product)
    # Rating value is a float; compare to 2 decimal places.
    self.assertAlmostEqual(rating.rating, restored.rating, 2)
def _regression_train_wrapper(sc, train_func, modelClass, data, initial_weights):
    """Run a JVM-side regression trainer and wrap the result in a model.

    :param sc: SparkContext used to reach the JVM.
    :param train_func: callable invoking the JVM trainer; must return a
        pair of (pickled weights, intercept).
    :param modelClass: model constructor taking (weights, intercept).
    :param data: RDD of records with a ``features`` attribute.
    :param initial_weights: optional starting weights; defaults to zeros
        sized to the feature dimension of the first record.
    :return: an instance of ``modelClass``.
    """
    initial_weights = initial_weights or [0.0] * len(data.first().features)
    ser = PickleSerializer()
    initial_bytes = bytearray(ser.dumps(_convert_to_vector(initial_weights)))
    # use AutoBatchedSerializer before cache to reduce the memory
    # overhead in JVM
    cached = data._reserialize(AutoBatchedSerializer(ser)).cache()
    ans = train_func(cached._to_java_object_rdd(), initial_bytes)
    assert len(ans) == 2, "JVM call result had unexpected length"
    # bytes() rather than str(): on Python 3, str(bytearray) yields the
    # repr ("bytearray(b'...')") instead of the pickled payload, which
    # would corrupt the unpickle. bytes() behaves identically on Python 2.
    weights = ser.loads(bytes(ans[0]))
    return modelClass(weights, ans[1])
def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||"):
    """Train a k-means clustering model.

    :param rdd: RDD of vector-convertible training points.
    :param k: number of clusters.
    :param maxIterations: maximum iterations of the JVM trainer.
    :param runs: number of parallel runs of the algorithm.
    :param initializationMode: "k-means||" or "random".
    :return: a KMeansModel holding the learned cluster centers.
    """
    sc = rdd.context
    ser = PickleSerializer()
    # cache serialized data to avoid objects over head in JVM
    cached = rdd.map(_convert_to_vector)._reserialize(AutoBatchedSerializer(ser)).cache()
    model = sc._jvm.PythonMLLibAPI().trainKMeansModel(
        cached._to_java_object_rdd(), k, maxIterations, runs, initializationMode)
    # Named to avoid shadowing the `bytes` builtin; decoded with bytes()
    # (not str()) so the pickled payload stays valid on Python 3.
    center_bytes = bytes(sc._jvm.SerDe.dumps(model.clusterCenters()))
    centers = ser.loads(center_bytes)
    return KMeansModel([c.toArray() for c in centers])
def _regression_train_wrapper(sc, train_func, modelClass, data, initial_weights):
    """Run a JVM-side regression trainer and wrap the result in a model.

    :param sc: SparkContext used to reach the JVM.
    :param train_func: callable invoking the JVM trainer; must return a
        pair of (pickled weights, intercept).
    :param modelClass: model constructor taking (weights, intercept).
    :param data: RDD of records with a ``features`` attribute.
    :param initial_weights: optional starting weights; defaults to zeros
        sized to the feature dimension of the first record.
    :return: an instance of ``modelClass``.
    """
    initial_weights = initial_weights or [0.0] * len(data.first().features)
    ser = PickleSerializer()
    initial_bytes = bytearray(ser.dumps(_convert_to_vector(initial_weights)))
    # use AutoBatchedSerializer before cache to reduce the memory
    # overhead in JVM
    cached = data._reserialize(AutoBatchedSerializer(ser)).cache()
    ans = train_func(_to_java_object_rdd(cached), initial_bytes)
    assert len(ans) == 2, "JVM call result had unexpected length"
    # bytes() rather than str(): on Python 3, str(bytearray) yields the
    # repr ("bytearray(b'...')") instead of the pickled payload, which
    # would corrupt the unpickle. bytes() behaves identically on Python 2.
    weights = ser.loads(bytes(ans[0]))
    return modelClass(weights, ans[1])
def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||"):
    """Train a k-means clustering model.

    :param rdd: RDD of vector-convertible training points.
    :param k: number of clusters.
    :param maxIterations: maximum iterations of the JVM trainer.
    :param runs: number of parallel runs of the algorithm.
    :param initializationMode: "k-means||" or "random".
    :return: a KMeansModel holding the learned cluster centers.
    """
    sc = rdd.context
    ser = PickleSerializer()
    # cache serialized data to avoid objects over head in JVM
    cached = rdd.map(_convert_to_vector)._reserialize(
        AutoBatchedSerializer(ser)).cache()
    model = sc._jvm.PythonMLLibAPI().trainKMeansModel(
        _to_java_object_rdd(cached), k, maxIterations, runs, initializationMode)
    # Named to avoid shadowing the `bytes` builtin; decoded with bytes()
    # (not str()) so the pickled payload stays valid on Python 3.
    center_bytes = bytes(sc._jvm.SerDe.dumps(model.clusterCenters()))
    centers = ser.loads(center_bytes)
    return KMeansModel([c.toArray() for c in centers])
def findSynonyms(self, x, num):
    """Find synonyms of a word.

    :param x: a word or a vector representation of word
    :param num: number of synonyms to find
    :return: array of (word, cosineSimilarity)

    Note: local use only
    """
    # TODO: make findSynonyms usable in RDD operations from python side
    ser = PickleSerializer()
    # isinstance (not `type(x) == str`) so str subclasses are accepted too.
    if isinstance(x, str):
        jlist = self._java_model.findSynonyms(x, num)
    else:
        # Named to avoid shadowing the `bytes` builtin.
        vec_bytes = bytearray(ser.dumps(_convert_to_vector(x)))
        vec = self._sc._jvm.SerDe.loads(vec_bytes)
        jlist = self._java_model.findSynonyms(vec, num)
    # bytes() rather than str(): on Python 3, str(bytearray) yields the
    # repr, not the pickled payload. bytes() behaves identically on Python 2.
    words, similarity = ser.loads(bytes(self._sc._jvm.SerDe.dumps(jlist)))
    return zip(words, similarity)