def test_namedtuple(self):
    from collections import namedtuple
    from pyspark.cloudpickle import dumps, loads

    P = namedtuple("P", "x y")
    p1 = P(1, 3)
    p2 = loads(dumps(p1, 2))
    self.assertEqual(p1, p2)

    P2 = loads(dumps(P))
    p3 = P2(1, 3)
    self.assertEqual(p1, p3)
def loadMetadataAndInstance(pyspark_xgb_cls, path, sc, logger):
    """
    Load the metadata and the instance of an xgboost.spark._SparkXGBEstimator or
    xgboost.spark._SparkXGBModel.

    :return: a tuple of (metadata, instance)
    """
    metadata = DefaultParamsReader.loadMetadata(
        path, sc, expectedClassName=get_class_name(pyspark_xgb_cls))
    pyspark_xgb = pyspark_xgb_cls()
    DefaultParamsReader.getAndSetParams(pyspark_xgb, metadata)

    if "serialized_callbacks" in metadata:
        serialized_callbacks = metadata["serialized_callbacks"]
        try:
            callbacks = cloudpickle.loads(
                base64.decodebytes(serialized_callbacks.encode("ascii")))
            pyspark_xgb.set(pyspark_xgb.callbacks, callbacks)
        except Exception as e:  # pylint: disable=W0703
            logger.warning(
                f"Fails to load the callbacks param due to {e}. Please set the "
                "callbacks param manually for the loaded estimator.")

    if "init_booster" in metadata:
        load_path = os.path.join(path, metadata["init_booster"])
        ser_init_booster = (_get_spark_session().read.parquet(
            load_path).collect()[0].init_booster)
        init_booster = deserialize_booster(ser_init_booster)
        pyspark_xgb.set(pyspark_xgb.xgb_model, init_booster)

    pyspark_xgb._resetUid(metadata["uid"])  # pylint: disable=protected-access
    return metadata, pyspark_xgb
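# Hedged sketch of the inverse operation (not the actual xgboost.spark writer,
# whose code is not shown here): the loader above implies that callbacks are
# stored in the metadata as base64-encoded cloudpickle bytes, so a writer would
# serialize them roughly like this.
import base64

import cloudpickle


def serialize_callbacks(callbacks):
    # cloudpickle.dumps -> raw bytes -> base64 -> ASCII string for the metadata
    return base64.encodebytes(cloudpickle.dumps(callbacks)).decode("ascii")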
def receive(self, sock):
    """
    Receive a message on ``sock``

    Args:
        sock: socket to read a length-prefixed, cloudpickle-serialized message from

    Returns:
        the deserialized message object
    """
    msg = None
    data = b""
    recv_done = False
    recv_len = -1
    while not recv_done:
        buf = sock.recv(BUFSIZE)
        if buf is None or len(buf) == 0:
            raise Exception("socket closed")
        if recv_len == -1:
            # the first chunk starts with a 4-byte big-endian payload length
            recv_len = struct.unpack(">I", buf[:4])[0]
            data += buf[4:]
            recv_len -= len(data)
        else:
            data += buf
            recv_len -= len(buf)
        recv_done = recv_len == 0
    msg = cloudpickle.loads(data)
    return msg
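# Hedged companion sketch (assumed, not part of the original snippet; it reuses
# the module-level `struct` and `cloudpickle` imports that receive() relies on):
# the receive loop implies a wire format of a 4-byte big-endian payload-length
# prefix followed by the cloudpickle payload, so a matching send side would be:
def send(self, sock, msg):
    data = cloudpickle.dumps(msg)
    # the length prefix counts only the payload, matching struct.unpack(">I", ...) above
    sock.sendall(struct.pack(">I", len(data)) + data)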
def run_on_worker_and_fetch_result():
    # TODO: handle possible spark exception here.
    self._spark.sparkContext.setJobGroup(self._job_group, "joblib spark jobs")
    ser_res = self._spark.sparkContext.parallelize([0], 1) \
        .map(lambda _: cloudpickle.dumps(func())) \
        .first()
    return cloudpickle.loads(ser_res)
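# Standalone usage sketch of the same pattern (assumes a local SparkSession and
# a picklable zero-argument `func`; all names here are illustrative): the
# function runs on one executor, its result is cloudpickled there, and the
# driver unpickles it.
import cloudpickle
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()


def func():
    return {"answer": 42}


ser_res = (
    spark.sparkContext.parallelize([0], 1)
    .map(lambda _: cloudpickle.dumps(func()))
    .first()
)
assert cloudpickle.loads(ser_res) == {"answer": 42}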
def run_on_worker_and_fetch_result():
    # TODO: handle possible spark exception here.  # pylint: disable=fixme
    rdd = self._spark.sparkContext.parallelize([0], 1) \
        .map(lambda _: cloudpickle.dumps(func()))
    if VersionUtils.majorMinorVersion(pyspark.__version__)[0] < 3:
        ser_res = rdd.collect()[0]
    else:
        ser_res = rdd.collectWithJobGroup(self._job_group, "joblib spark jobs")[0]
    return cloudpickle.loads(ser_res)
def __init__(self, jprogress: JavaObject) -> None:
    from pyspark import SparkContext

    self._jprogress: JavaObject = jprogress
    self._id: uuid.UUID = uuid.UUID(jprogress.id().toString())
    self._runId: uuid.UUID = uuid.UUID(jprogress.runId().toString())
    self._name: Optional[str] = jprogress.name()
    self._timestamp: str = jprogress.timestamp()
    self._batchId: int = jprogress.batchId()
    self._batchDuration: int = jprogress.batchDuration()
    self._durationMs: Dict[str, int] = dict(jprogress.durationMs())
    self._eventTime: Dict[str, str] = dict(jprogress.eventTime())
    self._stateOperators: List[StateOperatorProgress] = [
        StateOperatorProgress(js) for js in jprogress.stateOperators()
    ]
    self._sources: List[SourceProgress] = [SourceProgress(js) for js in jprogress.sources()]
    self._sink: SinkProgress = SinkProgress(jprogress.sink())
    self._observedMetrics: Dict[str, Row] = {
        k: cloudpickle.loads(
            SparkContext._jvm.PythonSQLUtils.toPyRow(jr)  # type: ignore[union-attr]
        )
        for k, jr in dict(jprogress.observedMetrics()).items()
    }
def loads(self, obj, encoding="bytes"):
    return cloudpickle.loads(obj, encoding=encoding)
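# Minimal round-trip sketch (assuming this `loads` is pyspark's
# CloudPickleSerializer.loads, which has this body; if it is a different
# wrapper, substitute that class): bytes from the matching `dumps` feed
# straight back into `loads`, including for lambdas that the stdlib pickler
# cannot serialize.
from pyspark.serializers import CloudPickleSerializer

ser = CloudPickleSerializer()
payload = ser.dumps(lambda x: x + 1)
assert ser.loads(payload)(41) == 42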
def run_on_worker_and_fetch_result():
    # TODO: handle possible spark exception here.  # pylint: disable=fixme
    rdd = self._spark.sparkContext.parallelize([0], 1) \
        .map(lambda _: cloudpickle.dumps(func()))
    ser_res = rdd.collect()[0]
    return cloudpickle.loads(ser_res)
import sys
from os import path

from pyspark import cloudpickle

if len(sys.argv) == 1:
    export_file = path.join(path.dirname(sys.argv[0]), "call_vizier_udf.pickle")
elif len(sys.argv) == 2:
    export_file = sys.argv[1]
else:
    print("usage: python3 export_vizier_udf.py [target_file]")
    exit(-1)

test_case = """
@vizierdb.export_module_decorator
def apply_foo(a):
    return a + 1
"""

with open(export_file, "rb") as in_file:
    invoke_vizier_udf = cloudpickle.loads(in_file.read())

assert (invoke_vizier_udf(test_case, 1) == 2)

print("Success!")
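# Hedged sketch of the export side (hypothetical; the script that produces
# call_vizier_udf.pickle is not shown, and `invoke_vizier_udf` below is only a
# stand-in for the real Vizier invoker): the file read above would be written
# by cloudpickling a callable to raw bytes.
from pyspark import cloudpickle


def invoke_vizier_udf(source, *args):
    raise NotImplementedError  # placeholder body; the real invoker executes `source`


with open("call_vizier_udf.pickle", "wb") as out_file:
    out_file.write(cloudpickle.dumps(invoke_vizier_udf))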