Example #1
    def test_namedtuple(self):
        from collections import namedtuple
        from pyspark.cloudpickle import dumps, loads

        P = namedtuple("P", "x y")
        p1 = P(1, 3)
        p2 = loads(dumps(p1, 2))
        self.assertEqual(p1, p2)

        P2 = loads(dumps(P))
        p3 = P2(1, 3)
        self.assertEqual(p1, p3)
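What the test exercises is that cloudpickle serializes the namedtuple class itself by value rather than by import path. A minimal standalone sketch (not part of the test suite) of the difference this makes versus the stdlib pickle:

import pickle
from collections import namedtuple
from pyspark.cloudpickle import dumps, loads

# Bind the class under a name that differs from its typename, so the
# stdlib pickle cannot resolve it via "module.qualname" lookup.
Coords = namedtuple("HiddenPoint", "x y")

try:
    pickle.dumps(Coords)
except pickle.PicklingError:
    print("stdlib pickle needs the class to be importable by name")

# cloudpickle ships the class definition itself, so it round-trips
# regardless of where or how the class was created.
Coords2 = loads(dumps(Coords))
assert Coords2(1, 3) == Coords(1, 3)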
Example #2
    def loadMetadataAndInstance(pyspark_xgb_cls, path, sc, logger):
        """
        Load the metadata and the instance of an xgboost.spark._SparkXGBEstimator or
        xgboost.spark._SparkXGBModel.

        :return: a tuple of (metadata, instance)
        """
        metadata = DefaultParamsReader.loadMetadata(
            path, sc, expectedClassName=get_class_name(pyspark_xgb_cls))
        pyspark_xgb = pyspark_xgb_cls()
        DefaultParamsReader.getAndSetParams(pyspark_xgb, metadata)

        if "serialized_callbacks" in metadata:
            serialized_callbacks = metadata["serialized_callbacks"]
            try:
                callbacks = cloudpickle.loads(
                    base64.decodebytes(serialized_callbacks.encode("ascii")))
                pyspark_xgb.set(pyspark_xgb.callbacks, callbacks)
            except Exception as e:  # pylint: disable=W0703
                logger.warning(
                    f"Failed to load the callbacks param due to {e}. Please set the "
                    "callbacks param manually for the loaded estimator.")

        if "init_booster" in metadata:
            load_path = os.path.join(path, metadata["init_booster"])
            ser_init_booster = (_get_spark_session().read.parquet(
                load_path).collect()[0].init_booster)
            init_booster = deserialize_booster(ser_init_booster)
            pyspark_xgb.set(pyspark_xgb.xgb_model, init_booster)

        pyspark_xgb._resetUid(metadata["uid"])  # pylint: disable=protected-access
        return metadata, pyspark_xgb
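The loader above assumes the writer stored the callbacks as base64-encoded cloudpickle bytes inside the text metadata. A hedged sketch of what that save-side encoding could look like (the real xgboost.spark writer is not shown here; serialize_callbacks is a hypothetical helper):

import base64
import cloudpickle

def serialize_callbacks(callbacks):
    # Hypothetical inverse of the load path above: pickle the callback
    # objects, then base64-encode so the bytes fit in text metadata.
    return base64.encodebytes(cloudpickle.dumps(callbacks)).decode("ascii")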
Example #3
    def receive(self, sock):
        """
        Receive a single length-prefixed, cloudpickle-serialized message on ``sock``.

        Args:
            sock: a connected socket to read from.

        Returns:
            The deserialized message object.
        """
        msg = None
        data = b""
        recv_done = False
        recv_len = -1
        while not recv_done:
            buf = sock.recv(BUFSIZE)
            if buf is None or len(buf) == 0:
                raise Exception("socket closed")
            if recv_len == -1:
                # First chunk: a 4-byte big-endian header carries the payload length.
                recv_len = struct.unpack(">I", buf[:4])[0]
                data += buf[4:]
                recv_len -= len(data)
            else:
                # Subsequent chunks: keep reading until the full payload has arrived.
                data += buf
                recv_len -= len(buf)
            recv_done = recv_len == 0

        msg = cloudpickle.loads(data)
        return msg
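For reference, a hedged sketch of the sending side this receiver implies: prefix the pickled payload with its length as an unsigned 4-byte big-endian integer and write it in one go (send here is a hypothetical counterpart, assuming the same cloudpickle module):

import struct
import cloudpickle

def send(sock, msg):
    # Hypothetical counterpart to receive(): serialize, then prepend the
    # payload length as a 4-byte big-endian header.
    payload = cloudpickle.dumps(msg)
    sock.sendall(struct.pack(">I", len(payload)) + payload)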
Example #4
 def run_on_worker_and_fetch_result():
     # TODO: handle possible spark exception here.
     self._spark.sparkContext.setJobGroup(self._job_group,
                                          "joblib spark jobs")
     ser_res = self._spark.sparkContext.parallelize([0], 1) \
         .map(lambda _: cloudpickle.dumps(func())) \
         .first()
     return cloudpickle.loads(ser_res)
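The same dumps-on-the-worker, loads-on-the-driver pattern recurs in the later variants of this helper below; the constant pieces are a one-element, one-partition RDD and a cloudpickle round trip. A minimal self-contained sketch of that round trip (assuming a local pyspark installation; the names here are illustrative):

from pyspark.sql import SparkSession
from pyspark import cloudpickle

spark = SparkSession.builder.master("local[1]").getOrCreate()

def func():
    # Any picklable work; this runs inside the single map task below.
    return {"answer": 42}

ser_res = (spark.sparkContext
           .parallelize([0], 1)                       # one partition, one task
           .map(lambda _: cloudpickle.dumps(func()))  # serialize on the worker
           .first())
result = cloudpickle.loads(ser_res)                   # deserialize on the driver
assert result == {"answer": 42}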
Example #5
 def run_on_worker_and_fetch_result():
     # TODO: handle possible spark exception here. # pylint: disable=fixme
     rdd = self._spark.sparkContext.parallelize([0], 1) \
         .map(lambda _: cloudpickle.dumps(func()))
     if VersionUtils.majorMinorVersion(pyspark.__version__)[0] < 3:
         ser_res = rdd.collect()[0]
     else:
         ser_res = rdd.collectWithJobGroup(self._job_group, "joblib spark jobs")[0]
     return cloudpickle.loads(ser_res)
Example #6
    def __init__(self, jprogress: JavaObject) -> None:
        from pyspark import SparkContext

        self._jprogress: JavaObject = jprogress
        self._id: uuid.UUID = uuid.UUID(jprogress.id().toString())
        self._runId: uuid.UUID = uuid.UUID(jprogress.runId().toString())
        self._name: Optional[str] = jprogress.name()
        self._timestamp: str = jprogress.timestamp()
        self._batchId: int = jprogress.batchId()
        self._batchDuration: int = jprogress.batchDuration()
        self._durationMs: Dict[str, int] = dict(jprogress.durationMs())
        self._eventTime: Dict[str, str] = dict(jprogress.eventTime())
        self._stateOperators: List[StateOperatorProgress] = [
            StateOperatorProgress(js) for js in jprogress.stateOperators()
        ]
        self._sources: List[SourceProgress] = [SourceProgress(js) for js in jprogress.sources()]
        self._sink: SinkProgress = SinkProgress(jprogress.sink())

        self._observedMetrics: Dict[str, Row] = {
            k: cloudpickle.loads(
                SparkContext._jvm.PythonSQLUtils.toPyRow(jr)  # type: ignore[union-attr]
            )
            for k, jr in dict(jprogress.observedMetrics()).items()
        }
Example #7
 def loads(self, obj, encoding="bytes"):
     return cloudpickle.loads(obj, encoding=encoding)
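Since cloudpickle's loads is a thin pass-through to the standard pickle loader, the encoding argument behaves as documented for pickle and only matters when reading pickles produced by Python 2. A small usage sketch under that assumption:

from pyspark import cloudpickle

blob = cloudpickle.dumps({"key": b"raw-bytes"})
# encoding only affects old-style Python 2 str data; for modern pickles
# the default and encoding="bytes" give the same result.
obj = cloudpickle.loads(blob, encoding="bytes")
assert obj == {"key": b"raw-bytes"}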
Example #8
 def run_on_worker_and_fetch_result():
     # TODO: handle possible spark exception here. # pylint: disable=fixme
     rdd = self._spark.sparkContext.parallelize([0], 1) \
         .map(lambda _: cloudpickle.dumps(func()))
     ser_res = rdd.collect()[0]
     return cloudpickle.loads(ser_res)
Example #9
import sys
from os import path
from pyspark import cloudpickle

if len(sys.argv) == 1:
    export_file = path.join(path.dirname(sys.argv[0]),
                            "call_vizier_udf.pickle")
elif len(sys.argv) == 2:
    export_file = sys.argv[1]
else:
    print("usage: python3 export_vizier_udf.py [target_file]")
    sys.exit(-1)

test_case = """
@vizierdb.export_module_decorator
def apply_foo(a):
    return a + 1
"""

with open(export_file, "rb") as in_file:
    invoke_vizier_udf = cloudpickle.loads(in_file.read())

assert (invoke_vizier_udf(test_case, 1) == 2)

print("Success!")