Example 1
def _jrdd(self):
    if self._jrdd_val:
        return self._jrdd_val
    func = self.func
    # Re-batch the user function's output unless serialization is bypassed
    # or batching is disabled (batchSize == 1).
    if not self._bypass_serializer and self.ctx.batchSize != 1:
        oldfunc = self.func
        batchSize = self.ctx.batchSize

        def batched_func(split, iterator):
            return batched(oldfunc(split, iterator), batchSize)

        func = batched_func
    # Pickle the command with cloudpickle and base64-encode it so it can be
    # handed to the JVM as a plain string.
    cmds = [func, self._bypass_serializer]
    pipe_command = ' '.join(b64enc(cloudpickle.dumps(f)) for f in cmds)
    broadcast_vars = ListConverter().convert(
        [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
        self.ctx._gateway._gateway_client)
    self.ctx._pickled_broadcast_vars.clear()
    class_manifest = self._prev_jrdd.classManifest()
    env = MapConverter().convert(self.ctx.environment,
                                 self.ctx._gateway._gateway_client)
    includes = ListConverter().convert(self.ctx._python_includes,
                                       self.ctx._gateway._gateway_client)
    python_rdd = self.ctx._jvm.PythonRDD(
        self._prev_jrdd.rdd(), pipe_command, env, includes,
        self.preservesPartitioning, self.ctx.pythonExec,
        broadcast_vars, self.ctx._javaAccumulator, class_manifest)
    self._jrdd_val = python_rdd.asJavaRDD()
    return self._jrdd_val
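For context, pipe_command is just "pickle, then base64, then join as text". A minimal round-trip sketch, assuming b64enc is base64.standard_b64encode (as in PySpark of that era):

    import base64
    import cloudpickle

    def roundtrip(func):
        # Pickle the function and base64-encode it for transport as text...
        encoded = base64.standard_b64encode(cloudpickle.dumps(func))
        # ...then reverse both steps on the receiving side.
        return cloudpickle.loads(base64.standard_b64decode(encoded))

    double = roundtrip(lambda x: x * 2)
    assert double(21) == 42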
Example 2
def dumps(self, obj):
    try:
        return cloudpickle.dumps(obj, 2)
    except pickle.PickleError:
        raise
    except Exception as e:
        emsg = _exception_message(e)
        # A struct error of this shape means the pickled payload overflowed
        # a 32-bit length field, i.e. the object was too large.
        if "'i' format requires" in emsg:
            msg = "Object too large to serialize: %s" % emsg
        else:
            msg = "Could not serialize object: %s: %s" % (
                e.__class__.__name__, emsg)
        # print_exec is cloudpickle's helper that writes the pending
        # traceback to the given stream.
        cloudpickle.print_exec(sys.stderr)
        raise pickle.PicklingError(msg)
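To see the fallback branch fire, try an unpicklable object; sockets are not picklable even by cloudpickle (the import assumes this method belongs to PySpark's CloudPickleSerializer):

    import pickle
    import socket
    from pyspark.serializers import CloudPickleSerializer  # assumed home of this method

    try:
        CloudPickleSerializer().dumps(socket.socket())
    except pickle.PicklingError as e:
        print(e)  # "Could not serialize object: TypeError: ..."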
Example 3
    def send(self, sock, msg):
        """
        Send ``msg`` to destination ``sock``.

        Args:
            sock: destination socket.
            msg: picklable object to send.

        Returns:
            None.
        """
        data = cloudpickle.dumps(msg)
        # Length-prefixed framing: a 4-byte big-endian size, then the payload.
        buf = struct.pack(">I", len(data)) + data
        sock.sendall(buf)
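A hypothetical receiving counterpart (not part of the original class) reads the 4-byte big-endian length prefix first, then exactly that many payload bytes:

    import struct
    import cloudpickle

    def _recv_exact(sock, n):
        # sock.recv may return fewer than n bytes, so loop until complete.
        buf = b""
        while len(buf) < n:
            chunk = sock.recv(n - len(buf))
            if not chunk:
                raise EOFError("socket closed before full message arrived")
            buf += chunk
        return buf

    def recv(sock):
        (length,) = struct.unpack(">I", _recv_exact(sock, 4))
        return cloudpickle.loads(_recv_exact(sock, length))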
Example 4
    def send(self, sock, msg):
        """
        Send ``msg`` to destination ``sock``.

        Args:
            sock: destination socket.
            msg: object to send; pickled with cloudpickle when Spark is
                available, otherwise assumed to already be bytes.

        Returns:
            None.
        """
        if conf.is_spark_available():
            data = cloudpickle.dumps(msg)
        else:
            data = msg
        # Length-prefixed framing: a 4-byte big-endian size, then the payload.
        buf = struct.pack(">I", len(data)) + data
        sock.sendall(buf)
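Since the wire format is identical either way, a matching receiver only has to mirror the conf check to decide whether to unpickle. A sketch, reusing the hypothetical _recv_exact helper from the previous example (conf comes from the snippet's own module):

    import struct
    import cloudpickle

    def recv(sock):
        (length,) = struct.unpack(">I", _recv_exact(sock, 4))
        data = _recv_exact(sock, length)
        return cloudpickle.loads(data) if conf.is_spark_available() else data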
Example 5
    def saveMetadata(instance, path, sc, logger, extraMetadata=None):
        """
        Save the metadata of an xgboost.spark._SparkXGBEstimator or
        xgboost.spark._SparkXGBModel.
        """
        instance._validate_params()
        skipParams = ["callbacks", "xgb_model"]
        jsonParams = {}
        for p, v in instance._paramMap.items():  # pylint: disable=protected-access
            if p.name not in skipParams:
                jsonParams[p.name] = v

        extraMetadata = extraMetadata or {}
        callbacks = instance.getOrDefault(instance.callbacks)
        if callbacks is not None:
            logger.warning(
                "The callbacks parameter is saved using cloudpickle and it "
                "is not a fully self-contained format. It may fail to load "
                "with different versions of dependencies.")
            serialized_callbacks = base64.encodebytes(
                cloudpickle.dumps(callbacks)).decode("ascii")
            extraMetadata["serialized_callbacks"] = serialized_callbacks
        init_booster = instance.getOrDefault(instance.xgb_model)
        if init_booster is not None:
            extraMetadata["init_booster"] = _INIT_BOOSTER_SAVE_PATH
        DefaultParamsWriter.saveMetadata(instance,
                                         path,
                                         sc,
                                         extraMetadata=extraMetadata,
                                         paramMap=jsonParams)
        if init_booster is not None:
            ser_init_booster = serialize_booster(init_booster)
            save_path = os.path.join(path, _INIT_BOOSTER_SAVE_PATH)
            _get_spark_session().createDataFrame(
                [(ser_init_booster, )],
                ["init_booster"]).write.parquet(save_path)
Example 6
def dumps(self, obj):
    # Pin pickle protocol 2 so the output stays readable across Python versions.
    return cloudpickle.dumps(obj, 2)
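The second argument is the pickle protocol; the result is ordinary pickle data that the standard library can read back:

    import pickle
    import cloudpickle

    data = cloudpickle.dumps({"a": 1}, 2)
    assert pickle.loads(data) == {"a": 1}  # plain pickle can decode it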
Example 7
def dumps(self, obj):
    return cloudpickle.dumps(obj, 2)
Example 8
def invoke_vizier_udf(*args):
    # The first argument is the UDF's source code; the rest are its inputs.
    fn = args[0]
    args = args[1:]

    class VizierUDFWrapper:
        """Stand-in for the vizierdb object the UDF source expects."""

        def __init__(self):
            self.fn = None

        def export_module_decorator(self, fn):
            self.fn = fn
            return fn

    vizierdb = VizierUDFWrapper()
    # Executing the source triggers the decorator, which captures the
    # defined function on the wrapper.
    exec(fn)
    return vizierdb.fn(*args)


test_case = """
@vizierdb.export_module_decorator
def apply_foo(a):
    return a + 1
"""

# test it
assert invoke_vizier_udf(test_case, 1) == 2

export = cloudpickle.dumps(invoke_vizier_udf)

# export_file is defined earlier in the original script (not shown here).
with open(export_file, "wb") as out_file:
    out_file.write(export)
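The consuming side (not shown in the snippet) would read the file back, unpickle the function, and call it:

    import cloudpickle

    # export_file is the same path the snippet wrote to above.
    with open(export_file, "rb") as in_file:
        restored = cloudpickle.loads(in_file.read())
    assert restored(test_case, 1) == 2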
Example 9
def run_on_worker_and_fetch_result():
    # TODO: handle possible spark exception here. # pylint: disable=fixme
    # ``self`` and ``func`` are captured from the enclosing method: run
    # func() on a single Spark worker, then unpickle its result on the driver.
    rdd = self._spark.sparkContext.parallelize([0], 1).map(
        lambda _: cloudpickle.dumps(func()))
    ser_res = rdd.collect()[0]
    return cloudpickle.loads(ser_res)
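A self-contained sketch of the same pattern, assuming a local PySpark session: parallelize one dummy element into a single partition, run the closure on the worker, and unpickle the result on the driver.

    import cloudpickle
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()

    def func():
        import os
        return os.getpid()  # executes on the worker, not the driver

    rdd = spark.sparkContext.parallelize([0], 1).map(
        lambda _: cloudpickle.dumps(func()))
    print(cloudpickle.loads(rdd.collect()[0]))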