def _jrdd(self):
    if self._jrdd_val:
        return self._jrdd_val
    func = self.func
    if not self._bypass_serializer and self.ctx.batchSize != 1:
        # Wrap the user function so its output is grouped into batches of
        # batchSize records before serialization.
        oldfunc = self.func
        batchSize = self.ctx.batchSize

        def batched_func(split, iterator):
            return batched(oldfunc(split, iterator), batchSize)

        func = batched_func
    # Cloudpickle the function and the bypass flag, base64-encode them, and
    # join them into the command string handed to the JVM-side PythonRDD.
    cmds = [func, self._bypass_serializer]
    pipe_command = ' '.join(b64enc(cloudpickle.dumps(f)) for f in cmds)
    broadcast_vars = ListConverter().convert(
        [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
        self.ctx._gateway._gateway_client)
    self.ctx._pickled_broadcast_vars.clear()
    class_manifest = self._prev_jrdd.classManifest()
    env = MapConverter().convert(self.ctx.environment,
                                 self.ctx._gateway._gateway_client)
    includes = ListConverter().convert(self.ctx._python_includes,
                                       self.ctx._gateway._gateway_client)
    python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(),
                                         pipe_command, env, includes,
                                         self.preservesPartitioning,
                                         self.ctx.pythonExec, broadcast_vars,
                                         self.ctx._javaAccumulator,
                                         class_manifest)
    self._jrdd_val = python_rdd.asJavaRDD()
    return self._jrdd_val
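# A minimal sketch of the batching helper referenced above, assuming its job
# is simply to group an iterator into lists of at most batchSize items before
# they are serialized; the real PySpark helper may differ in detail.
def batched(iterator, batchSize):
    items = []
    for item in iterator:
        items.append(item)
        if len(items) == batchSize:
            yield items
            items = []
    if items:
        yield items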
def dumps(self, obj):
    try:
        return cloudpickle.dumps(obj, 2)
    except pickle.PickleError:
        raise
    except Exception as e:
        emsg = _exception_message(e)
        if "'i' format requires" in emsg:
            msg = "Object too large to serialize: %s" % emsg
        else:
            msg = "Could not serialize object: %s: %s" % (e.__class__.__name__, emsg)
        cloudpickle.print_exec(sys.stderr)
        raise pickle.PicklingError(msg)
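# A hedged usage sketch, assuming the dumps() above belongs to PySpark's
# CloudPickleSerializer: an object the pickler cannot handle (here a thread
# lock) is re-raised as pickle.PicklingError with a clearer message.
import pickle
import threading
from pyspark.serializers import CloudPickleSerializer

try:
    CloudPickleSerializer().dumps(threading.Lock())
except pickle.PicklingError as e:
    print("serialization failed:", e)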
def send(self, sock, msg):
    """
    Send ``msg`` to destination ``sock``.

    Args:
        sock: destination socket.
        msg: object to serialize with cloudpickle and send.

    Returns:
        None.
    """
    data = cloudpickle.dumps(msg)
    # Length-prefixed framing: 4-byte big-endian size followed by the payload.
    buf = struct.pack(">I", len(data)) + data
    sock.sendall(buf)
def send(self, sock, msg):
    """
    Send ``msg`` to destination ``sock``.

    Args:
        sock: destination socket.
        msg: object to send; serialized with cloudpickle when Spark is
            available, otherwise assumed to already be bytes.

    Returns:
        None.
    """
    if conf.is_spark_available():
        data = cloudpickle.dumps(msg)
    else:
        data = msg
    # Length-prefixed framing: 4-byte big-endian size followed by the payload.
    buf = struct.pack(">I", len(data)) + data
    sock.sendall(buf)
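# A minimal sketch of the matching receive side, assuming the framing used by
# send() above (a 4-byte big-endian length prefix followed by a cloudpickle
# payload); recv_exactly() is a hypothetical helper, not part of the original.
import struct
import cloudpickle

def recv_exactly(sock, n):
    data = b""
    while len(data) < n:
        chunk = sock.recv(n - len(data))
        if not chunk:
            raise EOFError("socket closed before the full message arrived")
        data += chunk
    return data

def recv(sock):
    (length,) = struct.unpack(">I", recv_exactly(sock, 4))
    return cloudpickle.loads(recv_exactly(sock, length))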
def saveMetadata(instance, path, sc, logger, extraMetadata=None):
    """
    Save the metadata of an xgboost.spark._SparkXGBEstimator or
    xgboost.spark._SparkXGBModel.
    """
    instance._validate_params()
    skipParams = ["callbacks", "xgb_model"]
    jsonParams = {}
    for p, v in instance._paramMap.items():  # pylint: disable=protected-access
        if p.name not in skipParams:
            jsonParams[p.name] = v

    extraMetadata = extraMetadata or {}
    callbacks = instance.getOrDefault(instance.callbacks)
    if callbacks is not None:
        logger.warning(
            "The callbacks parameter is saved using cloudpickle and it "
            "is not a fully self-contained format. It may fail to load "
            "with different versions of dependencies.")
        serialized_callbacks = base64.encodebytes(
            cloudpickle.dumps(callbacks)).decode("ascii")
        extraMetadata["serialized_callbacks"] = serialized_callbacks
    init_booster = instance.getOrDefault(instance.xgb_model)
    if init_booster is not None:
        extraMetadata["init_booster"] = _INIT_BOOSTER_SAVE_PATH
    DefaultParamsWriter.saveMetadata(instance, path, sc,
                                     extraMetadata=extraMetadata,
                                     paramMap=jsonParams)
    if init_booster is not None:
        ser_init_booster = serialize_booster(init_booster)
        save_path = os.path.join(path, _INIT_BOOSTER_SAVE_PATH)
        _get_spark_session().createDataFrame(
            [(ser_init_booster,)], ["init_booster"]).write.parquet(save_path)
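# A hedged round-trip sketch of the callbacks encoding used above (cloudpickle
# then base64 text) and the matching decode a loader would perform; the lambda
# below is only a stand-in for real xgboost callbacks.
import base64
import cloudpickle

callbacks = [lambda env: None]
serialized_callbacks = base64.encodebytes(
    cloudpickle.dumps(callbacks)).decode("ascii")
restored_callbacks = cloudpickle.loads(
    base64.decodebytes(serialized_callbacks.encode("ascii")))
assert callable(restored_callbacks[0])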
def dumps(self, obj):
    return cloudpickle.dumps(obj, 2)
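# A brief round trip showing why cloudpickle (with pickle protocol 2) is used
# here: it can serialize lambdas and closures that the standard pickle module
# rejects.
import cloudpickle

add_one = lambda x: x + 1
restored = cloudpickle.loads(cloudpickle.dumps(add_one, 2))
assert restored(41) == 42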
def invoke_vizier_udf(*args):
    fn = args[0]
    args = args[1:]

    class VizierUDFWrapper:
        def __init__(self):
            self.fn = None

        def export_module_decorator(self, fn):
            self.fn = fn
            return fn

    vizierdb = VizierUDFWrapper()
    # The UDF source is expected to register itself via the decorator above.
    exec(fn)
    return vizierdb.fn(*args)


test_case = """
@vizierdb.export_module_decorator
def apply_foo(a):
    return a + 1
"""

# test it
assert invoke_vizier_udf(test_case, 1) == 2

export = cloudpickle.dumps(invoke_vizier_udf)
# print(sys.argv)
# export_file is assumed to be defined earlier in the module.
with open(export_file, "wb") as out_file:
    out_file.write(export)
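# A hedged sketch of the consuming side: load the pickled wrapper back from
# the file written above and call it; assumes export_file points at that file.
import cloudpickle

with open(export_file, "rb") as in_file:
    restored_udf = cloudpickle.loads(in_file.read())

assert restored_udf(test_case, 1) == 2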
def run_on_worker_and_fetch_result():
    # TODO: handle possible spark exception here.  # pylint: disable=fixme
    rdd = self._spark.sparkContext.parallelize([0], 1) \
        .map(lambda _: cloudpickle.dumps(func()))
    ser_res = rdd.collect()[0]
    return cloudpickle.loads(ser_res)
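# A generalized sketch of the same pattern, assuming an active SparkSession
# `spark` and a zero-argument callable `func` whose result cloudpickle can
# serialize: execute it on a single worker and ship the result back pickled.
import cloudpickle

def run_on_worker(spark, func):
    rdd = spark.sparkContext.parallelize([0], 1).map(
        lambda _: cloudpickle.dumps(func()))
    return cloudpickle.loads(rdd.collect()[0])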