def getAndSetParams(instance: RL, metadata: Dict[str, Any], skipParams: Optional[List[str]] = None) -> None:
    """
    Extract Params from metadata, and set them in the instance.
    """
    # Set user-supplied param values
    for paramName in metadata["paramMap"]:
        param = cast("Params", instance).getParam(paramName)
        if skipParams is None or paramName not in skipParams:
            paramValue = metadata["paramMap"][paramName]
            cast("Params", instance).set(param, paramValue)

    # Set default param values
    majorAndMinorVersions = VersionUtils.majorMinorVersion(metadata["sparkVersion"])
    major = majorAndMinorVersions[0]
    minor = majorAndMinorVersions[1]

    # For metadata file prior to Spark 2.4, there is no default section.
    if major > 2 or (major == 2 and minor >= 4):
        assert "defaultParamMap" in metadata, (
            "Error loading metadata: Expected " + "`defaultParamMap` section not found"
        )

        for paramName in metadata["defaultParamMap"]:
            paramValue = metadata["defaultParamMap"][paramName]
            cast("Params", instance)._setDefault(**{paramName: paramValue})
def getAndSetParams(instance, metadata, skipParams=None):
    """
    Extract Params from metadata, and set them in the instance.
    """
    # Set user-supplied param values
    for paramName in metadata['paramMap']:
        param = instance.getParam(paramName)
        if skipParams is None or paramName not in skipParams:
            paramValue = metadata['paramMap'][paramName]
            instance.set(param, paramValue)

    # Set default param values
    majorAndMinorVersions = VersionUtils.majorMinorVersion(metadata['sparkVersion'])
    major = majorAndMinorVersions[0]
    minor = majorAndMinorVersions[1]

    # For metadata file prior to Spark 2.4, there is no default section.
    if major > 2 or (major == 2 and minor >= 4):
        assert 'defaultParamMap' in metadata, "Error loading metadata: Expected " + \
            "`defaultParamMap` section not found"

        for paramName in metadata['defaultParamMap']:
            paramValue = metadata['defaultParamMap'][paramName]
            instance._setDefault(**{paramName: paramValue})
def _cancel_all_jobs(self):
    if VersionUtils.majorMinorVersion(pyspark.__version__)[0] < 3:
        # Note: there is a bug in `sparkContext.cancelJobGroup`.
        # See https://issues.apache.org/jira/browse/SPARK-31549
        warnings.warn("For Spark versions < 3, the pyspark job cancellation API has bugs, "
                      "so running Spark jobs cannot be terminated correctly. "
                      "See https://issues.apache.org/jira/browse/SPARK-31549 for reference.")
    else:
        self._spark.sparkContext.cancelJobGroup(self._job_group)
def run_on_worker_and_fetch_result():
    # TODO: handle possible spark exception here.  # pylint: disable=fixme
    rdd = self._spark.sparkContext.parallelize([0], 1) \
        .map(lambda _: cloudpickle.dumps(func()))
    if VersionUtils.majorMinorVersion(pyspark.__version__)[0] < 3:
        ser_res = rdd.collect()[0]
    else:
        ser_res = rdd.collectWithJobGroup(self._job_group, "joblib spark jobs")[0]
    return cloudpickle.loads(ser_res)
def _get_max_num_concurrent_tasks(self):
    # maxNumConcurrentTasks() is a package private API
    # pylint: disable=W0212
    pyspark_version = VersionUtils.majorMinorVersion(pyspark.__version__)
    spark_context = self._spark.sparkContext._jsc.sc()
    if pyspark_version < (3, 1):
        return spark_context.maxNumConcurrentTasks()
    return spark_context.maxNumConcurrentTasks(
        spark_context.resourceProfileManager().resourceProfileFromId(0))
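# Why the `pyspark_version < (3, 1)` check above works: majorMinorVersion returns the
# version as a tuple of ints, and Python compares tuples element-wise, so any 2.x or
# 3.0 release sorts before (3, 1). A minimal illustration of that comparison:
assert (2, 4) < (3, 1)
assert (3, 0) < (3, 1)
assert not ((3, 1) < (3, 1))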
def getAndSetParams(instance, metadata):
    """
    Extract Params from metadata, and set them in the instance.
    """
    # Set user-supplied param values
    for paramName in metadata['paramMap']:
        param = instance.getParam(paramName)
        paramValue = metadata['paramMap'][paramName]
        instance.set(param, paramValue)

    # Set default param values
    majorAndMinorVersions = VersionUtils.majorMinorVersion(metadata['sparkVersion'])
    major = majorAndMinorVersions[0]
    minor = majorAndMinorVersions[1]

    # For metadata file prior to Spark 2.4, there is no default section.
    if major > 2 or (major == 2 and minor >= 4):
        assert 'defaultParamMap' in metadata, "Error loading metadata: Expected " + \
            "`defaultParamMap` section not found"

        for paramName in metadata['defaultParamMap']:
            paramValue = metadata['defaultParamMap'][paramName]
            instance._setDefault(**{paramName: paramValue})
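# An illustrative sketch of the metadata that getAndSetParams consumes (the concrete
# values below are hypothetical, not copied from any real model): explicitly set params
# are stored under `paramMap`, and metadata written by Spark 2.4+ also carries a
# `defaultParamMap` section, which is what the version gate above checks for.
metadata = {
    "class": "pyspark.ml.classification.LogisticRegression",
    "sparkVersion": "3.3.0",
    "paramMap": {"maxIter": 20},
    "defaultParamMap": {"maxIter": 100, "tol": 1e-06},
}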
def test_parsing_version_string(self):
    from pyspark.util import VersionUtils
    self.assertRaises(ValueError, lambda: VersionUtils.majorMinorVersion("abced"))
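# A minimal usage sketch (not taken from the snippets above): majorMinorVersion parses a
# Spark version string into a (major, minor) tuple of ints and raises ValueError for
# strings it cannot parse, which is the failure mode the test above exercises.
from pyspark.util import VersionUtils

assert VersionUtils.majorMinorVersion("2.4.0") == (2, 4)
assert VersionUtils.majorMinorVersion("3.4.1") == (3, 4)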
import threading
import time
import timeit
import traceback

from hyperopt import base, fmin, Trials
from hyperopt.base import validate_timeout, validate_loss_threshold
from hyperopt.utils import coarse_utcnow, _get_logger, _get_random_id

try:
    from pyspark.sql import SparkSession
    from pyspark.util import VersionUtils
    import pyspark

    _have_spark = True
    _spark_major_minor_version = VersionUtils.majorMinorVersion(pyspark.__version__)
except ImportError as e:
    _have_spark = False
    _spark_major_minor_version = None

logger = _get_logger("hyperopt-spark")


class SparkTrials(Trials):
    """
    Implementation of hyperopt.Trials supporting distributed execution using Apache Spark
    clusters. This requires fmin to be run on a Spark cluster.

    Plugging SparkTrials into hyperopt.fmin() allows hyperopt to send model training and
    evaluation tasks to Spark workers,