Example #1
    def getAndSetParams(instance: RL,
                        metadata: Dict[str, Any],
                        skipParams: Optional[List[str]] = None) -> None:
        """
        Extract Params from metadata, and set them in the instance.
        """
        # Set user-supplied param values
        for paramName in metadata["paramMap"]:
            param = cast("Params", instance).getParam(paramName)
            if skipParams is None or paramName not in skipParams:
                paramValue = metadata["paramMap"][paramName]
                cast("Params", instance).set(param, paramValue)

        # Set default param values
        majorAndMinorVersions = VersionUtils.majorMinorVersion(
            metadata["sparkVersion"])
        major = majorAndMinorVersions[0]
        minor = majorAndMinorVersions[1]

        # For metadata files written prior to Spark 2.4, there is no default section.
        if major > 2 or (major == 2 and minor >= 4):
            assert "defaultParamMap" in metadata, (
                "Error loading metadata: Expected " +
                "`defaultParamMap` section not found")

            for paramName in metadata["defaultParamMap"]:
                paramValue = metadata["defaultParamMap"][paramName]
                cast("Params", instance)._setDefault(**{paramName: paramValue})
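
For context, a hedged sketch (not taken from the pyspark source) of the metadata dictionary this helper walks over; "maxIter" is only an illustrative param name, but the "paramMap", "defaultParamMap", and "sparkVersion" keys are exactly the ones the code above reads:

    # Hypothetical metadata layout, mirroring what getAndSetParams expects.
    metadata = {
        "sparkVersion": "3.3.0",
        "paramMap": {"maxIter": 20},          # user-supplied values
        "defaultParamMap": {"maxIter": 100},  # defaults, present since Spark 2.4
    }
    # With this input, the helper would call instance.set(param, 20) and
    # instance._setDefault(maxIter=100) on the loaded instance.
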
Example #2
    def getAndSetParams(instance, metadata, skipParams=None):
        """
        Extract Params from metadata, and set them in the instance.
        """
        # Set user-supplied param values
        for paramName in metadata['paramMap']:
            param = instance.getParam(paramName)
            if skipParams is None or paramName not in skipParams:
                paramValue = metadata['paramMap'][paramName]
                instance.set(param, paramValue)

        # Set default param values
        majorAndMinorVersions = VersionUtils.majorMinorVersion(
            metadata['sparkVersion'])
        major = majorAndMinorVersions[0]
        minor = majorAndMinorVersions[1]

        # For metadata files written prior to Spark 2.4, there is no default section.
        if major > 2 or (major == 2 and minor >= 4):
            assert 'defaultParamMap' in metadata, "Error loading metadata: Expected " + \
                "`defaultParamMap` section not found"

            for paramName in metadata['defaultParamMap']:
                paramValue = metadata['defaultParamMap'][paramName]
                instance._setDefault(**{paramName: paramValue})
Example #3
 def _cancel_all_jobs(self):
     if VersionUtils.majorMinorVersion(pyspark.__version__)[0] < 3:
         # Note: there is a known bug in `sparkContext.cancelJobGroup`.
         # See https://issues.apache.org/jira/browse/SPARK-31549
         warnings.warn("For Spark versions < 3, the pyspark job-cancellation API has bugs, "
                       "so running Spark jobs cannot be terminated correctly. "
                       "See https://issues.apache.org/jira/browse/SPARK-31549 for reference.")
     else:
         self._spark.sparkContext.cancelJobGroup(self._job_group)
Example #4
 def run_on_worker_and_fetch_result():
     # TODO: handle possible spark exception here. # pylint: disable=fixme
     rdd = self._spark.sparkContext.parallelize([0], 1) \
         .map(lambda _: cloudpickle.dumps(func()))
     if VersionUtils.majorMinorVersion(pyspark.__version__)[0] < 3:
         ser_res = rdd.collect()[0]
     else:
         ser_res = rdd.collectWithJobGroup(self._job_group, "joblib spark jobs")[0]
     return cloudpickle.loads(ser_res)
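
The heart of the example above is a cloudpickle round-trip: the result of calling func() is serialized inside the Spark map task and deserialized back on the driver. A minimal, Spark-free sketch of just that round-trip (func here is a hypothetical stand-in for the user-supplied callable):

    import cloudpickle

    def func():
        return {"loss": 0.25}              # illustrative return value

    ser_res = cloudpickle.dumps(func())    # what the map task would produce
    result = cloudpickle.loads(ser_res)    # what the driver recovers
    assert result == {"loss": 0.25}
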
Example #5
 def _get_max_num_concurrent_tasks(self):
     # maxNumConcurrentTasks() is a package private API
     # pylint: disable=W0212
     pyspark_version = VersionUtils.majorMinorVersion(pyspark.__version__)
     spark_context = self._spark.sparkContext._jsc.sc()
     if pyspark_version < (3, 1):
         return spark_context.maxNumConcurrentTasks()
     return spark_context.maxNumConcurrentTasks(
         spark_context.resourceProfileManager().resourceProfileFromId(0))
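
The version check above works because VersionUtils.majorMinorVersion returns a (major, minor) tuple of ints, so it can be compared directly against (3, 1). A small illustration, assuming pyspark is installed:

    from pyspark.util import VersionUtils

    assert VersionUtils.majorMinorVersion("3.1.2") == (3, 1)
    assert VersionUtils.majorMinorVersion("2.4.8") < (3, 1)   # takes the older code path
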
Example #6
    def getAndSetParams(instance, metadata):
        """
        Extract Params from metadata, and set them in the instance.
        """
        # Set user-supplied param values
        for paramName in metadata['paramMap']:
            param = instance.getParam(paramName)
            paramValue = metadata['paramMap'][paramName]
            instance.set(param, paramValue)

        # Set default param values
        majorAndMinorVersions = VersionUtils.majorMinorVersion(metadata['sparkVersion'])
        major = majorAndMinorVersions[0]
        minor = majorAndMinorVersions[1]

        # For metadata files written prior to Spark 2.4, there is no default section.
        if major > 2 or (major == 2 and minor >= 4):
            assert 'defaultParamMap' in metadata, "Error loading metadata: Expected " + \
                "`defaultParamMap` section not found"

            for paramName in metadata['defaultParamMap']:
                paramValue = metadata['defaultParamMap'][paramName]
                instance._setDefault(**{paramName: paramValue})
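
The **{paramName: paramValue} on the last line is plain keyword-argument unpacking with a dynamically chosen name. A hedged, Spark-free illustration with a hypothetical stand-in for the Params method:

    def _setDefault(**kwargs):                   # stand-in, not the real Params._setDefault
        print("defaults:", kwargs)

    paramName, paramValue = "maxIter", 100       # illustrative values
    _setDefault(**{paramName: paramValue})       # equivalent to _setDefault(maxIter=100)
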
Example #7
 def test_parsing_version_string(self):
     from pyspark.util import VersionUtils
     self.assertRaises(ValueError,
                       lambda: VersionUtils.majorMinorVersion("abced"))
Example #8
 def test_parsing_version_string(self):
     from pyspark.util import VersionUtils
     self.assertRaises(ValueError, lambda: VersionUtils.majorMinorVersion("abced"))
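
Both tests check that majorMinorVersion raises ValueError for a string it cannot parse as major.minor[...]. A quick sketch of the two behaviors, assuming pyspark is installed:

    from pyspark.util import VersionUtils

    print(VersionUtils.majorMinorVersion("3.0.0"))   # (3, 0)
    try:
        VersionUtils.majorMinorVersion("abced")      # malformed, as in the tests above
    except ValueError as exc:
        print("rejected:", exc)
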
Example #9
import threading
import time
import timeit
import traceback

from hyperopt import base, fmin, Trials
from hyperopt.base import validate_timeout, validate_loss_threshold
from hyperopt.utils import coarse_utcnow, _get_logger, _get_random_id

try:
    from pyspark.sql import SparkSession
    from pyspark.util import VersionUtils
    import pyspark

    _have_spark = True
    _spark_major_minor_version = VersionUtils.majorMinorVersion(
        pyspark.__version__)
except ImportError as e:
    _have_spark = False
    _spark_major_minor_version = None

logger = _get_logger("hyperopt-spark")


class SparkTrials(Trials):
    """
    Implementation of hyperopt.Trials supporting
    distributed execution using Apache Spark clusters.
    This requires fmin to be run on a Spark cluster.

    Plugging SparkTrials into hyperopt.fmin() allows hyperopt
    to send model training and evaluation tasks to Spark workers,