Example no. 1
class HasMaxIter(Params):
    """
    Mixin for param maxIter: max number of iterations (>= 0).
    """

    maxIter = Param(Params._dummy(), "maxIter",
                    "max number of iterations (>= 0).", int)

    def __init__(self):
        super(HasMaxIter, self).__init__()

    def setMaxIter(self, value):
        """
        Sets the value of :py:attr:`maxIter`.
        """
        self._set(maxIter=value)
        return self

    def getMaxIter(self):
        """
        Gets the value of maxIter or its default value.
        """
        return self.getOrDefault(self.maxIter)
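
The mixin above is meant to be mixed into a concrete Params class. A minimal composition sketch built on HasMaxIter as defined above (the class name and default value are illustrative, not taken from any library):

class MyIterativeParams(HasMaxIter):     # hypothetical composite class, for illustration only
    def __init__(self):
        super(MyIterativeParams, self).__init__()
        self._setDefault(maxIter=10)     # assumed default for this sketch

p = MyIterativeParams()
p.getMaxIter()       # -> 10, falls back to the default
p.setMaxIter(25)     # returns self, so calls can be chained
p.getMaxIter()       # -> 25, the explicitly set value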
Example no. 2
class _TrainValidationSplitParams(_ValidatorParams):
    """
    Params for :py:class:`TrainValidationSplit` and :py:class:`TrainValidationSplitModel`.

    .. versionadded:: 3.0.0
    """

    trainRatio = Param(Params._dummy(),
                       "trainRatio",
                       "Param for ratio between train and\
     validation data. Must be between 0 and 1.",
                       typeConverter=TypeConverters.toFloat)

    def __init__(self, *args):
        super(_TrainValidationSplitParams, self).__init__(*args)
        self._setDefault(trainRatio=0.75)

    @since("2.0.0")
    def getTrainRatio(self):
        """
        Gets the value of trainRatio or its default value.
        """
        return self.getOrDefault(self.trainRatio)
Example no. 3
class HasOutputCol(Params):
    """
    Mixin for param outputCol: output column name.
    """

    outputCol = Param(Params._dummy(), "outputCol", "output column name.",
                      typeConverter=TypeConverters.toString)

    def __init__(self):
        super(HasOutputCol, self).__init__()
        self._setDefault(outputCol=self.uid + '__output')

    def setOutputCol(self, value):
        """
        Sets the value of :py:attr:`outputCol`.
        """
        return self._set(outputCol=value)

    def getOutputCol(self):
        """
        Gets the value of outputCol or its default value.
        """
        return self.getOrDefault(self.outputCol)
Example no. 4
class HasTol(Params):
    """
    Mixin for param tol: the convergence tolerance for iterative algorithms.
    """

    tol = Param(Params._dummy(), "tol",
                "the convergence tolerance for iterative algorithms.", float)

    def __init__(self):
        super(HasTol, self).__init__()

    def setTol(self, value):
        """
        Sets the value of :py:attr:`tol`.
        """
        self._set(tol=value)
        return self

    def getTol(self):
        """
        Gets the value of tol or its default value.
        """
        return self.getOrDefault(self.tol)
Example no. 5
class HasLabelCol(Params):
    """
    Mixin for param labelCol: label column name.
    """

    labelCol = Param(Params._dummy(), "labelCol", "label column name.", str)

    def __init__(self):
        super(HasLabelCol, self).__init__()
        self._setDefault(labelCol='label')

    def setLabelCol(self, value):
        """
        Sets the value of :py:attr:`labelCol`.
        """
        self._set(labelCol=value)
        return self

    def getLabelCol(self):
        """
        Gets the value of labelCol or its default value.
        """
        return self.getOrDefault(self.labelCol)
Example no. 6
class HasRegParam(Params):
    """
    Mixin for param regParam: regularization parameter (>= 0).
    """

    regParam = Param(Params._dummy(), "regParam",
                     "regularization parameter (>= 0).", float)

    def __init__(self):
        super(HasRegParam, self).__init__()

    def setRegParam(self, value):
        """
        Sets the value of :py:attr:`regParam`.
        """
        self._set(regParam=value)
        return self

    def getRegParam(self):
        """
        Gets the value of regParam or its default value.
        """
        return self.getOrDefault(self.regParam)
Example no. 7
class HasSeed(Params):
    """
    Mixin for param seed: random seed.
    """

    seed = Param(Params._dummy(), "seed", "random seed.", int)

    def __init__(self):
        super(HasSeed, self).__init__()
        self._setDefault(seed=hash(type(self).__name__))

    def setSeed(self, value):
        """
        Sets the value of :py:attr:`seed`.
        """
        self._set(seed=value)
        return self

    def getSeed(self):
        """
        Gets the value of seed or its default value.
        """
        return self.getOrDefault(self.seed)
Example no. 8
class HasNumFeatures(Params):
    """
    Mixin for param numFeatures: number of features.
    """

    numFeatures = Param(Params._dummy(), "numFeatures", "number of features.",
                        int)

    def __init__(self):
        super(HasNumFeatures, self).__init__()

    def setNumFeatures(self, value):
        """
        Sets the value of :py:attr:`numFeatures`.
        """
        self._set(numFeatures=value)
        return self

    def getNumFeatures(self):
        """
        Gets the value of numFeatures or its default value.
        """
        return self.getOrDefault(self.numFeatures)
Example no. 9
class HasInputCols(Params):
    """
    Mixin for param inputCols: input column names.
    """

    inputCols = Param(Params._dummy(), "inputCols", "input column names.",
                      None)

    def __init__(self):
        super(HasInputCols, self).__init__()

    def setInputCols(self, value):
        """
        Sets the value of :py:attr:`inputCols`.
        """
        self._set(inputCols=value)
        return self

    def getInputCols(self):
        """
        Gets the value of inputCols or its default value.
        """
        return self.getOrDefault(self.inputCols)
Example no. 10
    def __init__(self,
                 keyCols=None,
                 xCol=None,
                 outputCol=None,
                 yCol=None,
                 estimatorType=None,
                 keyedSklearnEstimators=None,
                 outputType=None):
        """The constructor is used by :class:`KeyedEstimator` to generate a :class:`KeyedModel`; it
        is not intended for external use."""

        assert (estimatorType == "predictor") == (yCol is not None), \
            "yCol is {}, but it should {}be None for a {} estimatorType".format(
                yCol, "not " if isLabelled else "", estimatorType)
        assert estimatorType == "transformer" or estimatorType == "predictor", estimatorType

        def implies(a, b):
            return not a or b

        assert implies(estimatorType == "transformer",
                       outputType == Vector.__UDT__), outputType
        assert len(keyCols) > 0, len(keyCols)
        assert set(keyedSklearnEstimators.columns) == (set(keyCols) | set(["estimator"])), \
            ("keyedSklearnEstimator columns {} should have both key columns {} and "
             "an estimator column").format(keyedSklearnEstimators.columns, keyCols)

        # The superclass expects Param attributes to already be set, so we only init it after
        # doing so.
        for paramName, paramSpec in KeyedModel._paramSpecs.items():
            setattr(self, paramName,
                    Param(Params._dummy(), paramName, paramSpec["doc"]))
        super(KeyedModel, self).__init__()
        if yCol and type(outputType) not in KeyedModel._sql_types:
            raise TypeError(
                "Output type {} is not an AtomicType (expected for {} estimator)"
                .format(outputType, estimatorType))
        self._set(**self.__init__._input_kwargs)
Example no. 11
class HasFunctionType(Params):
    """
    Mixin for param functionType: constant value, interpreted as defined by PythonEvalType.
    """

    functionType = Param(Params._dummy(), "functionType", "constant value, interpreted as defined by PythonEvalType",
                         typeConverter=TypeConverters.toInt)

    def __init__(self):
        super(HasFunctionType, self).__init__()

    def setFunctionType(self, value):
        """
        Sets the value of :py:attr:`functionType`.

        Valid values are defined in PythonEvalType:

        SQL_SCALAR_PANDAS_UDF = 200
        SQL_GROUPED_MAP_PANDAS_UDF = 201
        SQL_GROUPED_AGG_PANDAS_UDF = 202
        SQL_WINDOW_AGG_PANDAS_UDF = 203
        """
        if value not in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                         PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                         PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                         PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF,
                         ):
            raise ValueError("Function type must be some kind of Pandas_UDF")

        return self._set(functionType=value)

    def getFunctionType(self):
        """
        Gets the value of functionType or its default value.
        """
        return self.getOrDefault(self.functionType)
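
A hedged usage sketch of the mixin above; the subclass is hypothetical, and PythonEvalType is assumed to be importable from pyspark.rdd as in Spark 2.x:

from pyspark.rdd import PythonEvalType

class _PandasUDFParams(HasFunctionType):     # hypothetical composite class, for illustration only
    pass

p = _PandasUDFParams()
p.setFunctionType(PythonEvalType.SQL_SCALAR_PANDAS_UDF)     # one of the accepted eval types
p.getFunctionType()                                         # -> 200
# Any value outside the four pandas UDF eval types raises ValueError in setFunctionType.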
Example no. 12
class HasFeaturesCol(Params):
    """
    Mixin for param featuresCol: features column name.
    """

    featuresCol = Param(Params._dummy(), "featuresCol",
                        "features column name.", str)

    def __init__(self):
        super(HasFeaturesCol, self).__init__()
        self._setDefault(featuresCol='features')

    def setFeaturesCol(self, value):
        """
        Sets the value of :py:attr:`featuresCol`.
        """
        self._set(featuresCol=value)
        return self

    def getFeaturesCol(self):
        """
        Gets the value of featuresCol or its default value.
        """
        return self.getOrDefault(self.featuresCol)
Example no. 13
class HasFitIntercept(Params):
    """
    Mixin for param fitIntercept: whether to fit an intercept term.
    """

    fitIntercept = Param(Params._dummy(), "fitIntercept",
                         "whether to fit an intercept term.", bool)

    def __init__(self):
        super(HasFitIntercept, self).__init__()
        self._setDefault(fitIntercept=True)

    def setFitIntercept(self, value):
        """
        Sets the value of :py:attr:`fitIntercept`.
        """
        self._set(fitIntercept=value)
        return self

    def getFitIntercept(self):
        """
        Gets the value of fitIntercept or its default value.
        """
        return self.getOrDefault(self.fitIntercept)
Example no. 14
class HasStepSize(Params):
    """
    Mixin for param stepSize: Step size to be used for each iteration of optimization.
    """

    stepSize = Param(
        Params._dummy(), "stepSize",
        "Step size to be used for each iteration of optimization.", float)

    def __init__(self):
        super(HasStepSize, self).__init__()

    def setStepSize(self, value):
        """
        Sets the value of :py:attr:`stepSize`.
        """
        self._set(stepSize=value)
        return self

    def getStepSize(self):
        """
        Gets the value of stepSize or its default value.
        """
        return self.getOrDefault(self.stepSize)
Example no. 15
class HasInputCol(Params):
    """
    Mixin for param inputCol: input column name.
    """

    inputCol = Param(Params._dummy(),
                     "inputCol",
                     "input column name.",
                     typeConverter=TypeConverters.toString)

    def __init__(self):
        super(HasInputCol, self).__init__()

    def setInputCol(self, value):
        """
        Sets the value of :py:attr:`inputCol`.
        """
        return self._set(inputCol=value)

    def getInputCol(self):
        """
        Gets the value of inputCol or its default value.
        """
        return self.getOrDefault(self.inputCol)
Example no. 16
    def test_params(self):
        testParams = TestParams()
        maxIter = testParams.maxIter
        inputCol = testParams.inputCol
        seed = testParams.seed

        params = testParams.params
        self.assertEqual(params, [inputCol, maxIter, seed])

        self.assertTrue(testParams.hasParam(maxIter.name))
        self.assertTrue(testParams.hasDefault(maxIter))
        self.assertFalse(testParams.isSet(maxIter))
        self.assertTrue(testParams.isDefined(maxIter))
        self.assertEqual(testParams.getMaxIter(), 10)

        self.assertTrue(testParams.hasParam(inputCol.name))
        self.assertFalse(testParams.hasDefault(inputCol))
        self.assertFalse(testParams.isSet(inputCol))
        self.assertFalse(testParams.isDefined(inputCol))
        with self.assertRaises(KeyError):
            testParams.getInputCol()

        otherParam = Param(Params._dummy(), "otherParam", "Parameter used to test that " +
                           "set raises an error for a non-member parameter.",
                           typeConverter=TypeConverters.toString)
        with self.assertRaises(ValueError):
            testParams.set(otherParam, "value")

        # Since the default is normally random, set it to a known number for debug str
        testParams._setDefault(seed=41)

        self.assertEqual(
            testParams.explainParams(),
            "\n".join(["inputCol: input column name. (undefined)",
                       "maxIter: max number of iterations (>= 0). (default: 10)",
                       "seed: random seed. (default: 41)"]))
Example no. 17
class HasPredictionCol(Params):
    """
    Mixin for param predictionCol: prediction column name.
    """

    predictionCol = Param(Params._dummy(), "predictionCol",
                          "prediction column name.", str)

    def __init__(self):
        super(HasPredictionCol, self).__init__()
        self._setDefault(predictionCol='prediction')

    def setPredictionCol(self, value):
        """
        Sets the value of :py:attr:`predictionCol`.
        """
        self._set(predictionCol=value)
        return self

    def getPredictionCol(self):
        """
        Gets the value of predictionCol or its default value.
        """
        return self.getOrDefault(self.predictionCol)
Example no. 18
    def __init__(
        self,
        kafka_brokers: str,
        topic: str,
        schema: StructType,
        starting_offset: int = -2,
        use_ssl: bool = True,
        previous_checkpoint_view: Optional[str] = None,
        name: Optional[str] = None,
        parameters: Optional[Dict[str, Any]] = None,
        progress_logger: Optional[ProgressLogger] = None,
    ):
        super().__init__()

        self.logger = get_logger(__name__)

        self.kafka_brokers: Param[str] = Param(self, "kafka_brokers", "")
        self._setDefault(kafka_brokers=kafka_brokers)

        self.topic: Param[str] = Param(self, "topic", "")
        self._setDefault(topic=topic)

        self.schema: Param[StructType] = Param(self, "schema", "")
        self._setDefault(schema=schema)

        self.starting_offset: Param[int] = Param(self, "starting_offset", "")
        self._setDefault(starting_offset=starting_offset)

        self.use_ssl: Param[bool] = Param(self, "use_ssl", "")
        self._setDefault(use_ssl=use_ssl)

        self.previous_checkpoint_view: Param[Optional[str]] = Param(
            self, "previous_checkpoint_view", ""
        )
        self._setDefault(previous_checkpoint_view=previous_checkpoint_view)

        kwargs = self._input_kwargs
        self.setParams(**kwargs)
Example no. 19
class _ValidatorParams(HasSeed):
    """
    Common params for TrainValidationSplit and CrossValidator.
    """

    estimator = Param(Params._dummy(), "estimator",
                      "estimator to be cross-validated")
    estimatorParamMaps = Param(Params._dummy(), "estimatorParamMaps",
                               "estimator param maps")
    evaluator = Param(
        Params._dummy(), "evaluator",
        "evaluator used to select hyper-parameters that maximize the validator metric"
    )

    @since("2.0.0")
    def getEstimator(self):
        """
        Gets the value of estimator or its default value.
        """
        return self.getOrDefault(self.estimator)

    @since("2.0.0")
    def getEstimatorParamMaps(self):
        """
        Gets the value of estimatorParamMaps or its default value.
        """
        return self.getOrDefault(self.estimatorParamMaps)

    @since("2.0.0")
    def getEvaluator(self):
        """
        Gets the value of evaluator or its default value.
        """
        return self.getOrDefault(self.evaluator)

    @classmethod
    def _from_java_impl(cls, java_stage):
        """
        Return Python estimator, estimatorParamMaps, and evaluator from a Java ValidatorParams.
        """

        # Load information from java_stage to the instance.
        estimator = JavaParams._from_java(java_stage.getEstimator())
        evaluator = JavaParams._from_java(java_stage.getEvaluator())
        epms = [
            estimator._transfer_param_map_from_java(epm)
            for epm in java_stage.getEstimatorParamMaps()
        ]
        return estimator, epms, evaluator

    def _to_java_impl(self):
        """
        Return Java estimator, estimatorParamMaps, and evaluator from this Python instance.
        """

        gateway = SparkContext._gateway
        cls = SparkContext._jvm.org.apache.spark.ml.param.ParamMap

        java_epms = gateway.new_array(cls, len(self.getEstimatorParamMaps()))
        for idx, epm in enumerate(self.getEstimatorParamMaps()):
            java_epms[idx] = self.getEstimator()._transfer_param_map_to_java(
                epm)

        java_estimator = self.getEstimator()._to_java()
        java_evaluator = self.getEvaluator()._to_java()
        return java_estimator, java_epms, java_evaluator
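
A hedged sketch of how these validator params are typically populated in user code, using standard pyspark.ml classes; `train_df` is an assumed DataFrame with "features" and "label" columns:

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

lr = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()

cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=3)
cv.getEstimator() is lr          # getters inherited from _ValidatorParams
cvModel = cv.fit(train_df)       # train_df: assumed DataFrame with features/label columns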
Example no. 20
class PCASageMakerEstimator(SageMakerEstimatorBase):
    """
    A :class:`~sagemaker_pyspark.SageMakerEstimator` that runs a PCA training job in SageMaker and
    returns a :class:`~sagemaker_pyspark.SageMakerModel` that can be used to transform a DataFrame
    using the hosted PCA model. PCA, or Principal Component Analysis, is useful for reducing the
    dimensionality of data before training with another algorithm.

    Amazon SageMaker PCA trains on RecordIO-encoded Amazon Record protobuf data.
    SageMaker pyspark writes a DataFrame to S3 by selecting a column of Vectors named "features"
    and, if present, a column of Doubles named "label". These names are configurable by passing a
    dictionary with entries in trainingSparkDataFormatOptions with key "labelColumnName" or
    "featuresColumnName", with values corresponding to the desired label and features columns.

    PCASageMakerEstimator uses
    :class:`~sagemaker_pyspark.transformation.serializers.ProtobufRequestRowSerializer` to serialize
    Rows into RecordIO-encoded Amazon Record protobuf messages for inference, by default selecting
    the column named "features" expected to contain a Vector of Doubles.

    Inferences made against an Endpoint hosting a PCA model contain a "projection" field appended
    to the input DataFrame as a Dense Vector of Doubles.

    Args:
        sageMakerRole (IAMRole): The SageMaker TrainingJob and Hosting IAM Role. Used by
            SageMaker to access S3 and ECR Resources. SageMaker hosted Endpoint instances
            launched by this Estimator run with this role.
        trainingInstanceType (str): The SageMaker TrainingJob Instance Type to use.
        trainingInstanceCount (int): The number of instances of instanceType to run a
            SageMaker Training Job with.
        endpointInstanceType (str): The SageMaker Endpoint Config instance type.
        endpointInitialInstanceCount (int): The SageMaker Endpoint Config minimum number of
            instances that can be used to host modelImage.
        requestRowSerializer (RequestRowSerializer): Serializes Spark DataFrame Rows for
            transformation by Models built from this Estimator.
        responseRowDeserializer (ResponseRowDeserializer): Deserializes an Endpoint response into a
            series of Rows.
        trainingInputS3DataPath (S3Resource): An S3 location to upload SageMaker Training Job input
            data to.
        trainingOutputS3DataPath (S3Resource): An S3 location for SageMaker to store Training Job
            output data to.
        trainingInstanceVolumeSizeInGB (int): The EBS volume size in gigabytes of each instance.
        trainingProjectedColumns (List): The columns to project from the Dataset being fit before
            training. If an Optional.empty is passed then no specific projection will occur and
            all columns will be serialized.
        trainingChannelName (str): The SageMaker Channel name to input serialized Dataset fit
            input to.
        trainingContentType (str): The MIME type of the training data.
        trainingS3DataDistribution (str): The SageMaker Training Job S3 data distribution scheme.
        trainingSparkDataFormat (str): The Spark Data Format name used to serialize the Dataset
            being fit for input to SageMaker.
        trainingSparkDataFormatOptions (dict): The Spark Data Format Options used during
            serialization of the Dataset being fit.
        trainingInputMode (str): The SageMaker Training Job Channel input mode.
        trainingCompressionCodec (str): The type of compression to use when serializing the
            Dataset being fit for input to SageMaker.
        trainingMaxRuntimeInSeconds (int): A SageMaker Training Job Termination Condition
            MaxRuntimeInSeconds.
        trainingKmsKeyId (str): A KMS key ID for the Output Data Source.
        modelEnvironmentVariables (dict): The environment variables that SageMaker will set on the
            model container during execution.
        endpointCreationPolicy (EndpointCreationPolicy): Defines how a SageMaker Endpoint
            referenced by a SageMakerModel is created.
        sagemakerClient (AmazonSageMaker): Amazon SageMaker client. Used to send CreateTrainingJob,
            CreateModel, and CreateEndpoint requests.
        region (str): The region in which to run the algorithm. If not specified, gets the region
            from the DefaultAwsRegionProviderChain.
        s3Client (AmazonS3): Used to create a bucket for staging SageMaker Training Job
            input and/or output if either are set to S3AutoCreatePath.
        stsClient (AmazonSTS): Used to resolve the account number when creating staging
            input / output buckets.
        modelPrependInputRowsToTransformationRows (bool): Whether the transformation result on
            Models built by this Estimator should also include the input Rows. If true,
            each output Row is formed by a concatenation of the input Row with the corresponding
            Row produced by SageMaker Endpoint invocation, produced by responseRowDeserializer.
            If false, each output Row is just taken from responseRowDeserializer.
        deleteStagingDataAfterTraining (bool): Whether to remove the training data on s3 after
            training is complete or failed.
        namePolicyFactory (NamePolicyFactory): The NamePolicyFactory to use when naming SageMaker
            entities created during fit.
        uid (str): The unique identifier of this Estimator. Used to represent this stage in Spark
            ML pipelines.
       """
    _wrapped_class = "com.amazonaws.services.sagemaker.sparksdk.algorithms.PCASageMakerEstimator"

    num_components = Param(
        Params._dummy(),
        "num_components",
        "Number of principal components we wish to compute. Must be > 0",
        typeConverter=TypeConverters.toInt)

    algorithm_mode = Param(
        Params._dummy(),
        "algorithm_mode",
        "Determines the algorithm computing the principal components" +
        "Supported options: 'regular', 'stable' and 'randomized'.",
        typeConverter=TypeConverters.toString)

    subtract_mean = Param(
        Params._dummy(),
        "subtract_mean",
        "If true, the data will be unbiased both during training and " +
        "inference",
        typeConverter=TypeConverters.toString)

    extra_components = Param(
        Params._dummy(),
        "extra_components",
        "Number of extra components to compute" +
        "Valid for 'randomized' mode. Ignored by other modes."
        " Must be -1 or > 0",
        typeConverter=TypeConverters.toInt)

    mini_batch_size = Param(
        Params._dummy(),
        "mini_batch_size",
        "The number of examples in a mini-batch. Must be > 0",
        typeConverter=TypeConverters.toInt)

    feature_dim = Param(Params._dummy(),
                        "feature_dim",
                        "The dimension of the input vectors. Must be > 0",
                        typeConverter=TypeConverters.toInt)

    def __init__(
            self,
            trainingInstanceType,
            trainingInstanceCount,
            endpointInstanceType,
            endpointInitialInstanceCount,
            sagemakerRole=IAMRoleFromConfig(),
            requestRowSerializer=ProtobufRequestRowSerializer(),
            responseRowDeserializer=PCAProtobufResponseRowDeserializer(),
            trainingInputS3DataPath=S3AutoCreatePath(),
            trainingOutputS3DataPath=S3AutoCreatePath(),
            trainingInstanceVolumeSizeInGB=1024,
            trainingProjectedColumns=None,
            trainingChannelName="train",
            trainingContentType=None,
            trainingS3DataDistribution="ShardedByS3Key",
            trainingSparkDataFormat="sagemaker",
            trainingSparkDataFormatOptions=None,
            trainingInputMode="File",
            trainingCompressionCodec=None,
            trainingMaxRuntimeInSeconds=24 * 60 * 60,
            trainingKmsKeyId=None,
            modelEnvironmentVariables=None,
            endpointCreationPolicy=EndpointCreationPolicy.CREATE_ON_CONSTRUCT,
            sagemakerClient=SageMakerClients.create_sagemaker_client(),
            region=None,
            s3Client=SageMakerClients.create_s3_default_client(),
            stsClient=SageMakerClients.create_sts_default_client(),
            modelPrependInputRowsToTransformationRows=True,
            deleteStagingDataAfterTraining=True,
            namePolicyFactory=RandomNamePolicyFactory(),
            uid=None):

        if trainingSparkDataFormatOptions is None:
            trainingSparkDataFormatOptions = {}

        if modelEnvironmentVariables is None:
            modelEnvironmentVariables = {}

        if uid is None:
            uid = Identifiable._randomUID()

        kwargs = locals().copy()
        del kwargs['self']

        super(PCASageMakerEstimator, self).__init__(**kwargs)
        default_params = {'subtract_mean': 'True'}

        self._setDefault(**default_params)

    def _get_java_obj(self, **kwargs):
        return self._new_java_obj(
            PCASageMakerEstimator._wrapped_class, kwargs['sagemakerRole'],
            kwargs['trainingInstanceType'], kwargs['trainingInstanceCount'],
            kwargs['endpointInstanceType'],
            kwargs['endpointInitialInstanceCount'],
            kwargs['requestRowSerializer'], kwargs['responseRowDeserializer'],
            kwargs['trainingInputS3DataPath'],
            kwargs['trainingOutputS3DataPath'],
            kwargs['trainingInstanceVolumeSizeInGB'],
            Option(kwargs['trainingProjectedColumns']),
            kwargs['trainingChannelName'], Option(
                kwargs['trainingContentType']),
            kwargs['trainingS3DataDistribution'],
            kwargs['trainingSparkDataFormat'],
            kwargs['trainingSparkDataFormatOptions'],
            kwargs['trainingInputMode'],
            Option(kwargs['trainingCompressionCodec']),
            kwargs['trainingMaxRuntimeInSeconds'],
            Option(kwargs['trainingKmsKeyId']),
            kwargs['modelEnvironmentVariables'],
            kwargs['endpointCreationPolicy'], kwargs['sagemakerClient'],
            Option(kwargs['region']), kwargs['s3Client'], kwargs['stsClient'],
            kwargs['modelPrependInputRowsToTransformationRows'],
            kwargs['deleteStagingDataAfterTraining'],
            kwargs['namePolicyFactory'], kwargs['uid'])

    def getNumComponents(self):
        return self.getOrDefault(self.num_components)

    def setNumComponents(self, value):
        if value < 1:
            raise ValueError("num_components must be > 0, got: %s" % value)
        self._set(num_components=value)

    def getAlgorithmMode(self):
        return self.getOrDefault(self.algorithm_mode)

    def setAlgorithmMode(self, value):
        if value not in ('regular', 'stable', 'randomized'):
            raise ValueError(
                "AlgorithmMode must be 'random', 'stable' or 'randomized',"
                " got %s" % value)
        self._set(algorithm_mode=value)

    def getSubtractMean(self):
        value = self.getOrDefault(self.subtract_mean)
        if value == 'True':
            return True
        else:
            return False

    def setSubtractMean(self, value):
        if value not in ('True', 'False'):
            raise ValueError("SubtractMean must be 'True' or 'False', got %s" %
                             value)
        self._set(subtract_mean=value)

    def getExtraComponents(self):
        return self.getOrDefault(self.extra_components)

    def setExtraComponents(self, value):
        if value != -1 and value < 1:
            raise ValueError("ExtraComponents must be > 0 or -1, got : %s" %
                             value)
        self._set(extra_components=value)

    def getMiniBatchSize(self):
        return self.getOrDefault(self.mini_batch_size)

    def setMiniBatchSize(self, size):
        if size <= 0:
            raise ValueError("mini_batch_size must be > 0. Got %s" % size)
        self._set(mini_batch_size=size)

    def getFeatureDim(self):
        return self.getOrDefault(self.feature_dim)

    def setFeatureDim(self, value):
        if value <= 0:
            raise ValueError("feature_dim must be > 0. Got %s" % value)
        self._set(feature_dim=value)

    @classmethod
    def _from_java(cls, javaObject):
        return PCASageMakerEstimator(sagemakerRole=None, javaObject=javaObject)
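
A minimal usage sketch under stated assumptions: the sagemaker_pyspark JARs are on the Spark classpath, `df` is a DataFrame with a Vector column named "features", and the role ARN and instance types are placeholders:

from sagemaker_pyspark import IAMRole
from sagemaker_pyspark.algorithms import PCASageMakerEstimator

estimator = PCASageMakerEstimator(
    sagemakerRole=IAMRole("arn:aws:iam::123456789012:role/ExampleSageMakerRole"),  # placeholder ARN
    trainingInstanceType="ml.m4.xlarge",
    trainingInstanceCount=1,
    endpointInstanceType="ml.c4.xlarge",
    endpointInitialInstanceCount=1)
estimator.setNumComponents(10)

model = estimator.fit(df)        # runs a SageMaker PCA training job and hosts an endpoint
projected = model.transform(df)  # appends a "projection" Vector column via the endpoint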
Example no. 21
 def __init__(self):
     super(HasFake, self).__init__()
     self.fake = Param(self, "fake", "fake param")
Example no. 22
 def __init__(self):
     super(HasInducedError, self).__init__()
     self.inducedError = Param(
         self, "inducedError",
         "Uniformly-distributed error added to feature")
Example no. 23
 def __init__(self):
     super(HasThrowableProperty, self).__init__()
     self.p = Param(self, "none", "empty param")
Example no. 24
class DeepImagePredictor(Transformer, HasInputCol, HasOutputCol):
    """
    Applies the model specified by its popular name to the image column in DataFrame.
    The output is an MLlib Vector.
    """

    modelName = Param(
        Params._dummy(),
        "modelName",
        "A deep learning model name",
        typeConverter=SparkDLTypeConverters.supportedNameConverter(
            SUPPORTED_MODELS))
    decodePredictions = Param(
        Params._dummy(),
        "decodePredictions",
        "If true, output predictions in the (class, description, probability) format",
        typeConverter=TypeConverters.toBoolean)
    topK = Param(Params._dummy(),
                 "topK",
                 "How many classes to return if decodePredictions is True",
                 typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self,
                 inputCol=None,
                 outputCol=None,
                 modelName=None,
                 decodePredictions=False,
                 topK=5):
        """
        __init__(self, inputCol=None, outputCol=None, modelName=None, decodePredictions=False,
                 topK=5)
        """
        super(DeepImagePredictor, self).__init__()
        self._setDefault(decodePredictions=False)
        self._setDefault(topK=5)
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self,
                  inputCol=None,
                  outputCol=None,
                  modelName=None,
                  decodePredictions=False,
                  topK=5):
        """
        setParams(self, inputCol=None, outputCol=None, modelName=None, decodePredictions=False,
                  topK=5)
        """
        kwargs = self._input_kwargs
        self._set(**kwargs)
        return self

    def setModelName(self, value):
        return self._set(modelName=value)

    def getModelName(self):
        return self.getOrDefault(self.modelName)

    def _transform(self, dataset):
        transformer = _NamedImageTransformer(
            inputCol=self.getInputCol(),
            outputCol=self._getIntermediateOutputCol(),
            modelName=self.getModelName(),
            featurize=False)
        transformed = transformer.transform(dataset)
        if self.getOrDefault(self.decodePredictions):
            return self._decodeOutputAsPredictions(transformed)
        else:
            return transformed.withColumnRenamed(
                self._getIntermediateOutputCol(), self.getOutputCol())

    def _decodeOutputAsPredictions(self, df):
        # If we start having different weights than imagenet, we'll need to
        # move this logic to individual model building in NamedImageTransformer.
        # Also, we could put the computation directly in the main computation
        # graph or use a scala UDF for potentially better performance.
        topK = self.getOrDefault(self.topK)

        def decode(predictions):
            pred_arr = np.expand_dims(np.array(predictions), axis=0)
            decoded = decode_predictions(pred_arr, top=topK)[0]
            # convert numpy dtypes to python native types
            return [(t[0], t[1], t[2].item()) for t in decoded]

        decodedSchema = ArrayType(
            StructType([
                StructField("class", StringType(), False),
                StructField("description", StringType(), False),
                StructField("probability", FloatType(), False)
            ]))
        decodeUDF = udf(decode, decodedSchema)
        interim_output = self._getIntermediateOutputCol()
        return (df.withColumn(self.getOutputCol(), decodeUDF(
            df[interim_output])).drop(interim_output))

    def _getIntermediateOutputCol(self):
        return "__tmp_" + self.getOutputCol()
Example no. 25
class _NamedImageTransformer(Transformer, HasInputCol, HasOutputCol):
    """
    For internal use only. NamedImagePredictor and NamedImageFeaturizer are the recommended classes
    to use.

    Applies the model specified by its popular name to the image column in DataFrame. There are
    two output modes: predictions or the featurization from the model. In either case the output
    is an MLlib Vector.
    """

    modelName = Param(
        Params._dummy(),
        "modelName",
        "A deep learning model name",
        typeConverter=SparkDLTypeConverters.supportedNameConverter(
            SUPPORTED_MODELS))
    featurize = Param(
        Params._dummy(),
        "featurize",
        "If true, output features. If false, output predictions. Either way the output is a vector.",
        typeConverter=TypeConverters.toBoolean)

    @keyword_only
    def __init__(self,
                 inputCol=None,
                 outputCol=None,
                 modelName=None,
                 featurize=False):
        """
        __init__(self, inputCol=None, outputCol=None, modelName=None, featurize=False)
        """
        super(_NamedImageTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)
        self._inputTensorName = None
        self._outputTensorName = None
        self._outputMode = None

    @keyword_only
    def setParams(self,
                  inputCol=None,
                  outputCol=None,
                  modelName=None,
                  featurize=False):
        """
        setParams(self, inputCol=None, outputCol=None, modelName=None, featurize=False)
        """
        kwargs = self._input_kwargs
        self._set(**kwargs)
        return self

    def setModelName(self, value):
        return self._set(modelName=value)

    def getModelName(self):
        return self.getOrDefault(self.modelName)

    def setFeaturize(self, value):
        return self._set(featurize=value)

    def getFeaturize(self):
        return self.getOrDefault(self.featurize)

    def _transform(self, dataset):
        modelGraphSpec = _buildTFGraphForName(self.getModelName(),
                                              self.getFeaturize())
        inputCol = self.getInputCol()
        resizedCol = "__sdl_imagesResized"
        tfTransformer = TFImageTransformer(
            inputCol=resizedCol,
            outputCol=self.getOutputCol(),
            graph=modelGraphSpec["graph"],
            inputTensor=modelGraphSpec["inputTensorName"],
            outputTensor=modelGraphSpec["outputTensorName"],
            outputMode=modelGraphSpec["outputMode"])
        resizeUdf = resizeImage(modelGraphSpec["inputTensorSize"])
        result = tfTransformer.transform(
            dataset.withColumn(resizedCol, resizeUdf(inputCol)))
        return result.drop(resizedCol)
Example no. 26
class TFImageTransformer(Transformer, HasInputCol, HasOutputCol,
                         HasOutputMode):
    """
    Applies the Tensorflow graph to the image column in DataFrame.

    Restrictions of the current API:

    * Does not use minibatches, which is a major low-hanging fruit for performance.
    * Only one output node can be specified.
    * The output is expected to be an image or a 1-d vector.
    * All images in the dataframe are expected to be of the same numerical data type
      (i.e. the dtype of the values in the numpy array representation is the same.)

    We assume all graphs have a "minibatch" dimension (i.e. an unknown leading
    dimension) in the tensor shapes.

    .. note:: The input tensorflow graph should have appropriate weights constantified,
              since a new session is created inside this transformer.
    """

    graph = Param(Params._dummy(),
                  "graph",
                  "A TensorFlow computation graph",
                  typeConverter=SparkDLTypeConverters.toTFGraph)
    inputTensor = Param(
        Params._dummy(),
        "inputTensor",
        "A TensorFlow tensor object or name representing the input image",
        typeConverter=SparkDLTypeConverters.toTFTensorName)
    outputTensor = Param(
        Params._dummy(),
        "outputTensor",
        "A TensorFlow tensor object or name representing the output",
        typeConverter=SparkDLTypeConverters.toTFTensorName)
    channelOrder = Param(
        Params._dummy(),
        "channelOrder",
        "Strign specifying the expected color channel order, can be one of L,RGB,BGR",
        typeConverter=SparkDLTypeConverters.toChannelOrder)

    @keyword_only
    def __init__(self,
                 channelOrder,
                 inputCol=None,
                 outputCol=None,
                 graph=None,
                 inputTensor=IMAGE_INPUT_TENSOR_NAME,
                 outputTensor=None,
                 outputMode="vector"):
        """
        __init__(self, channelOrder, inputCol=None, outputCol=None, graph=None,
                 inputTensor=IMAGE_INPUT_TENSOR_NAME, outputTensor=None,
                 outputMode="vector")
          :param: channelOrder: specify the ordering of the color channel, can be one of RGB, BGR, L (grayscale)
        """
        super(TFImageTransformer, self).__init__()
        kwargs = self._input_kwargs
        self.setParams(**kwargs)
        self._setDefault(inputTensor=IMAGE_INPUT_TENSOR_NAME)
        self.channelOrder = channelOrder

    @keyword_only
    def setParams(self,
                  channelOrder=None,
                  inputCol=None,
                  outputCol=None,
                  graph=None,
                  inputTensor=IMAGE_INPUT_TENSOR_NAME,
                  outputTensor=None,
                  outputMode="vector"):
        """
        setParams(self, channelOrder=None, inputCol=None, outputCol=None, graph=None,
                  inputTensor=IMAGE_INPUT_TENSOR_NAME, outputTensor=None,
                  outputMode="vector")
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setGraph(self, value):
        return self._set(graph=value)

    def setInputTensor(self, value):
        return self._set(inputTensor=value)

    def setOutputTensor(self, value):
        return self._set(outputTensor=value)

    def getGraph(self):
        return self.getOrDefault(self.graph)

    def getInputTensor(self):
        tensor_name = self.getOrDefault(self.inputTensor)
        return self.getGraph().get_tensor_by_name(tensor_name)

    def getOutputTensor(self):
        tensor_name = self.getOrDefault(self.outputTensor)
        return self.getGraph().get_tensor_by_name(tensor_name)

    def _transform(self, dataset):
        graph = self.getGraph()
        composed_graph = self._addReshapeLayers(graph,
                                                self._getImageDtype(dataset))
        final_graph = self._stripGraph(composed_graph)
        with final_graph.as_default():
            image = dataset[self.getInputCol()]
            image_df_exploded = (dataset.withColumn(
                "__sdl_image_height", image.height).withColumn(
                    "__sdl_image_width", image.width).withColumn(
                        "__sdl_image_nchannels",
                        image.nChannels).withColumn("__sdl_image_data",
                                                    image.data))

            final_output_name = self._getFinalOutputTensorName()
            output_tensor = final_graph.get_tensor_by_name(final_output_name)
            final_df = (tfs.map_rows(
                [output_tensor],
                image_df_exploded,
                feed_dict={
                    "height": "__sdl_image_height",
                    "width": "__sdl_image_width",
                    "num_channels": "__sdl_image_nchannels",
                    "image_buffer": "__sdl_image_data"
                }).drop("__sdl_image_height", "__sdl_image_width",
                        "__sdl_image_nchannels", "__sdl_image_data"))

            tfs_output_name = tfx.op_name(output_tensor, final_graph)
            original_output_name = self._getOriginalOutputTensorName()
            output_shape = final_graph.get_tensor_by_name(
                original_output_name).shape
            output_mode = self.getOrDefault(self.outputMode)
            # TODO: support non-1d tensors (return np.array).
            if output_mode == "image":
                return self._convertOutputToImage(final_df, tfs_output_name,
                                                  output_shape)
            else:
                assert output_mode == "vector", "Unknown output mode: %s" % output_mode
                return self._convertOutputToVector(final_df, tfs_output_name)

    def _getImageDtype(self, dataset):
        # This may not be the best way to get the type of image, but it is one way.
        # Assumes that the dtype for all images is the same in the given dataframe.
        pdf = dataset.select(self.getInputCol()).take(1)
        img = pdf[0][self.getInputCol()]
        img_type = imageIO.imageTypeByOrdinal(img.mode)
        return img_type.dtype

    # TODO: duplicate code, same functionality as sparkdl.graph.pieces.py::buildSpImageConverter
    # TODO: It should be extracted as a util function and shared
    def _addReshapeLayers(self, tf_graph, dtype="uint8"):
        input_tensor_name = self.getInputTensor().name

        gdef = tf_graph.as_graph_def(add_shapes=True)
        g = tf.Graph()
        with g.as_default():
            # Flat image data -> image dimensions
            height = tf.placeholder(tf.int32, [], name="height")
            width = tf.placeholder(tf.int32, [], name="width")
            num_channels = tf.placeholder(tf.int32, [], name="num_channels")
            image_buffer = tf.placeholder(tf.string, [], name="image_buffer")
            # Note: the shape argument is required for tensorframes as it uses a
            # slightly older version of tensorflow.
            shape = tf.reshape(tf.stack([height, width, num_channels], axis=0),
                               shape=(3, ),
                               name='shape')
            if dtype == "uint8":
                image_uint8 = tf.decode_raw(image_buffer,
                                            tf.uint8,
                                            name="decode_raw")
                image_float = tf.to_float(image_uint8)
            else:
                assert dtype == "float32", "Unsupported dtype for image: %s" % dtype
                image_float = tf.decode_raw(image_buffer,
                                            tf.float32,
                                            name="decode_raw")
            image_reshaped = tf.reshape(image_float, shape, name="reshaped")
            image_reshaped = imageIO.fixColorChannelOrdering(
                self.channelOrder, image_reshaped)
            image_reshaped_expanded = tf.expand_dims(image_reshaped,
                                                     0,
                                                     name="expanded")

            # Add on the original graph
            tf.import_graph_def(
                gdef,
                input_map={input_tensor_name: image_reshaped_expanded},
                return_elements=[self.getOutputTensor().name],
                name=USER_GRAPH_NAMESPACE)

            # Flatten the output for tensorframes
            output_node = g.get_tensor_by_name(
                self._getOriginalOutputTensorName())
            _ = tf.reshape(
                output_node[0],  # batch-size = 1,
                shape=[-1],
                name=self._getFinalOutputOpName())
        return g

    # Sometimes the tf graph contains a bunch of stuff that doesn't lead to the
    # output. TensorFrames does not like that, so we strip out the parts that
    # are not necessary for the computation at hand.
    def _stripGraph(self, tf_graph):
        gdef = tfx.strip_and_freeze_until([self._getFinalOutputOpName()],
                                          tf_graph)
        g = tf.Graph()
        with g.as_default():
            tf.import_graph_def(gdef, name='')
        return g

    def _getOriginalOutputTensorName(self):
        return USER_GRAPH_NAMESPACE + '/' + self.getOutputTensor().name

    def _getFinalOutputTensorName(self):
        return NEW_OUTPUT_PREFIX + '_' + self.getOutputTensor().name

    def _getFinalOutputOpName(self):
        return tfx.op_name(self._getFinalOutputTensorName())

    def _convertOutputToImage(self, df, tfs_output_col, output_shape):
        assert len(output_shape
                   ) == 4, str(output_shape) + " does not have 4 dimensions"
        height = int(output_shape[1])
        width = int(output_shape[2])

        def to_image(orig_image, numeric_data):
            # Assume the returned image has float pixels but same #channels as input
            mode = imageIO.imageTypeByName('CV_32FC%d' % orig_image.nChannels)
            data = bytearray(
                np.array(numeric_data).astype(np.float32).tobytes())
            nChannels = orig_image.nChannels
            return Row(origin="",
                       mode=mode.ord,
                       height=height,
                       width=width,
                       nChannels=nChannels,
                       data=data)

        to_image_udf = udf(to_image, ImageSchema.imageSchema['image'].dataType)
        resDf = df.withColumn(
            self.getOutputCol(),
            to_image_udf(df[self.getInputCol()], df[tfs_output_col]))
        return resDf.drop(tfs_output_col)

    def _convertOutputToVector(self, df, tfs_output_col):
        """
        Converts the output python list to MLlib Vector.
        """
        return (df.withColumn(self.getOutputCol(),
                              JVMAPI.listToMLlibVectorUDF(
                                  df[tfs_output_col])).drop(tfs_output_col))
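
A hedged usage sketch of the transformer above; the frozen TensorFlow graph `g`, the output tensor name "softmax:0", and `image_df` are assumptions made only for illustration:

transformer = TFImageTransformer(channelOrder="RGB",          # keyword-only constructor
                                 inputCol="image",
                                 outputCol="features",
                                 graph=g,                      # assumed frozen tf.Graph
                                 outputTensor="softmax:0",     # assumed output tensor name
                                 outputMode="vector")
features_df = transformer.transform(image_df)                  # image_df: assumed image DataFrame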
Example no. 27
 def __init__(self):
     super(DummyParams, self).__init__()
     self.test_param = Param(self, "test_param",
                             "dummy parameter for testing")
     self.another_test_param = Param(self, "another_test_param",
                                     "second parameter for testing")
Example no. 28
class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol,
                                        HasPredictionCol):
    """
    .. note:: Experimental

    Evaluator for Multiclass Classification, which expects two input
    columns: prediction and label.

    >>> scoreAndLabels = [(0.0, 0.0), (0.0, 1.0), (0.0, 0.0),
    ...     (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["prediction", "label"])
    ...
    >>> evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
    >>> evaluator.evaluate(dataset)
    0.66...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "precision"})
    0.66...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "recall"})
    0.66...

    .. versionadded:: 1.5.0
    """
    metricName = Param(
        Params._dummy(),
        "metricName", "metric name in evaluation "
        "(f1|precision|recall|weightedPrecision|weightedRecall|accuracy)",
        typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self,
                 predictionCol="prediction",
                 labelCol="label",
                 metricName="f1"):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="f1")
        """
        super(MulticlassClassificationEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator",
            self.uid)
        self._setDefault(predictionCol="prediction",
                         labelCol="label",
                         metricName="f1")
        kwargs = self.__init__._input_kwargs
        self._set(**kwargs)

    @since("1.5.0")
    def setMetricName(self, value):
        """
        Sets the value of :py:attr:`metricName`.
        """
        return self._set(metricName=value)

    @since("1.5.0")
    def getMetricName(self):
        """
        Gets the value of metricName or its default value.
        """
        return self.getOrDefault(self.metricName)

    @keyword_only
    @since("1.5.0")
    def setParams(self,
                  predictionCol="prediction",
                  labelCol="label",
                  metricName="f1"):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="f1")
        Sets params for multiclass classification evaluator.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)
Example no. 29
class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol):
    """
    .. note:: Experimental

    Evaluator for Regression, which expects two input
    columns: prediction and label.

    >>> scoreAndLabels = [(-28.98343821, -27.0), (20.21491975, 21.5),
    ...   (-25.98418959, -22.0), (30.69731842, 33.0), (74.69283752, 71.0)]
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["raw", "label"])
    ...
    >>> evaluator = RegressionEvaluator(predictionCol="raw")
    >>> evaluator.evaluate(dataset)
    2.842...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "r2"})
    0.993...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "mae"})
    2.649...

    .. versionadded:: 1.4.0
    """
    metricName = Param(Params._dummy(),
                       "metricName",
                       """metric name in evaluation - one of:
                       rmse - root mean squared error (default)
                       mse - mean squared error
                       r2 - r^2 metric
                       mae - mean absolute error.""",
                       typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self,
                 predictionCol="prediction",
                 labelCol="label",
                 metricName="rmse"):
        """
        __init__(self, predictionCol="prediction", labelCol="label", \
                 metricName="rmse")
        """
        super(RegressionEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid)
        self._setDefault(predictionCol="prediction",
                         labelCol="label",
                         metricName="rmse")
        kwargs = self.__init__._input_kwargs
        self._set(**kwargs)

    @since("1.4.0")
    def setMetricName(self, value):
        """
        Sets the value of :py:attr:`metricName`.
        """
        return self._set(metricName=value)

    @since("1.4.0")
    def getMetricName(self):
        """
        Gets the value of metricName or its default value.
        """
        return self.getOrDefault(self.metricName)

    @keyword_only
    @since("1.4.0")
    def setParams(self,
                  predictionCol="prediction",
                  labelCol="label",
                  metricName="rmse"):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="rmse")
        Sets params for regression evaluator.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)
Example no. 30
class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol,
                                    HasRawPredictionCol):
    """
    .. note:: Experimental

    Evaluator for binary classification, which expects two input columns: rawPrediction and label.
    The rawPrediction column can be of type double (binary 0/1 prediction, or probability of label
    1) or of type vector (length-2 vector of raw predictions, scores, or label probabilities).

    >>> from pyspark.ml.linalg import Vectors
    >>> scoreAndLabels = map(lambda x: (Vectors.dense([1.0 - x[0], x[0]]), x[1]),
    ...    [(0.1, 0.0), (0.1, 1.0), (0.4, 0.0), (0.6, 0.0), (0.6, 1.0), (0.6, 1.0), (0.8, 1.0)])
    >>> dataset = spark.createDataFrame(scoreAndLabels, ["raw", "label"])
    ...
    >>> evaluator = BinaryClassificationEvaluator(rawPredictionCol="raw")
    >>> evaluator.evaluate(dataset)
    0.70...
    >>> evaluator.evaluate(dataset, {evaluator.metricName: "areaUnderPR"})
    0.83...

    .. versionadded:: 1.4.0
    """

    metricName = Param(Params._dummy(),
                       "metricName",
                       "metric name in evaluation (areaUnderROC|areaUnderPR)",
                       typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self,
                 rawPredictionCol="rawPrediction",
                 labelCol="label",
                 metricName="areaUnderROC"):
        """
        __init__(self, rawPredictionCol="rawPrediction", labelCol="label", \
                 metricName="areaUnderROC")
        """
        super(BinaryClassificationEvaluator, self).__init__()
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.evaluation.BinaryClassificationEvaluator",
            self.uid)
        self._setDefault(rawPredictionCol="rawPrediction",
                         labelCol="label",
                         metricName="areaUnderROC")
        kwargs = self.__init__._input_kwargs
        self._set(**kwargs)

    @since("1.4.0")
    def setMetricName(self, value):
        """
        Sets the value of :py:attr:`metricName`.
        """
        return self._set(metricName=value)

    @since("1.4.0")
    def getMetricName(self):
        """
        Gets the value of metricName or its default value.
        """
        return self.getOrDefault(self.metricName)

    @keyword_only
    @since("1.4.0")
    def setParams(self,
                  rawPredictionCol="rawPrediction",
                  labelCol="label",
                  metricName="areaUnderROC"):
        """
        setParams(self, rawPredictionCol="rawPrediction", labelCol="label", \
                  metricName="areaUnderROC")
        Sets params for binary classification evaluator.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)