class _MinMaxScalerParams(JavaWithParams, HasInputCol, HasOutputCol):
    """
    Params for :class:`MinMaxScaler`.

    ``MIN``/``MAX`` define the lower and upper bounds of the rescaled
    output feature range (defaults: [0.0, 1.0]).
    """
    MIN: Param[float] = FloatParam(
        "min",
        "Lower bound of the output feature range.",
        0.0,
        ParamValidators.not_null())

    MAX: Param[float] = FloatParam(
        "max",
        "Upper bound of the output feature range.",
        1.0,
        ParamValidators.not_null())

    def __init__(self, java_params):
        super(_MinMaxScalerParams, self).__init__(java_params)

    def set_min(self, value: float):
        return typing.cast(_MinMaxScalerParams, self.set(self.MIN, value))

    def set_max(self, value: float):
        return typing.cast(_MinMaxScalerParams, self.set(self.MAX, value))

    def get_min(self) -> float:
        # Fixed return annotation: MIN is a FloatParam, so this yields a
        # float, not a bool as previously annotated.
        return self.get(self.MIN)

    def get_max(self) -> float:
        # Fixed return annotation: MAX is a FloatParam (was wrongly -> bool).
        return self.get(self.MAX)

    @property
    def min(self):
        return self.get_min()

    @property
    def max(self):
        return self.get_max()
class _StringIndexerParams(_StringIndexerModelParams):
    """
    Params for :class:`StringIndexer`.

    Adds the ``string_order_type`` param, which controls how the distinct
    strings of each input column are ordered before being indexed.
    """
    STRING_ORDER_TYPE: Param[str] = StringParam(
        "string_order_type",
        "How to order strings of each column.",
        "arbitrary",
        ParamValidators.in_array([
            'arbitrary', 'frequencyDesc', 'frequencyAsc', 'alphabetDesc', 'alphabetAsc'
        ]))

    def __init__(self, java_params):
        super(_StringIndexerParams, self).__init__(java_params)

    def set_string_order_type(self, value: str):
        # Cast so fluent chaining keeps the concrete params type.
        return typing.cast(_StringIndexerParams,
                           self.set(self.STRING_ORDER_TYPE, value))

    def get_string_order_type(self) -> str:
        return self.get(self.STRING_ORDER_TYPE)

    @property
    def string_order_type(self):
        # Read-only convenience accessor.
        return self.get(self.STRING_ORDER_TYPE)
class HasMultiClass(WithParams, ABC):
    """
    Base class for the shared multi class param.

    Supported options:

    <li>auto: selects the classification type based on the number of classes:
    If the number of unique label values from the input data is one or two,
    set to "binomial". Otherwise, set to "multinomial".
    <li>binomial: binary logistic regression.
    <li>multinomial: multinomial logistic regression.
    """
    MULTI_CLASS: Param[str] = StringParam(
        "multi_class",
        "Classification type. Supported options: 'auto', 'binomial' and 'multinomial'.",
        'auto',
        ParamValidators.in_array(['auto', 'binomial', 'multinomial']))

    def set_multi_class(self, class_type: str):
        # Fluent setter; returns self for chaining.
        return self.set(self.MULTI_CLASS, class_type)

    def get_multi_class(self) -> str:
        return self.get(self.MULTI_CLASS)

    @property
    def multi_class(self) -> str:
        return self.get(self.MULTI_CLASS)
def test_validators(self):
    """Exercise each ParamValidators factory against boundary values."""
    # Each entry pairs a validator with (value, expected-acceptance) cases,
    # covering below/at/above the boundary plus None.
    cases = [
        (ParamValidators.gt(10),
         ((None, False), (5, False), (10, False), (15, True))),
        (ParamValidators.gt_eq(10),
         ((None, False), (5, False), (10, True), (15, True))),
        (ParamValidators.lt(10),
         ((None, False), (5, True), (10, False), (15, False))),
        (ParamValidators.lt_eq(10),
         ((None, False), (5, True), (10, True), (15, False))),
        # Default in_range is inclusive at both ends.
        (ParamValidators.in_range(5, 15),
         ((None, False), (0, False), (5, True), (10, True),
          (15, True), (20, False))),
        # Exclusive bounds reject the endpoints themselves.
        (ParamValidators.in_range(5, 15, False, False),
         ((None, False), (0, False), (5, False), (10, True),
          (15, False), (20, False))),
        (ParamValidators.in_array([1, 2, 3]),
         ((None, False), (1, True), (0, False))),
        (ParamValidators.not_null(),
         ((5, True), (None, False))),
    ]
    for validator, expectations in cases:
        for value, accepted in expectations:
            self.assertIs(accepted, bool(validator.validate(value)))
class HasBatchStrategy(WithParams, ABC):
    """
    Base class for the shared batch strategy param.

    Currently only the "count" strategy is supported.
    """
    BATCH_STRATEGY: Param[str] = StringParam(
        "batch_strategy",
        "Strategy to create mini batch from online train data.",
        "count",
        ParamValidators.in_array(["count"]))

    def set_batch_strategy(self, value: str):
        # Added for consistency: every other shared Has* mixin in this
        # module exposes a fluent setter alongside its getter.
        return self.set(self.BATCH_STRATEGY, value)

    def get_batch_strategy(self) -> str:
        return self.get(self.BATCH_STRATEGY)

    @property
    def batch_strategy(self) -> str:
        return self.get_batch_strategy()
class HasOutputCol(WithParams, ABC):
    """
    Base class for the shared output_col param.

    Default column name: "output".
    """
    OUTPUT_COL: Param[str] = StringParam(
        "output_col",
        "Output column name.",
        "output",
        ParamValidators.not_null())

    def set_output_col(self, col: str):
        # Fluent setter; returns self for chaining.
        return self.set(self.OUTPUT_COL, col)

    def get_output_col(self) -> str:
        return self.get(self.OUTPUT_COL)

    @property
    def output_col(self) -> str:
        return self.get(self.OUTPUT_COL)
class HasLabelCol(WithParams, ABC):
    """
    Base class for the shared label column param.

    Default column name: "label".
    """
    LABEL_COL: Param[str] = StringParam(
        "label_col",
        "Label column name.",
        "label",
        ParamValidators.not_null())

    def set_label_col(self, col: str):
        return self.set(self.LABEL_COL, col)

    def get_label_col(self) -> str:
        return self.get(self.LABEL_COL)

    @property
    def label_col(self) -> str:
        # Read-only convenience accessor.
        return self.get(self.LABEL_COL)
class HasInputCol(WithParams, ABC):
    """
    Base class for the shared input col param.

    Default column name: "input".
    """
    INPUT_COL: Param[str] = StringParam(
        "input_col",
        "Input column name.",
        "input",
        ParamValidators.not_null())

    def set_input_col(self, col: str):
        # Fluent setter; returns self for chaining.
        return self.set(self.INPUT_COL, col)

    def get_input_col(self) -> str:
        return self.get(self.INPUT_COL)

    @property
    def input_col(self) -> str:
        return self.get(self.INPUT_COL)
class HasReg(WithParams, ABC):
    """
    Base class for the shared regularization param.

    Must be non-negative; defaults to 0 (no regularization).
    """
    REG: Param[float] = FloatParam(
        "reg",
        "Regularization parameter.",
        0.,
        ParamValidators.gt_eq(0.))

    def set_reg(self, value: float):
        return self.set(self.REG, value)

    def get_reg(self) -> float:
        return self.get(self.REG)

    @property
    def reg(self) -> float:
        # Read-only convenience accessor.
        return self.get(self.REG)
class HasOutputCols(WithParams, ABC):
    """
    Base class for the shared output_cols param.

    No default; when set, the array must be non-empty.
    """
    OUTPUT_COLS: Param[Tuple[str, ...]] = StringArrayParam(
        "output_cols",
        "Output column names.",
        None,
        ParamValidators.non_empty_array())

    def set_output_cols(self, *cols: str):
        # Accepts the column names as varargs and stores them as a tuple.
        return self.set(self.OUTPUT_COLS, cols)

    def get_output_cols(self) -> Tuple[str, ...]:
        return self.get(self.OUTPUT_COLS)

    @property
    def output_cols(self) -> Tuple[str, ...]:
        return self.get(self.OUTPUT_COLS)
class HasMaxIter(WithParams, ABC):
    """
    Base class for the shared maxIter param.

    Must be strictly positive; defaults to 20.
    """
    MAX_ITER: Param[int] = IntParam(
        "max_iter",
        "Maximum number of iterations.",
        20,
        ParamValidators.gt(0))

    def set_max_iter(self, max_iter: int):
        # Fluent setter; returns self for chaining.
        return self.set(self.MAX_ITER, max_iter)

    def get_max_iter(self) -> int:
        return self.get(self.MAX_ITER)

    @property
    def max_iter(self) -> int:
        return self.get(self.MAX_ITER)
class HasGlobalBatchSize(WithParams, ABC):
    """
    Base class for the shared global_batch_size param.

    Must be strictly positive; defaults to 32.
    """
    GLOBAL_BATCH_SIZE: Param[int] = IntParam(
        "global_batch_size",
        "Global batch size of training algorithms.",
        32,
        ParamValidators.gt(0))

    def set_global_batch_size(self, global_batch_size: int):
        return self.set(self.GLOBAL_BATCH_SIZE, global_batch_size)

    def get_global_batch_size(self) -> int:
        return self.get(self.GLOBAL_BATCH_SIZE)

    @property
    def global_batch_size(self) -> int:
        # Read-only convenience accessor.
        return self.get(self.GLOBAL_BATCH_SIZE)
class HasFeaturesCol(WithParams, ABC):
    """
    Base class for the shared feature_col param.

    Default column name: "features".
    """
    FEATURES_COL: Param[str] = StringParam(
        "features_col",
        "Features column name.",
        "features",
        ParamValidators.not_null())

    def set_features_col(self, col):
        # Fluent setter; returns self for chaining.
        return self.set(self.FEATURES_COL, col)

    def get_features_col(self) -> str:
        return self.get(self.FEATURES_COL)

    @property
    def features_col(self) -> str:
        return self.get(self.FEATURES_COL)
class HasElasticNet(WithParams, ABC):
    """
    Base class for the shared elastic net param.

    The ElasticNet mixing parameter must lie in [0, 1]; defaults to 0.
    (Docstring fixed: it previously said "decay factor param", a
    copy-paste error from :class:`HasDecayFactor`.)
    """
    ELASTIC_NET: Param[float] = FloatParam(
        "elastic_net",
        "ElasticNet parameter.",
        0.,
        ParamValidators.in_range(0.0, 1.0))

    def set_elastic_net(self, value: float):
        return self.set(self.ELASTIC_NET, value)

    def get_elastic_net(self) -> float:
        return self.get(self.ELASTIC_NET)

    @property
    def elastic_net(self) -> float:
        # Delegate to the getter and annotate the return type, matching
        # the convention of the sibling Has* mixins.
        return self.get_elastic_net()
class HasDecayFactor(WithParams, ABC):
    """
    Base class for the shared decay factor param.

    Must lie in [0, 1]; defaults to 0.
    """
    DECAY_FACTOR: Param[float] = FloatParam(
        "decay_factor",
        "The forgetfulness of the previous centroids.",
        0.,
        ParamValidators.in_range(0, 1))

    def set_decay_factor(self, value: float):
        return self.set(self.DECAY_FACTOR, value)

    def get_decay_factor(self) -> float:
        return self.get(self.DECAY_FACTOR)

    @property
    def decay_factor(self) -> float:
        # Delegate to the getter and annotate the return type, matching
        # the convention of the sibling Has* mixins.
        return self.get_decay_factor()
class HasTol(WithParams, ABC):
    """
    Base class for the shared tolerance param.

    Must be non-negative; defaults to 1e-6.
    """
    TOL: Param[float] = FloatParam(
        "tol",
        "Convergence tolerance for iterative algorithms.",
        1e-6,
        ParamValidators.gt_eq(0))

    def set_tol(self, value: float):
        # Fluent setter; returns self for chaining.
        return self.set(self.TOL, value)

    def get_tol(self) -> float:
        return self.get(self.TOL)

    @property
    def tol(self) -> float:
        return self.get(self.TOL)
class HasDistanceMeasure(WithParams, ABC):
    """
    Base class for the shared distance_measure param.

    Defaults to "euclidean"; "cosine" is the only other accepted value.
    """
    DISTANCE_MEASURE: Param[str] = StringParam(
        "distance_measure",
        "Distance measure. Supported options: 'euclidean' and 'cosine'.",
        "euclidean",
        ParamValidators.in_array(['euclidean', 'cosine']))

    def set_distance_measure(self, distance_measure: str):
        return self.set(self.DISTANCE_MEASURE, distance_measure)

    def get_distance_measure(self) -> str:
        return self.get(self.DISTANCE_MEASURE)

    @property
    def distance_measure(self) -> str:
        # Read-only convenience accessor.
        return self.get(self.DISTANCE_MEASURE)
class HasLearningRate(WithParams, ABC):
    """
    Base class for the shared learning rate param.

    Must be strictly positive; defaults to 0.1.
    """
    LEARNING_RATE: Param[float] = FloatParam(
        "learning_rate",
        "Learning rate of optimization method.",
        0.1,
        ParamValidators.gt(0))

    def set_learning_rate(self, learning_rate: float):
        # Fluent setter; returns self for chaining.
        return self.set(self.LEARNING_RATE, learning_rate)

    def get_learning_rate(self) -> float:
        return self.get(self.LEARNING_RATE)

    @property
    def learning_rate(self) -> float:
        return self.get(self.LEARNING_RATE)
class HasPredictionCol(WithParams, ABC):
    """
    Base class for the shared prediction column param.

    Default column name: "prediction".
    """
    PREDICTION_COL: Param[str] = StringParam(
        "prediction_col",
        "Prediction column name.",
        "prediction",
        ParamValidators.not_null())

    def set_prediction_col(self, col: str):
        return self.set(self.PREDICTION_COL, col)

    def get_prediction_col(self) -> str:
        return self.get(self.PREDICTION_COL)

    @property
    def prediction_col(self) -> str:
        # Read-only convenience accessor.
        return self.get(self.PREDICTION_COL)
class _KNNModelParams(JavaWithParams, HasFeaturesCol, HasPredictionCol, ABC):
    """
    Params for :class:`KNNModel`.

    ``K`` is the number of nearest neighbors to consult (default 5).
    """
    K: Param[int] = IntParam(
        "k",
        "The number of nearest neighbors",
        5,
        ParamValidators.gt(0))

    def __init__(self, java_params):
        super(_KNNModelParams, self).__init__(java_params)

    def set_k(self, value: int):
        # Cast so fluent chaining keeps the concrete params type.
        return typing.cast(_KNNModelParams, self.set(self.K, value))

    def get_k(self) -> int:
        return self.get(self.K)

    @property
    def k(self) -> int:
        return self.get(self.K)
class _KMeansParams(_KMeansModelParams, HasSeed, HasMaxIter):
    """
    Params for :class:`KMeans`.

    Adds the ``init_mode`` param selecting the centroid initialization
    algorithm; only 'random' is currently supported.
    """
    INIT_MODE: Param[str] = StringParam(
        "init_mode",
        "The initialization algorithm. Supported options: 'random'.",
        "random",
        ParamValidators.in_array(["random"]))

    def __init__(self, java_params):
        super(_KMeansParams, self).__init__(java_params)

    def set_init_mode(self, value: str):
        # Fluent setter; returns self for chaining.
        return self.set(self.INIT_MODE, value)

    def get_init_mode(self) -> str:
        return self.get(self.INIT_MODE)

    @property
    def init_mode(self):
        return self.get(self.INIT_MODE)
class _KMeansModelParams(JavaWithParams, HasDistanceMeasure, HasFeaturesCol,
                         HasPredictionCol, ABC):
    """
    Params for :class:`KMeansModel`.

    ``K`` is the maximum number of clusters (default 2, must be > 1).
    """
    K: Param[int] = IntParam(
        "k",
        "The max number of clusters to create.",
        2,
        ParamValidators.gt(1))

    def __init__(self, java_params):
        super(_KMeansModelParams, self).__init__(java_params)

    def set_k(self, value: int):
        # Cast so fluent chaining keeps the concrete params type.
        return typing.cast(_KMeansModelParams, self.set(self.K, value))

    def get_k(self) -> int:
        return self.get(self.K)

    @property
    def k(self) -> int:
        return self.get(self.K)
class _NaiveBayesModelParams(JavaWithParams, HasFeaturesCol,
                             HasPredictionCol, ABC):
    """
    Params for :class:`NaiveBayesModel`.

    Only the 'multinomial' model type is currently supported.
    """
    MODEL_TYPE: Param[str] = StringParam(
        "model_type",
        "The model type.",
        "multinomial",
        ParamValidators.in_array(["multinomial"]))

    def __init__(self, java_params):
        super(_NaiveBayesModelParams, self).__init__(java_params)

    def set_model_type(self, value: str):
        # Fluent setter; returns self for chaining.
        return self.set(self.MODEL_TYPE, value)

    def get_model_type(self) -> str:
        return self.get(self.MODEL_TYPE)

    @property
    def model_type(self) -> str:
        return self.get(self.MODEL_TYPE)
class _LinearSVCModelParams(JavaWithParams, HasFeaturesCol, HasPredictionCol,
                            HasRawPredictionCol, ABC):
    """
    Params for :class:`LinearSVCModel`.

    ``THRESHOLD`` is applied to the raw prediction when deciding the
    binary class label (default 0.0).
    """
    THRESHOLD: Param[float] = FloatParam(
        "threshold",
        "Threshold in binary classification prediction applied to rawPrediction.",
        0.0,
        ParamValidators.not_null())

    def __init__(self, java_params):
        super(_LinearSVCModelParams, self).__init__(java_params)

    def set_threshold(self, value: float):
        # Fixed annotation: THRESHOLD is a FloatParam, so the value is a
        # float (was wrongly annotated as int).
        return typing.cast(_LinearSVCModelParams, self.set(self.THRESHOLD, value))

    def get_threshold(self) -> float:
        # Fixed return annotation (was -> int).
        return self.get(self.THRESHOLD)

    @property
    def threshold(self) -> float:
        return self.get_threshold()
class _NaiveBayesParams(_NaiveBayesModelParams, HasLabelCol):
    """
    Params for :class:`NaiveBayes`.

    Adds the additive ``smoothing`` param (default 1.0, non-negative).
    """
    SMOOTHING: Param[float] = FloatParam(
        "smoothing",
        "The smoothing parameter.",
        1.0,
        ParamValidators.gt_eq(0.0))

    def __init__(self, java_params):
        super(_NaiveBayesParams, self).__init__(java_params)

    def set_smoothing(self, value: float):
        # Cast so fluent chaining keeps the concrete params type.
        return typing.cast(_NaiveBayesParams, self.set(self.SMOOTHING, value))

    def get_smoothing(self) -> float:
        return self.get(self.SMOOTHING)

    @property
    def smoothing(self) -> float:
        return self.get(self.SMOOTHING)
class HasHandleInvalid(WithParams, ABC):
    """
    Base class for the shared handle_invalid param.

    Supported options and the corresponding behavior to handle invalid
    entries is listed as follows.

    <ul>
    <li>error: raise an exception.
    <li>skip: filter out rows with bad values.
    </ul>
    """
    HANDLE_INVALID: Param[str] = StringParam(
        "handle_invalid",
        "Strategy to handle invalid entries.",
        "error",
        ParamValidators.in_array(['error', 'skip']))

    def set_handle_invalid(self, value: str):
        # Fluent setter; returns self for chaining.
        return self.set(self.HANDLE_INVALID, value)

    def get_handle_invalid(self) -> str:
        return self.get(self.HANDLE_INVALID)

    @property
    def handle_invalid(self) -> str:
        return self.get(self.HANDLE_INVALID)
class _BinaryClassificationEvaluatorParams(JavaWithParams, HasLabelCol,
                                           HasRawPredictionCol, HasWeightCol):
    """
    Params for :class:`BinaryClassificationEvaluator`.

    ``metrics_names`` selects which metrics to emit; each name must be
    one of the supported metric identifiers.
    """
    METRICS_NAMES: Param[Tuple[str, ...]] = StringArrayParam(
        "metrics_names",
        "Names of output metrics.",
        ["areaUnderROC", "areaUnderPR"],
        ParamValidators.is_sub_set(
            ["areaUnderROC", "areaUnderPR", "areaUnderLorenz", "ks"]))

    def __init__(self, java_params):
        super(_BinaryClassificationEvaluatorParams, self).__init__(java_params)

    def set_metrics_names(self, *value: str):
        # Accepts metric names as varargs and stores them as a tuple.
        return self.set(self.METRICS_NAMES, value)

    def get_metrics_names(self) -> Tuple[str, ...]:
        return self.get(self.METRICS_NAMES)

    @property
    def metrics_names(self) -> Tuple[str, ...]:
        return self.get(self.METRICS_NAMES)
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ import os from typing import Dict, Any from pyflink.table import StreamTableEnvironment from pyflink.ml.core.api import Stage from pyflink.ml.core.param import ParamValidators, Param, BooleanParam, IntParam, \ FloatParam, StringParam, IntArrayParam, FloatArrayParam, StringArrayParam from pyflink.ml.tests.test_utils import PyFlinkMLTestCase BOOLEAN_PARAM = BooleanParam("boolean_param", "Description", False) INT_PARAM = IntParam("int_param", "Description", 1, ParamValidators.lt(100)) FLOAT_PARAM = FloatParam("float_param", "Description", 3.0, ParamValidators.lt(100)) STRING_PARAM = StringParam('string_param', "Description", "5") INT_ARRAY_PARAM = IntArrayParam("int_array_param", "Description", (6, 7)) FLOAT_ARRAY_PARAM = FloatArrayParam("float_array_param", "Description", (10.0, 11.0)) STRING_ARRAY_PARAM = StringArrayParam("string_array_param", "Description", ("14", "15")) EXTRA_INT_PARAM = IntParam("extra_int_param", "Description", 20, ParamValidators.always_true()) PARAM_WITH_NONE_DEFAULT = IntParam( "param_with_none_default", "Must be explicitly set with a non-none value", None, ParamValidators.not_null())