class ColumnPruner(H2OStageBase, JavaTransformer):
    """Python wrapper around the JVM stage ``ai.h2o.sparkling.ml.features.ColumnPruner``.

    The stage exposes two params:

    * ``keep`` -- boolean, documented as "keep the specified columns in the frame".
    * ``columns`` -- list of column names the stage operates on.
    """

    keep = Param(
        Params._dummy(),
        "keep",
        "keep the specified columns in the frame",
        H2OTypeConverters.toBoolean())

    columns = Param(
        Params._dummy(),
        "columns",
        "specified columns",
        H2OTypeConverters.toListString())

    @keyword_only
    def __init__(self, keep=False, columns=[]):
        """Create the backing Java object and apply the keyword arguments.

        The signature defaults are not read here; only the values returned by
        ``Utils.getInputKwargs(self)`` are applied, on top of the defaults
        pulled from the Java object.
        """
        Initializer.load_sparkling_jar()
        super(ColumnPruner, self).__init__()
        java_class_name = "ai.h2o.sparkling.ml.features.ColumnPruner"
        self._java_obj = self._new_java_obj(java_class_name, self.uid)
        # Java-side defaults first, so explicitly passed kwargs override them.
        self._setDefaultValuesFromJava()
        self._set(**Utils.getInputKwargs(self))

    # --- keep ---------------------------------------------------------------
    def getKeep(self):
        """Return the current (or default) value of ``keep``."""
        return self.getOrDefault(self.keep)

    def setKeep(self, value):
        """Set ``keep`` and return the result of ``self._set``."""
        return self._set(keep=value)

    # --- columns ------------------------------------------------------------
    def getColumns(self):
        """Return the current (or default) value of ``columns``."""
        return self.getOrDefault(self.columns)

    def setColumns(self, value):
        """Set ``columns`` and return the result of ``self._set``."""
        return self._set(columns=value)
class H2OAutoMLParams(H2OCommonSupervisedParams, HasMonotoneConstraints):
    """Param definitions and accessors specific to H2O AutoML.

    Every getter reads the param through ``self.getOrDefault`` and every
    setter delegates to ``self._set`` and returns its result.
    """

    ##
    # Param definitions
    ##
    ignoredCols = Param(
        Params._dummy(),
        "ignoredCols",
        "Ignored column names",
        H2OTypeConverters.toListString())

    includeAlgos = Param(
        Params._dummy(),
        "includeAlgos",
        "Algorithms to include when using automl",
        H2OTypeConverters.toEnumListString("ai.h2o.automl.Algo"))

    excludeAlgos = Param(
        Params._dummy(),
        "excludeAlgos",
        "Algorithms to exclude when using automl",
        H2OTypeConverters.toEnumListString("ai.h2o.automl.Algo"))

    projectName = Param(
        Params._dummy(),
        "projectName",
        "identifier for models that should be grouped together in the leaderboard "
        "(e.g., airlines and iris)",
        H2OTypeConverters.toNullableString())

    maxRuntimeSecs = Param(
        Params._dummy(),
        "maxRuntimeSecs",
        "Maximum time in seconds for automl to be running",
        H2OTypeConverters.toFloat())

    stoppingRounds = Param(
        Params._dummy(),
        "stoppingRounds",
        "Stopping rounds",
        H2OTypeConverters.toInt())

    stoppingTolerance = Param(
        Params._dummy(),
        "stoppingTolerance",
        "Stopping tolerance",
        H2OTypeConverters.toFloat())

    stoppingMetric = Param(
        Params._dummy(),
        "stoppingMetric",
        "Stopping metric",
        H2OTypeConverters.toEnumString("hex.ScoreKeeper$StoppingMetric"))

    sortMetric = Param(
        Params._dummy(),
        "sortMetric",
        "Sort metric for the AutoML leaderboard",
        H2OTypeConverters.toEnumString("ai.h2o.sparkling.ml.algos.H2OAutoMLSortMetric"))

    balanceClasses = Param(
        Params._dummy(),
        "balanceClasses",
        "Balance classes",
        H2OTypeConverters.toBoolean())

    classSamplingFactors = Param(
        Params._dummy(),
        "classSamplingFactors",
        "Class sampling factors",
        H2OTypeConverters.toNullableListFloat())

    maxAfterBalanceSize = Param(
        Params._dummy(),
        "maxAfterBalanceSize",
        "Max after balance size",
        H2OTypeConverters.toFloat())

    keepCrossValidationPredictions = Param(
        Params._dummy(),
        "keepCrossValidationPredictions",
        "Keep cross validation predictions",
        H2OTypeConverters.toBoolean())

    keepCrossValidationModels = Param(
        Params._dummy(),
        "keepCrossValidationModels",
        "Keep cross validation models",
        H2OTypeConverters.toBoolean())

    maxModels = Param(
        Params._dummy(),
        "maxModels",
        "Max models to train in AutoML",
        H2OTypeConverters.toInt())

    ##
    # Accessors, grouped per parameter (getter first, then setter)
    ##
    def getIgnoredCols(self):
        return self.getOrDefault(self.ignoredCols)

    def setIgnoredCols(self, value):
        return self._set(ignoredCols=value)

    # NOTE(review): no ``tryMutations`` Param is declared in this class. Unless
    # a base class provides it, both accessors below will fail when called --
    # confirm against H2OCommonSupervisedParams / HasMonotoneConstraints.
    def getTryMutations(self):
        return self.getOrDefault(self.tryMutations)

    def setTryMutations(self, value):
        return self._set(tryMutations=value)

    def getIncludeAlgos(self):
        return self.getOrDefault(self.includeAlgos)

    def setIncludeAlgos(self, value):
        return self._set(includeAlgos=value)

    def getExcludeAlgos(self):
        return self.getOrDefault(self.excludeAlgos)

    def setExcludeAlgos(self, value):
        return self._set(excludeAlgos=value)

    def getProjectName(self):
        return self.getOrDefault(self.projectName)

    def setProjectName(self, value):
        return self._set(projectName=value)

    def getMaxRuntimeSecs(self):
        return self.getOrDefault(self.maxRuntimeSecs)

    def setMaxRuntimeSecs(self, value):
        return self._set(maxRuntimeSecs=value)

    def getStoppingRounds(self):
        return self.getOrDefault(self.stoppingRounds)

    def setStoppingRounds(self, value):
        return self._set(stoppingRounds=value)

    def getStoppingTolerance(self):
        return self.getOrDefault(self.stoppingTolerance)

    def setStoppingTolerance(self, value):
        return self._set(stoppingTolerance=value)

    def getStoppingMetric(self):
        return self.getOrDefault(self.stoppingMetric)

    def setStoppingMetric(self, value):
        return self._set(stoppingMetric=value)

    def getSortMetric(self):
        return self.getOrDefault(self.sortMetric)

    def setSortMetric(self, value):
        return self._set(sortMetric=value)

    def getBalanceClasses(self):
        return self.getOrDefault(self.balanceClasses)

    def setBalanceClasses(self, value):
        return self._set(balanceClasses=value)

    def getClassSamplingFactors(self):
        return self.getOrDefault(self.classSamplingFactors)

    def setClassSamplingFactors(self, value):
        return self._set(classSamplingFactors=value)

    def getMaxAfterBalanceSize(self):
        return self.getOrDefault(self.maxAfterBalanceSize)

    def setMaxAfterBalanceSize(self, value):
        return self._set(maxAfterBalanceSize=value)

    def getKeepCrossValidationPredictions(self):
        return self.getOrDefault(self.keepCrossValidationPredictions)

    def setKeepCrossValidationPredictions(self, value):
        return self._set(keepCrossValidationPredictions=value)

    def getKeepCrossValidationModels(self):
        return self.getOrDefault(self.keepCrossValidationModels)

    def setKeepCrossValidationModels(self, value):
        return self._set(keepCrossValidationModels=value)

    def getMaxModels(self):
        return self.getOrDefault(self.maxModels)

    def setMaxModels(self, value):
        return self._set(maxModels=value)
class H2OCommonParams(H2OBaseMOJOParams):
    """Params shared by H2O algorithms, plus setters for the MOJO-level params.

    Declares ``validationDataFrame``, ``splitRatio`` and
    ``columnsToCategorical`` and provides setters for params that are
    referenced here by name only (``predictionCol``, ``featuresCols``, ...).
    """

    ##
    # Param definitions
    ##
    validationDataFrame = Param(
        Params._dummy(),
        "validationDataFrame",
        "A data frame dedicated for a validation of the trained model. If the parameters is not set," +
        "a validation frame created via the 'splitRatio' parameter.",
        H2OTypeConverters.toNullableDataFrame())

    splitRatio = Param(
        Params._dummy(),
        "splitRatio",
        "Accepts values in range [0, 1.0] which determine how large part of dataset is used for training"
        " and for validation. For example, 0.8 -> 80% training 20% validation.",
        H2OTypeConverters.toFloat())

    columnsToCategorical = Param(
        Params._dummy(),
        "columnsToCategorical",
        "List of columns to convert to categorical before modelling",
        H2OTypeConverters.toListString())

    ##
    # Getters
    ##
    def getValidationDataFrame(self):
        """Return the current (or default) value of ``validationDataFrame``."""
        return self.getOrDefault(self.validationDataFrame)

    def getSplitRatio(self):
        """Return the current (or default) value of ``splitRatio``."""
        return self.getOrDefault(self.splitRatio)

    def getColumnsToCategorical(self):
        """Return the current (or default) value of ``columnsToCategorical``."""
        return self.getOrDefault(self.columnsToCategorical)

    ##
    # Setters
    ##
    def setValidationDataFrame(self, value):
        """Set ``validationDataFrame`` and return the result of ``self._set``."""
        return self._set(validationDataFrame=value)

    def setSplitRatio(self, value):
        """Set ``splitRatio`` and return the result of ``self._set``."""
        return self._set(splitRatio=value)

    def setColumnsToCategorical(self, value, *args):
        """Set the columns that are converted to categorical before modelling.

        Accepts either a list of column names or a single column name,
        optionally followed by further column names as positional arguments::

            setColumnsToCategorical(["a", "b"])
            setColumnsToCategorical("a", "b", "c")

        :param value: a column name or a list of column names
        :param args: additional column names appended after ``value``
        :return: the result of ``self._set``
        """
        assert_is_type(value, [str], str)
        # Build a fresh list so the caller's list is never mutated and a single
        # string is always wrapped before it reaches the param converter.
        if isinstance(value, str):
            prepared_array = [value]
        else:
            prepared_array = list(value)
        prepared_array.extend(args)
        # The original passed ``value`` here, silently dropping the prepared list.
        return self._set(columnsToCategorical=prepared_array)

    # Setters for parameters which are defined on MOJO as well
    def setPredictionCol(self, value):
        return self._set(predictionCol=value)

    def setDetailedPredictionCol(self, value):
        return self._set(detailedPredictionCol=value)

    def setWithDetailedPredictionCol(self, value):
        """Deprecated no-op: emits a ``DeprecationWarning`` and returns ``self``.

        ``value`` is ignored.
        """
        warnings.warn(
            "The method will be removed without a replacement in the version 3.34. "
            "Detailed prediction columns is enabled by default.",
            DeprecationWarning,
            stacklevel=2)  # attribute the warning to the caller, not this module
        return self

    def setFeaturesCols(self, value):
        return self._set(featuresCols=value)

    def setConvertUnknownCategoricalLevelsToNa(self, value):
        return self._set(convertUnknownCategoricalLevelsToNa=value)

    def setConvertInvalidNumbersToNa(self, value):
        return self._set(convertInvalidNumbersToNa=value)

    def setNamedMojoOutputColumns(self, value):
        return self._set(namedMojoOutputColumns=value)

    def setWithContributions(self, value):
        return self._set(withContributions=value)

    def setWithLeafNodeAssignments(self, value):
        return self._set(withLeafNodeAssignments=value)

    def setWithStageResults(self, value):
        return self._set(withStageResults=value)
class H2OCommonParams(H2OMOJOAlgoSharedParams):
    """Params shared by H2O algorithms, plus setters for the MOJO-level params.

    Declares the fold/weight columns, split ratio, seed, number of folds and
    the categorical-conversion options, and provides setters for params that
    are referenced here by name only (``predictionCol``, ``featuresCols``, ...).
    """

    foldCol = Param(
        Params._dummy(),
        "foldCol",
        "Fold column name",
        H2OTypeConverters.toNullableString())

    weightCol = Param(
        Params._dummy(),
        "weightCol",
        "Weight column name",
        H2OTypeConverters.toNullableString())

    splitRatio = Param(
        Params._dummy(),
        "splitRatio",
        "Accepts values in range [0, 1.0] which determine how large part of dataset is used for training"
        " and for validation. For example, 0.8 -> 80% training 20% validation.",
        H2OTypeConverters.toFloat())

    seed = Param(
        Params._dummy(),
        "seed",
        "Used to specify seed to reproduce the model run",
        H2OTypeConverters.toInt())

    nfolds = Param(
        Params._dummy(),
        "nfolds",
        "Number of fold columns",
        H2OTypeConverters.toInt())

    allStringColumnsToCategorical = Param(
        Params._dummy(),
        "allStringColumnsToCategorical",
        "Transform all strings columns to categorical",
        H2OTypeConverters.toBoolean())

    columnsToCategorical = Param(
        Params._dummy(),
        "columnsToCategorical",
        "List of columns to convert to categorical before modelling",
        H2OTypeConverters.toListString())

    ##
    # Getters
    ##
    def getFoldCol(self):
        """Return the current (or default) value of ``foldCol``."""
        return self.getOrDefault(self.foldCol)

    def getWeightCol(self):
        """Return the current (or default) value of ``weightCol``."""
        return self.getOrDefault(self.weightCol)

    def getSplitRatio(self):
        """Return the current (or default) value of ``splitRatio``."""
        return self.getOrDefault(self.splitRatio)

    def getSeed(self):
        """Return the current (or default) value of ``seed``."""
        return self.getOrDefault(self.seed)

    def getNfolds(self):
        """Return the current (or default) value of ``nfolds``."""
        return self.getOrDefault(self.nfolds)

    def getAllStringColumnsToCategorical(self):
        """Return the current (or default) value of ``allStringColumnsToCategorical``."""
        return self.getOrDefault(self.allStringColumnsToCategorical)

    def getColumnsToCategorical(self):
        """Return the current (or default) value of ``columnsToCategorical``."""
        return self.getOrDefault(self.columnsToCategorical)

    ##
    # Setters
    ##
    def setFoldCol(self, value):
        return self._set(foldCol=value)

    def setWeightCol(self, value):
        return self._set(weightCol=value)

    def setSplitRatio(self, value):
        return self._set(splitRatio=value)

    def setSeed(self, value):
        return self._set(seed=value)

    def setNfolds(self, value):
        return self._set(nfolds=value)

    def setAllStringColumnsToCategorical(self, value):
        return self._set(allStringColumnsToCategorical=value)

    def setColumnsToCategorical(self, value, *args):
        """Set the columns that are converted to categorical before modelling.

        Accepts either a list of column names or a single column name,
        optionally followed by further column names as positional arguments::

            setColumnsToCategorical(["a", "b"])
            setColumnsToCategorical("a", "b", "c")

        :param value: a column name or a list of column names
        :param args: additional column names appended after ``value``
        :return: the result of ``self._set``
        """
        assert_is_type(value, [str], str)
        # Build a fresh list so the caller's list is never mutated and a single
        # string is always wrapped before it reaches the param converter.
        if isinstance(value, str):
            prepared_array = [value]
        else:
            prepared_array = list(value)
        prepared_array.extend(args)
        # The original passed ``value`` here, silently dropping the prepared list.
        return self._set(columnsToCategorical=prepared_array)

    # Setters for parameters which are defined on MOJO as well
    def setPredictionCol(self, value):
        return self._set(predictionCol=value)

    def setDetailedPredictionCol(self, value):
        return self._set(detailedPredictionCol=value)

    def setWithDetailedPredictionCol(self, value):
        return self._set(withDetailedPredictionCol=value)

    def setFeaturesCols(self, value):
        return self._set(featuresCols=value)

    def setConvertUnknownCategoricalLevelsToNa(self, value):
        return self._set(convertUnknownCategoricalLevelsToNa=value)

    def setConvertInvalidNumbersToNa(self, value):
        return self._set(convertInvalidNumbersToNa=value)

    def setNamedMojoOutputColumns(self, value):
        return self._set(namedMojoOutputColumns=value)
class H2OMOJOAlgoSharedParams(Params):
    """Param definitions and getters for the prediction-related MOJO options.

    Only getters are defined here; each one reads its param through
    ``self.getOrDefault``.
    """

    ##
    # Param definitions
    ##
    predictionCol = Param(
        Params._dummy(),
        "predictionCol",
        "Prediction column name",
        H2OTypeConverters.toString())

    detailedPredictionCol = Param(
        Params._dummy(),
        "detailedPredictionCol",
        "Column containing additional prediction details, its content depends on the model type.",
        H2OTypeConverters.toString())

    withDetailedPredictionCol = Param(
        Params._dummy(),
        "withDetailedPredictionCol",
        "Enables or disables generating additional prediction column, but with more details",
        H2OTypeConverters.toBoolean())

    featuresCols = Param(
        Params._dummy(),
        "featuresCols",
        "Name of feature columns",
        H2OTypeConverters.toListString())

    convertUnknownCategoricalLevelsToNa = Param(
        Params._dummy(),
        "convertUnknownCategoricalLevelsToNa",
        "If set to 'true', the model converts unknown categorical levels to NA during making predictions.",
        H2OTypeConverters.toBoolean())

    convertInvalidNumbersToNa = Param(
        Params._dummy(),
        "convertInvalidNumbersToNa",
        "If set to 'true', the model converts invalid numbers to NA during making predictions.",
        H2OTypeConverters.toBoolean())

    namedMojoOutputColumns = Param(
        Params._dummy(),
        "namedMojoOutputColumns",
        "Mojo Output is not stored in the array but in the properly named columns",
        H2OTypeConverters.toBoolean())

    ##
    # Getters
    ##
    def getPredictionCol(self):
        """Return the current (or default) value of ``predictionCol``."""
        return self.getOrDefault(self.predictionCol)

    def getDetailedPredictionCol(self):
        """Return the current (or default) value of ``detailedPredictionCol``."""
        return self.getOrDefault(self.detailedPredictionCol)

    def getWithDetailedPredictionCol(self):
        """Return the current (or default) value of ``withDetailedPredictionCol``."""
        return self.getOrDefault(self.withDetailedPredictionCol)

    def getFeaturesCols(self):
        """Return the current (or default) value of ``featuresCols``."""
        return self.getOrDefault(self.featuresCols)

    def getConvertUnknownCategoricalLevelsToNa(self):
        """Return the current (or default) value of ``convertUnknownCategoricalLevelsToNa``."""
        return self.getOrDefault(self.convertUnknownCategoricalLevelsToNa)

    def getConvertInvalidNumbersToNa(self):
        """Return the current (or default) value of ``convertInvalidNumbersToNa``."""
        return self.getOrDefault(self.convertInvalidNumbersToNa)

    def getNamedMojoOutputColumns(self):
        """Return the current (or default) value of ``namedMojoOutputColumns``."""
        return self.getOrDefault(self.namedMojoOutputColumns)
class H2OTargetEncoderParams(Params):
    """Param definitions and getters for the H2O target encoder.

    All getters read through ``self.getOrDefault``. ``getOutputCols`` also
    derives names from the input columns when no output columns are set.
    """

    ##
    # Param definitions
    ##
    foldCol = Param(
        Params._dummy(),
        "foldCol",
        "Fold column name",
        H2OTypeConverters.toNullableString())

    labelCol = Param(
        Params._dummy(),
        "labelCol",
        "Label column name",
        H2OTypeConverters.toString())

    inputCols = Param(
        Params._dummy(),
        "inputCols",
        "Names of columns that will be transformed",
        H2OTypeConverters.toListString())

    outputCols = Param(
        Params._dummy(),
        "outputCols",
        "Names of columns representing the result of target encoding",
        H2OTypeConverters.toListString())

    holdoutStrategy = Param(
        Params._dummy(),
        "holdoutStrategy",
        """A strategy deciding what records will be excluded when calculating the target average on the training dataset. Options: None - All rows are considered for the calculation LeaveOneOut - All rows except the row the calculation is made for KFold - Only out-of-fold data is considered (The option requires foldCol to be set.""",
        H2OTypeConverters.toEnumString(
            "ai.h2o.targetencoding.TargetEncoder$DataLeakageHandlingStrategy"))

    blendedAvgEnabled = Param(
        Params._dummy(),
        "blendedAvgEnabled",
        "If set, the target average becomes a weighted average of the posterior average for a given "
        "categorical level and the prior average of the target. The weight is determined by the size "
        "of the given group that the row belongs to. By default, the blended average is disabled.",
        H2OTypeConverters.toBoolean())

    blendedAvgInflectionPoint = Param(
        Params._dummy(),
        "blendedAvgInflectionPoint",
        "A parameter of the blended average. The bigger number is set, the groups relatively bigger to the "
        "overall data set size will consider the global target value as a component in the weighted average. "
        "The default value is 10."
        "",
        H2OTypeConverters.toFloat())

    blendedAvgSmoothing = Param(
        Params._dummy(),
        "blendedAvgSmoothing",
        "A parameter of blended average. Controls the rate of transition between a group target value "
        "and a global target value. The default value is 20.",
        H2OTypeConverters.toFloat())

    noise = Param(
        Params._dummy(),
        "noise",
        "Amount of random noise added to output values. The default value is 0.01",
        H2OTypeConverters.toFloat())

    noiseSeed = Param(
        Params._dummy(),
        "noiseSeed",
        "A seed of the generator producing the random noise",
        H2OTypeConverters.toInt())

    ##
    # Getters
    ##
    def getFoldCol(self):
        """Return the current (or default) value of ``foldCol``."""
        return self.getOrDefault(self.foldCol)

    def getLabelCol(self):
        """Return the current (or default) value of ``labelCol``."""
        return self.getOrDefault(self.labelCol)

    def getInputCols(self):
        """Return the current (or default) value of ``inputCols``."""
        return self.getOrDefault(self.inputCols)

    def getOutputCols(self):
        """Return the output column names.

        When ``outputCols`` is empty or unset (falsy), the names are derived
        by appending ``"_te"`` to each input column name.
        """
        configured = self.getOrDefault(self.outputCols)
        if configured:
            return configured
        return [name + "_te" for name in self.getInputCols()]

    def getHoldoutStrategy(self):
        """Return the current (or default) value of ``holdoutStrategy``."""
        return self.getOrDefault(self.holdoutStrategy)

    def getBlendedAvgEnabled(self):
        """Return the current (or default) value of ``blendedAvgEnabled``."""
        return self.getOrDefault(self.blendedAvgEnabled)

    def getBlendedAvgInflectionPoint(self):
        """Return the current (or default) value of ``blendedAvgInflectionPoint``."""
        return self.getOrDefault(self.blendedAvgInflectionPoint)

    def getBlendedAvgSmoothing(self):
        """Return the current (or default) value of ``blendedAvgSmoothing``."""
        return self.getOrDefault(self.blendedAvgSmoothing)

    def getNoise(self):
        """Return the current (or default) value of ``noise``."""
        return self.getOrDefault(self.noise)

    def getNoiseSeed(self):
        """Return the current (or default) value of ``noiseSeed``."""
        return self.getOrDefault(self.noiseSeed)
class H2OBaseMOJOParams(Params):
    """Param definitions and getters for options shared by H2O MOJO models.

    Only getters are defined here. All of them read through
    ``self.getOrDefault`` except ``getWithDetailedPredictionCol``, which is
    deprecated and always returns ``True``.
    """

    ##
    # Param definitions
    ##
    predictionCol = Param(
        Params._dummy(),
        "predictionCol",
        "Prediction column name",
        H2OTypeConverters.toString())

    detailedPredictionCol = Param(
        Params._dummy(),
        "detailedPredictionCol",
        "Column containing additional prediction details, its content depends on the model type.",
        H2OTypeConverters.toString())

    withDetailedPredictionCol = Param(
        Params._dummy(),
        "withDetailedPredictionCol",
        "Enables or disables generating additional prediction column, but with more details",
        H2OTypeConverters.toBoolean())

    withContributions = Param(
        Params._dummy(),
        "withContributions",
        "Enables or disables generating a sub-column of detailedPredictionCol containing Shapley values.",
        H2OTypeConverters.toBoolean())

    featuresCols = Param(
        Params._dummy(),
        "featuresCols",
        "Name of feature columns",
        H2OTypeConverters.toListString())

    convertUnknownCategoricalLevelsToNa = Param(
        Params._dummy(),
        "convertUnknownCategoricalLevelsToNa",
        "If set to 'true', the model converts unknown categorical levels to NA during making predictions.",
        H2OTypeConverters.toBoolean())

    convertInvalidNumbersToNa = Param(
        Params._dummy(),
        "convertInvalidNumbersToNa",
        "If set to 'true', the model converts invalid numbers to NA during making predictions.",
        H2OTypeConverters.toBoolean())

    namedMojoOutputColumns = Param(
        Params._dummy(),
        "namedMojoOutputColumns",
        "Mojo Output is not stored in the array but in the properly named columns",
        H2OTypeConverters.toBoolean())

    withLeafNodeAssignments = Param(
        Params._dummy(),
        "withLeafNodeAssignments",
        "Enables or disables computation of leaf node assignments.",
        H2OTypeConverters.toBoolean())

    withStageResults = Param(
        Params._dummy(),
        "withStageResults",
        "Enables or disables computation of stage results.",
        H2OTypeConverters.toBoolean())

    ##
    # Getters
    ##
    def getPredictionCol(self):
        """Return the current (or default) value of ``predictionCol``."""
        return self.getOrDefault(self.predictionCol)

    def getDetailedPredictionCol(self):
        """Return the current (or default) value of ``detailedPredictionCol``."""
        return self.getOrDefault(self.detailedPredictionCol)

    def getWithDetailedPredictionCol(self):
        """Deprecated: emits a ``DeprecationWarning`` and always returns ``True``.

        The ``withDetailedPredictionCol`` param is not consulted.
        """
        warnings.warn(
            "The method will be removed without a replacement in the version 3.34. "
            "Detailed prediction columns is always enabled.",
            DeprecationWarning,
            stacklevel=2)  # attribute the warning to the caller, not this module
        return True

    def getWithContributions(self):
        """Return the current (or default) value of ``withContributions``."""
        return self.getOrDefault(self.withContributions)

    def getFeaturesCols(self):
        """Return the current (or default) value of ``featuresCols``."""
        return self.getOrDefault(self.featuresCols)

    def getConvertUnknownCategoricalLevelsToNa(self):
        """Return the current (or default) value of ``convertUnknownCategoricalLevelsToNa``."""
        return self.getOrDefault(self.convertUnknownCategoricalLevelsToNa)

    def getConvertInvalidNumbersToNa(self):
        """Return the current (or default) value of ``convertInvalidNumbersToNa``."""
        return self.getOrDefault(self.convertInvalidNumbersToNa)

    def getNamedMojoOutputColumns(self):
        """Return the current (or default) value of ``namedMojoOutputColumns``."""
        return self.getOrDefault(self.namedMojoOutputColumns)

    def getWithLeafNodeAssignments(self):
        """Return the current (or default) value of ``withLeafNodeAssignments``."""
        return self.getOrDefault(self.withLeafNodeAssignments)

    def getWithStageResults(self):
        """Return the current (or default) value of ``withStageResults``."""
        return self.getOrDefault(self.withStageResults)
class H2OCommonParams(H2OBaseMOJOParams):
    """Params shared by H2O algorithms, plus setters for the MOJO-level params.

    Declares ``splitRatio`` and ``columnsToCategorical`` and provides setters
    for params that are referenced here by name only (``predictionCol``,
    ``featuresCols``, ...).
    """

    ##
    # Param definitions
    ##
    splitRatio = Param(
        Params._dummy(),
        "splitRatio",
        "Accepts values in range [0, 1.0] which determine how large part of dataset is used for training"
        " and for validation. For example, 0.8 -> 80% training 20% validation.",
        H2OTypeConverters.toFloat())

    columnsToCategorical = Param(
        Params._dummy(),
        "columnsToCategorical",
        "List of columns to convert to categorical before modelling",
        H2OTypeConverters.toListString())

    ##
    # Getters
    ##
    def getSplitRatio(self):
        """Return the current (or default) value of ``splitRatio``."""
        return self.getOrDefault(self.splitRatio)

    def getColumnsToCategorical(self):
        """Return the current (or default) value of ``columnsToCategorical``."""
        return self.getOrDefault(self.columnsToCategorical)

    ##
    # Setters
    ##
    def setSplitRatio(self, value):
        """Set ``splitRatio`` and return the result of ``self._set``."""
        return self._set(splitRatio=value)

    def setColumnsToCategorical(self, value, *args):
        """Set the columns that are converted to categorical before modelling.

        Accepts either a list of column names or a single column name,
        optionally followed by further column names as positional arguments::

            setColumnsToCategorical(["a", "b"])
            setColumnsToCategorical("a", "b", "c")

        :param value: a column name or a list of column names
        :param args: additional column names appended after ``value``
        :return: the result of ``self._set``
        """
        assert_is_type(value, [str], str)
        # Build a fresh list so the caller's list is never mutated and a single
        # string is always wrapped before it reaches the param converter.
        if isinstance(value, str):
            prepared_array = [value]
        else:
            prepared_array = list(value)
        prepared_array.extend(args)
        # The original passed ``value`` here, silently dropping the prepared list.
        return self._set(columnsToCategorical=prepared_array)

    # Setters for parameters which are defined on MOJO as well
    def setPredictionCol(self, value):
        return self._set(predictionCol=value)

    def setDetailedPredictionCol(self, value):
        return self._set(detailedPredictionCol=value)

    def setWithDetailedPredictionCol(self, value):
        return self._set(withDetailedPredictionCol=value)

    def setFeaturesCols(self, value):
        return self._set(featuresCols=value)

    def setConvertUnknownCategoricalLevelsToNa(self, value):
        return self._set(convertUnknownCategoricalLevelsToNa=value)

    def setConvertInvalidNumbersToNa(self, value):
        return self._set(convertInvalidNumbersToNa=value)

    def setNamedMojoOutputColumns(self, value):
        return self._set(namedMojoOutputColumns=value)

    def setWithContributions(self, value):
        return self._set(withContributions=value)