Exemple #1
0
    def transform(self, frame=None, holdout_type=None, noise=-1, seed=-1):
        """
        Apply target encoding to `te_columns` of the given frame.

        The encoding maps produced by the `TargetEncoder.fit()` call are stored on this object, so they must
        not be passed in manually.

        :param frame frame: the frame to which the target encoding transformation is applied.
        :param str holdout_type: one of the supported strategies:

                1) "kfold" - encodings for a fold are generated based on out-of-fold data.
                2) "loo" - leave one out. Current row's response value is subtracted from the pre-calculated
                   per-level frequencies.
                3) "none" - nothing is held out; the whole frame is used for training.

        :param float noise: the amount of random noise added to the target encoding, which helps prevent
            overfitting. Documented as defaulting to 0.01 * range of y (the signature default is -1).
        :param int seed: a random seed used to generate draws from the uniform distribution for random noise.
            Defaults to -1.
        :returns: the H2OFrame built from the `target.encoder.transform` expression.
        """
        assert_is_type(holdout_type, "kfold", "loo", "none")

        # The encoding-map keys must line up with the encoded columns so that
        # the frames are sent to the backend in the same order.
        map_keys = self._encodingMap.map_keys['string']
        assert map_keys == self._teColumns

        map_frame_keys = [entry['key']['name'] for entry in self._encodingMap.frames]

        node = ExprNode(
            "target.encoder.transform",
            map_keys,
            map_frame_keys,
            frame,
            self._teColumns,
            holdout_type,
            self._responseColumnName,
            self._foldColumnName,
            self._blending,
            self._inflectionPoint,
            self._smoothing,
            noise,
            seed,
        )
        return H2OFrame._expr(expr=node)
Exemple #2
0
    def transform(self, frame=None, holdout_type=None, noise=-1, seed=-1):
        """
        Encode `te_columns` of a frame using the maps learned by `TargetEncoder.fit()`.

        The encodings are kept internally after `.fit()` has been called; do not pass them in by hand.

        :param frame frame: the frame that receives the target encoding transformation.
        :param str holdout_type: Supported options:

                1) "kfold" - encodings for a fold are generated based on out-of-fold data.
                2) "loo" - leave one out. Current row's response value is subtracted from the pre-calculated
                   per-level frequencies.
                3) "none" - we do not hold out anything and use the whole frame for training.

        :param float noise: the amount of random noise added to the target encoding to help prevent
            overfitting. Documented as defaulting to 0.01 * range of y (the signature default is -1).
        :param int seed: a random seed used to generate draws from the uniform distribution for random noise.
            Defaults to -1.
        :returns: the H2OFrame produced by the `target.encoder.transform` expression.
        """
        assert_is_type(holdout_type, "kfold", "loo", "none")

        # Frames have to be sent in the same order as the encoded columns.
        keys = self._encodingMap.map_keys['string']
        assert keys == self._teColumns

        frame_names = []
        for encoding_frame in self._encodingMap.frames:
            frame_names.append(encoding_frame['key']['name'])

        transform_args = (
            keys, frame_names, frame, self._teColumns, holdout_type,
            self._responseColumnName, self._foldColumnName,
            self._blending, self._inflectionPoint, self._smoothing,
            noise, seed,
        )
        return H2OFrame._expr(expr=ExprNode("target.encoder.transform", *transform_args))
Exemple #3
0
    def transform(self, is_train_or_valid, frame=None, holdout_type=None, noise=-1, seed=-1):
        """
        Apply target encoding to `te_columns` using the maps generated by `TargetEncoder.fit()`.

        The encodings are stored internally once `.fit()` has been called and must not be passed manually.

        :param bool is_train_or_valid: explicitly specify type of the data; forwarded as the last argument
            of the `target.encoder.transform` expression.
        :param frame frame: the frame to which the target encoding transformation is applied.
        :param str holdout_type:
            Supported options:
                1) "kfold" - encodings for a fold are generated based on out-of-fold data.
                2) "loo" - leave one out. Current row's response value is subtracted from the pre-calculated
                   per-level frequencies.
                3) "none" - nothing is held out; the whole frame is used for training.
        :param float noise: amount of noise to add to the final target encodings.
        :param int seed: set to a fixed value for reproducibility.
        :returns: the H2OFrame built from the `target.encoder.transform` expression.
        """
        assert_is_type(holdout_type, "kfold", "loo", "none")

        # Encoding-map keys and encoded columns must match so that frames are
        # sent in the same order.
        map_keys = self._encodingMap.map_keys['string']
        assert map_keys == self._teColumns

        map_frame_keys = [item['key']['name'] for item in self._encodingMap.frames]

        node = ExprNode(
            "target.encoder.transform",
            map_keys,
            map_frame_keys,
            frame,
            self._teColumns,
            holdout_type,
            self._responseColumnName,
            self._foldColumnName,
            self._blending,
            self._inflectionPoint,
            self._smoothing,
            noise,
            seed,
            is_train_or_valid,
        )
        return H2OFrame._expr(expr=node)
Exemple #4
0
 def result(self):
     """
     Fetch the result frame describing the model building process (e.g. for modelselection and anovaglm).

     :return: the H2OFrame that contains information about the model building process.
     """
     # Build the "result" expression for this model's key, then materialise
     # the frame with its cache filled.
     node = ExprNode("result", ASTId(self.key))
     lazy_frame = H2OFrame._expr(expr=node)
     return lazy_frame._frame(fill_cache=True)
Exemple #5
0
    def as_frame(self):
        """
        Return this collection of models as a table.

        :returns: An H2OFrame whose first columns identify the input segments; the remaining columns describe
            the built models.
        """
        models_ref = ASTId(self._segment_models_id)
        node = ExprNode("segment_models_as_frame", models_ref)
        lazy_frame = H2OFrame._expr(expr=node)
        return lazy_frame._frame(fill_cache=True)
Exemple #6
0
 def transform_frame(self, fr):
     """
     Return the X factor computed for a new dataset.

     GLRM performs A=X*Y during training.  When a new dataset is given, GLRM will perform Anew = Xnew*Y.  When
     predict is called, Xnew*Y is returned.  When transform_frame is called, Xnew is returned instead.

     :param fr: the frame to transform; its ``key`` is passed to the "transform" expression.
     :return: an H2OFrame that contains Xnew.
     """
     model_ref = ASTId(self.key)
     frame_ref = ASTId(fr.key)
     node = ExprNode("transform", model_ref, frame_ref)
     return H2OFrame._expr(expr=node)._frame(fill_cache=True)
Exemple #7
0
    def predict_rules(self, frame, rule_ids):
        """
        Check which of the given rules apply to each observation of the given data.

        :param frame: H2OFrame on which rule validity is to be evaluated
        :param rule_ids: string array of rule ids to be evaluated against the frame
        :return: H2OFrame with one column per input rule id, flagging whether the rule is applied to the
            observation or not.
        """
        from h2o.expr import ExprNode
        from h2o.frame import H2OFrame
        from h2o.utils.typechecks import assert_is_type

        assert_is_type(frame, H2OFrame)
        node = ExprNode("rulefit.predict.rules", self, frame, rule_ids)
        return H2OFrame._expr(expr=node)
Exemple #8
0
        def rbind(*data):
            """
            Row-bind the given frames, validating them against the first one.

            Every frame must have the same number of columns, the same column names and the same column
            types as the first frame; otherwise a ValueError is raised.  The row count cached on the
            resulting frame is set to the sum of the inputs' row counts.

            :param data: the frames to bind; the first one is the reference frame.
            :returns: the frame built from the "rbind" expression.
            """
            head = data[0]
            total_rows = 0

            for candidate in data:
                if candidate.ncol != head.ncol:
                    message = ("Cannot row-bind a dataframe with %d columns to a data frame with %d columns: "
                               "the columns must match") % (candidate.ncol, head.ncol)
                    raise ValueError(message)
                # Names are compared first; types are only inspected when the names agree.
                if candidate.columns != head.columns:
                    raise ValueError("Column names and types must match for rbind() to work")
                if candidate.types != head.types:
                    raise ValueError("Column names and types must match for rbind() to work")
                total_rows += candidate.nrow

            remaining = data[1:]
            node = ExprNode("rbind", head, *remaining)
            bound = H2OFrame._expr(expr=node, cache=head._ex._cache)
            bound._ex._cache.nrows = total_rows
            return bound
Exemple #9
0
    def transform(self, data, allow_timestamps=False):
        """
        Run an H2OFrame through this MOJO Pipeline.

        :param data: Frame to be transformed.
        :param allow_timestamps: Allows datetime columns to be used directly with MOJO pipelines. It is
            recommended to parse your datetime columns as Strings when using pipelines because pipelines can
            interpret certain datetime formats in a different way. If your H2OFrame is parsed from a binary
            file format (eg. Parquet) instead of CSV it is safe to turn this option on and use datetime
            columns directly.

        :returns: A new H2OFrame.
        """
        assert_is_type(data, H2OFrame)
        assert_is_type(allow_timestamps, bool)

        pipeline_key = self.pipeline_id[0]
        node = ExprNode("mojo.pipeline.transform", pipeline_key, data, allow_timestamps)
        return H2OFrame._expr(node)
Exemple #10
0
        def rbind(*data):
            """
            Stack the given frames row-wise after checking they are compatible with the first frame.

            A ValueError is raised when a frame differs from the first one in column count, column names or
            column types.  The resulting frame's cached row count is the total of the inputs' row counts.

            :param data: the frames to bind; the first one acts as the reference.
            :returns: the frame built from the "rbind" expression.
            """
            reference = data[0]
            rows_so_far = 0

            for current in data:
                if current.ncol != reference.ncol:
                    width_error = ("Cannot row-bind a dataframe with %d columns to a data frame with %d columns: "
                                   "the columns must match") % (current.ncol, reference.ncol)
                    raise ValueError(width_error)
                # Column names are checked before types; types are read only if the names match.
                if current.columns != reference.columns:
                    raise ValueError("Column names and types must match for rbind() to work")
                if current.types != reference.types:
                    raise ValueError("Column names and types must match for rbind() to work")
                rows_so_far += current.nrow

            others = data[1:]
            rbind_node = ExprNode("rbind", reference, *others)
            stacked = H2OFrame._expr(expr=rbind_node, cache=reference._ex._cache)
            stacked._ex._cache.nrows = rows_so_far
            return stacked
Exemple #11
0
    def transform(self, frame=None, holdout_type=None, noise=-1, seed=-1):
        """
        Deprecated API. Please use H2OTargetEncoderEstimator instead.

        Apply target encoding to `te_columns` using the encoding maps generated during the
        `TargetEncoder.fit()` call.  The encodings are stored internally after `.fit()` has been called and
        must not be passed manually.

        :param frame frame: the frame to which the target encoding transformation is applied.
        :param str holdout_type: Supported options:

                1) "kfold" - encodings for a fold are generated based on out-of-fold data.
                2) "loo" - leave one out. Current row's response value is subtracted from the pre-calculated
                   per-level frequencies.
                3) "none" - nothing is held out; the whole frame is used for training.

        :param float noise: the amount of random noise added to the target encoding to help prevent
            overfitting. Documented as defaulting to 0.01 * range of y (the signature default is -1).
        :param int seed: a random seed used to generate draws from the uniform distribution for random noise.
            Defaults to -1.
        :returns: the H2OFrame built from the `target.encoder.transform` expression.
        :raises ValueError: if `holdout_type` is "kfold" but the encoding map was created without a fold
            column.

        :example:
        >>> targetEncoder = TargetEncoder(x=te_columns, y=responseColumnName, blended_avg=True, inflection_point=10, smoothing=20)
        >>> encodedTrain = targetEncoder.transform(frame=trainFrame, holdout_type="kfold", seed=1234)
        """
        assert_is_type(holdout_type, "kfold", "loo", "none")

        wants_kfold = holdout_type == "kfold"
        if wants_kfold and self._foldColumnName == '':
            raise ValueError(
                "Attempt to use kfold strategy when encoding map was created without fold column being specified."
            )

        # Noise combined with the "none" strategy only triggers a warning; the
        # call still proceeds.
        no_holdout = holdout_type == "none"
        if no_holdout and noise != 0:
            warnings.warn("Attempt to apply noise with holdout_type=`none` strategy", stacklevel=2)

        map_keys = self._encodingMap.map_keys['string']
        map_frame_keys = [entry['key']['name'] for entry in self._encodingMap.frames]

        node = ExprNode(
            "target.encoder.transform",
            map_keys,
            map_frame_keys,
            frame,
            self._teColumns,
            holdout_type,
            self._responseColumnName,
            self._foldColumnName,
            self._blending,
            self._inflectionPoint,
            self._smoothing,
            noise,
            seed,
        )
        return H2OFrame._expr(expr=node)