コード例 #1
0
ファイル: stackedensemble.py プロジェクト: udapy/h2o-3
    def train(self,
              x=None,
              y=None,
              training_frame=None,
              blending_frame=None,
              **kwargs):
        """
        Train the model, optionally supplying a blending frame.

        :param x: Predictor columns, forwarded to the parent class' ``_train``.
        :param y: Response column, forwarded to the parent class' ``_train``.
        :param training_frame: Training frame, forwarded to the parent class' ``_train``. When neither
            this argument nor ``self.training_frame`` is set, ``blending_frame`` becomes required and
            is used as the training frame as well.
        :param blending_frame: Frame validated by ``H2OFrame._validate``; when not None it is added to
            the training parameters under the ``blending_frame`` key.
        :param kwargs: Extra keyword arguments forwarded to the parent class' ``_train``.
        """
        has_training_frame = training_frame is not None or self.training_frame is not None
        blending_frame = H2OFrame._validate(blending_frame,
                                            'blending_frame',
                                            required=not has_training_frame)

        if not has_training_frame:
            training_frame = blending_frame  # used to bypass default checks in super class and backend and to guarantee default metrics

        def extend_parms(parms):
            # Hook handed to the parent's _train so the parameters can be adjusted before submission.
            if blending_frame is not None:
                parms['blending_frame'] = blending_frame
            if self.metalearner_fold_column is not None:
                # The metalearner fold column must not stay in the ignored list. The entry may be
                # absent or None, or may not contain the column at all, so only remove it when it
                # is really there instead of failing with AttributeError/KeyError/ValueError.
                fold_col = quoted(self.metalearner_fold_column)
                ignored = parms.get('ignored_columns')
                if ignored is not None and fold_col in ignored:
                    ignored.remove(fold_col)

        # NOTE(review): super(self.__class__, self) recurses endlessly if this class is ever
        # subclassed; naming the class explicitly would avoid that.
        super(self.__class__, self)._train(x,
                                           y,
                                           training_frame,
                                           extend_parms_fn=extend_parms,
                                           **kwargs)
コード例 #2
0
    def train(self, x=None, y=None, training_frame=None, blending_frame=None, **kwargs):
        """
        Train the model, optionally supplying a blending frame.

        :param x: Predictor columns, forwarded to the parent class' ``_train``.
        :param y: Response column, forwarded to the parent class' ``_train``.
        :param training_frame: Training frame, forwarded to the parent class' ``_train``.
        :param blending_frame: Optional frame validated by ``H2OFrame._validate``; when not None it is
            added to the training parameters under the ``blending_frame`` key.
        :param kwargs: Extra keyword arguments forwarded to the parent class' ``_train``.
        """
        blending_frame = H2OFrame._validate(blending_frame, 'blending_frame', required=False)

        def extend_parms(parms):
            # Hook handed to the parent's _train so the parameters can be adjusted before submission.
            if blending_frame is not None:
                parms['blending_frame'] = blending_frame
            if self.metalearner_fold_column is not None:
                # The metalearner fold column must not stay in the ignored list. The entry may be
                # absent or None, or may not contain the column at all, so only remove it when it
                # is really there instead of failing with AttributeError/KeyError/ValueError.
                fold_col = quoted(self.metalearner_fold_column)
                ignored = parms.get('ignored_columns')
                if ignored is not None and fold_col in ignored:
                    ignored.remove(fold_col)

        # NOTE(review): super(self.__class__, self) recurses endlessly if this class is ever
        # subclassed; naming the class explicitly would avoid that.
        super(self.__class__, self)._train(x, y, training_frame,
                                           extend_parms_fn=extend_parms,
                                           **kwargs)
コード例 #3
0
    def train(self, x=None, y=None, training_frame=None, blending_frame=None, **kwargs):
        """
        Train the model, passing an optional blending frame along with the usual arguments.

        :param x: Predictor columns, forwarded to the parent class' ``_train``.
        :param y: Response column, forwarded to the parent class' ``_train``.
        :param training_frame: Training frame, forwarded to the parent class' ``_train``.
        :param blending_frame: Optional frame validated by ``H2OFrame._validate``; when not None it is
            added to the training parameters under the ``blending_frame`` key.
        :param kwargs: Extra keyword arguments forwarded to the parent class' ``_train``.
        """
        validated_blending = H2OFrame._validate(blending_frame, 'blending_frame', required=False)

        def add_blending_frame(train_parms):
            # Nothing to contribute when no blending frame was supplied.
            if validated_blending is None:
                return
            train_parms['blending_frame'] = validated_blending

        parent = super(self.__class__, self)
        parent._train(x, y, training_frame, extend_parms_fn=add_blending_frame, **kwargs)
コード例 #4
0
ファイル: stackedensemble.py プロジェクト: zoudongyang/h2o-3
    def train(self,
              x=None,
              y=None,
              training_frame=None,
              blending_frame=None,
              verbose=False,
              **kwargs):
        """
        Train the model and make sure a metalearner was produced.

        :param x: Predictor columns, forwarded to the parent class' ``_make_parms``.
        :param y: Response column, forwarded to the parent class' ``_make_parms``.
        :param training_frame: Training frame, forwarded to the parent class' ``_make_parms``. When
            neither this argument nor ``self.training_frame`` is set, ``blending_frame`` becomes
            required and is used as the training frame as well.
        :param blending_frame: Frame validated by ``H2OFrame._validate``; when not None it is added to
            the training parameters under the ``blending_frame`` key.
        :param verbose: Forwarded to the parent class' ``_train``.
        :param kwargs: Extra keyword arguments forwarded to the parent class' ``_make_parms``.
        :raises H2OResponseError: if ``self.metalearner()`` is None once training has returned.
        """
        has_training_frame = training_frame is not None or self.training_frame is not None
        blending_frame = H2OFrame._validate(blending_frame,
                                            'blending_frame',
                                            required=not has_training_frame)

        if not has_training_frame:
            training_frame = blending_frame  # used to bypass default checks in super class and backend and to guarantee default metrics

        # NOTE(review): super(self.__class__, self) recurses endlessly if this class is ever
        # subclassed; naming the class explicitly would avoid that.
        sup = super(self.__class__, self)

        def extend_parms(parms):
            # Hook handed to _make_parms so the parameters can be adjusted before submission.
            if blending_frame is not None:
                parms['blending_frame'] = blending_frame
            if self.metalearner_fold_column is not None:
                # The metalearner fold column must not stay in the ignored list. The entry may be
                # absent or None, or may not contain the column at all, so only remove it when it
                # is really there instead of failing with AttributeError/KeyError/ValueError.
                fold_col = quoted(self.metalearner_fold_column)
                ignored = parms.get('ignored_columns')
                if ignored is not None and fold_col in ignored:
                    ignored.remove(fold_col)

        parms = sup._make_parms(x,
                                y,
                                training_frame,
                                extend_parms_fn=extend_parms,
                                **kwargs)

        sup._train(parms, verbose=verbose)
        if self.metalearner() is None:
            raise H2OResponseError(
                "Meta learner didn't get to be trained in time. "
                "Try increasing max_runtime_secs or setting it to 0 (unlimited)."
            )
コード例 #5
0
ファイル: stackedensemble.py プロジェクト: zoudongyang/h2o-3
 def blending_frame(self, blending_frame):
     """Run ``blending_frame`` through ``H2OFrame._validate`` and store the result in ``_parms``."""
     validated = H2OFrame._validate(blending_frame, 'blending_frame')
     self._parms["blending_frame"] = validated
コード例 #6
0
ファイル: kmeans.py プロジェクト: kakustham/h2o-3
 def user_points(self, user_points):
     """Run ``user_points`` through ``H2OFrame._validate`` and store the result in ``_parms``."""
     validated = H2OFrame._validate(user_points, 'user_points')
     self._parms["user_points"] = validated
コード例 #7
0
 def beta_constraints(self, beta_constraints):
     """Run ``beta_constraints`` through ``H2OFrame._validate`` and store the result in ``_parms``."""
     validated = H2OFrame._validate(beta_constraints, 'beta_constraints')
     self._parms["beta_constraints"] = validated
コード例 #8
0
 def plug_values(self, plug_values):
     """Run ``plug_values`` through ``H2OFrame._validate`` and store the result in ``_parms``."""
     validated = H2OFrame._validate(plug_values, 'plug_values')
     self._parms["plug_values"] = validated
コード例 #9
0
 def model_key(self, model_key):
     """Run ``model_key`` through ``H2OFrame._validate`` and store the result in ``_parms``."""
     validated = H2OFrame._validate(model_key, 'model_key')
     self._parms["model_key"] = validated
コード例 #10
0
    def train(self, x=None, y=None, training_frame=None, fold_column=None,
              weights_column=None, validation_frame=None, leaderboard_frame=None, blending_frame=None):
        """
        Begins an AutoML task, a background task that automatically builds a number of models
        with various algorithms and tracks their performance in a leaderboard. At any point
        in the process you may use H2O's performance or prediction functions on the resulting
        models.

        :param x: A column name or index, or a list of column names or indices, indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param fold_column: The name or index of the column in training_frame that holds per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds per-row weights.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold_column or weights_column).
        :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets
            nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used
            for early stopping of individual models and early stopping of the grid searches.  By default and
            when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.
        :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard.  This is optional and
            if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard
            rankings instead.
        :param blending_frame: H2OFrame used to train the metalearning algorithm in Stacked Ensembles (instead of relying on cross-validated predicted values).
            This is optional, but when provided, it is also recommended to disable cross validation
            by setting `nfolds=0` and to provide a leaderboard frame for scoring purposes.

        :returns: None. On success the job is polled until it finishes and ``self._fetch()`` is called;
            when the backend response contains no ``job`` entry, the response is printed and the method
            returns early.
        :raises H2OValueError: if ``y`` is None, or if ``y`` or an entry of ``x`` does not refer to a
            column of ``training_frame``.

        :examples:
        >>> # Set up an H2OAutoML object
        >>> aml = H2OAutoML(max_runtime_secs=30)
        >>> # Launch an AutoML run
        >>> aml.train(y=y, training_frame=train)
        """
        training_frame = H2OFrame._validate(training_frame, 'training_frame', required=True)
        ncols = training_frame.ncols
        names = training_frame.names

        # Minimal required arguments are training_frame and y (response)
        if y is None:
            raise H2OValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')
        assert_is_type(y, int, str)
        if is_type(y, int):
            # Integer responses are positions (negative values count from the end).
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        elif y not in names:
            raise H2OValueError("Column %s does not exist in the training frame" % y)

        input_spec = {
            'response_column': y,
            'training_frame': training_frame.frame_id,
        }

        if fold_column is not None:
            assert_is_type(fold_column, int, str)
            input_spec['fold_column'] = fold_column

        if weights_column is not None:
            assert_is_type(weights_column, int, str)
            input_spec['weights_column'] = weights_column

        if validation_frame is not None:
            validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
            input_spec['validation_frame'] = validation_frame.frame_id

        if leaderboard_frame is not None:
            leaderboard_frame = H2OFrame._validate(leaderboard_frame, 'leaderboard_frame')
            input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

        if blending_frame is not None:
            blending_frame = H2OFrame._validate(blending_frame, 'blending_frame')
            input_spec['blending_frame'] = blending_frame.frame_id

        if self.sort_metric is not None:
            assert_is_type(self.sort_metric, str)
            sort_metric = self.sort_metric.lower()
            # Changed the API to use "deviance" to be consistent with stopping_metric values
            # TO DO: let's change the backend to use "deviance" since we use the term "deviance"
            # After that we can take this `if` statement out
            if sort_metric == "deviance":
                sort_metric = "mean_residual_deviance"
            input_spec['sort_metric'] = sort_metric

        if x is not None:
            # Wrap a single column before the list check; the original order made this
            # normalisation unreachable because the type assertion ran first.
            if is_type(x, int, str):
                x = [x]
            assert_is_type(x, list)
            xset = set()
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
            # Everything that is neither a predictor nor the response is ignored, except the
            # fold and weights columns.
            # NOTE(review): fold_column/weights_column given as integer indices never match a
            # column name here, so they would stay in the ignored list -- confirm intended.
            ignored_columns = set(names) - {y} - xset
            for special in (fold_column, weights_column):
                if special is not None:
                    ignored_columns.discard(special)
            input_spec['ignored_columns'] = list(ignored_columns)

        automl_build_params = dict(input_spec=input_spec)

        # NOTE: if the user hasn't specified some block of parameters don't send them!
        # This lets the back end use the defaults.
        automl_build_params['build_control'] = self.build_control
        automl_build_params['build_models'] = self.build_models

        resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
        if 'job' not in resp:
            # No job was created: report what the backend sent and give up without raising.
            print("Exception from the back end: ")
            print(resp)
            return

        if not self.project_name:
            self.build_control['project_name'] = self.project_name = resp['build_control']['project_name']

        self._job = H2OJob(resp['job'], "AutoML")
        poll_updates = ft.partial(self._poll_training_updates, verbosity=self._verbosity, state={})
        try:
            self._job.poll(poll_updates=poll_updates)
        finally:
            # Always emit one last update, even when polling was interrupted.
            poll_updates(self._job, 1)

        self._fetch()
コード例 #11
0
ファイル: word2vec.py プロジェクト: Shysoong/air-3
 def pre_trained(self, pre_trained):
     """Run ``pre_trained`` through ``H2OFrame._validate`` and store the result in ``_parms``."""
     validated = H2OFrame._validate(pre_trained, 'pre_trained')
     self._parms["pre_trained"] = validated
コード例 #12
0
 def training_frame(self, training_frame):
     """Run ``training_frame`` through ``H2OFrame._validate`` and store the result in ``_parms``."""
     validated = H2OFrame._validate(training_frame, 'training_frame')
     self._parms["training_frame"] = validated
コード例 #13
0
 def __validate_frame(self, fr, name=None, required=False):
     """
     Delegate frame validation to ``H2OFrame._validate``.

     :param fr: The value to validate.
     :param name: Passed positionally as the second argument of ``H2OFrame._validate``.
     :param required: Passed as the ``required`` keyword of ``H2OFrame._validate``.
     :returns: Whatever ``H2OFrame._validate`` returns.
     """
     validated = H2OFrame._validate(fr, name, required=required)
     return validated
コード例 #14
0
ファイル: glrm.py プロジェクト: timgates42/h2o-3
 def user_x(self, user_x):
     """Run ``user_x`` through ``H2OFrame._validate`` and store the result in ``_parms``."""
     validated = H2OFrame._validate(user_x, 'user_x')
     self._parms["user_x"] = validated
コード例 #15
0
ファイル: glrm.py プロジェクト: timgates42/h2o-3
 def user_y(self, user_y):
     """Run ``user_y`` through ``H2OFrame._validate`` and store the result in ``_parms``."""
     validated = H2OFrame._validate(user_y, 'user_y')
     self._parms["user_y"] = validated
コード例 #16
0
 def pre_trained(self, pre_trained):
     """
     Validate and store the ``pre_trained`` frame, then refresh ``vec_size`` from it.

     The value is run through ``H2OFrame._validate`` and stored in ``_parms``. When the
     validated value is not None, ``vec_size`` is set to the result of
     ``H2OWord2vecEstimator._determine_vec_size``.
     """
     validated = H2OFrame._validate(pre_trained, 'pre_trained')
     self._parms["pre_trained"] = validated
     if validated is None:
         return
     self.vec_size = H2OWord2vecEstimator._determine_vec_size(validated)
コード例 #17
0
 def _make_parms(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
                 weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
                 model_id=None, verbose=False, extend_parms_fn=None):
     """
     Validate the training arguments and assemble the parameter dict for a training request.

     :param x: Predictor column(s): a name, an index, or a list/set of names and indices.
     :param y: Response column name or index. Ignored (reset to None) when ``self.supervised_learning``
         is false; defaults to ``"response"`` for supervised models.
     :param training_frame: Training frame. When None, ``self.training_frame`` is used if it is set.
     :param offset_column: Name or index of the offset column.
     :param fold_column: Name or index of the fold column.
     :param weights_column: Name or index of the weights column.
     :param validation_frame: Optional validation frame.
     :param max_runtime_secs: When not None, overrides ``max_runtime_secs`` in the parameters.
     :param ignored_columns: Columns to ignore; cannot be combined with ``x``.
     :param model_id: When not None, overrides ``model_id`` in the parameters.
     :param verbose: Only accepted when ``self._options_`` enables ``verbose``.
     :param extend_parms_fn: Optional function called with the parameter dict just before the
         values are passed through ``H2OEstimator._keyify``.
     :returns: A new dict mapping each parameter name to ``H2OEstimator._keyify(value)``.
     :raises H2OValueError: if verbose mode is requested but not available, if a referenced column
         does not exist in the training frame, if ``x`` and ``ignored_columns`` are both given, or if
         ``stopping_metric`` contains ``"r2"``.
     """
     has_default_training_frame = hasattr(self, 'training_frame') and self.training_frame is not None
     # An explicit training frame is only mandatory when the estimator requires one and no default is set.
     training_frame = H2OFrame._validate(training_frame, 'training_frame',
                                         required=self._options_.get('requires_training_frame', True) and not has_default_training_frame)
     validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
     assert_is_type(y, None, int, str)
     assert_is_type(x, None, int, str, [str, int], {str, int})
     assert_is_type(ignored_columns, None, [str, int], {str, int})
     assert_is_type(offset_column, None, int, str)
     assert_is_type(fold_column, None, int, str)
     assert_is_type(weights_column, None, int, str)
     assert_is_type(max_runtime_secs, None, numeric)
     assert_is_type(model_id, None, str)
     assert_is_type(verbose, bool)
     assert_is_type(extend_parms_fn, None, FunctionType)
 
     # True when the caller passed a frame explicitly; the column bookkeeping below only runs in that case.
     override_default_training_frame = training_frame is not None
     if not override_default_training_frame:
         self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
         training_frame = self.training_frame if has_default_training_frame else None
 
     if verbose and not self._options_.get('verbose', False):
         raise H2OValueError("Verbose mode is not available for %s" % self.__class__.__name__)
     # Work on a copy so the estimator's stored parameters are not modified by the assignments below.
     parms = self._parms.copy()
     names = training_frame.names if training_frame is not None else []
     ncols = training_frame.ncols if training_frame is not None else 0
     types = training_frame.types if training_frame is not None else {}
 
     if self.supervised_learning:
         if y is None: y = "response"
         if is_type(y, int):
             # Integer responses are positions; negative values count from the end.
             if not (-ncols <= y < ncols):
                 raise H2OValueError("Column %d does not exist in the training frame" % y)
             y = names[y]
         else:
             if y not in names:
                 raise H2OValueError("Column %s does not exist in the training frame" % y)
         # An "enum" response column makes this a classifier; any other type a regressor.
         self._estimator_type = "classifier" if types[y] == "enum" else "regressor"
     else:
         # If `y` is provided for an unsupervised model we'll simply ignore
         # it. This way an unsupervised model can be used as a step in
         # sklearn's pipeline.
         y = None
         self._estimator_type = "unsupervised"
 
     if override_default_training_frame:
         assert_is_type(y, str, None)
         ignored_columns_set = set()
         # Fall back to ignored_columns stored in the estimator parameters when none were passed in.
         if ignored_columns is None and "ignored_columns" in parms:
             ignored_columns = parms['ignored_columns']
         if ignored_columns is not None:
             if x is not None:
                 raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
             # Resolve every ignored column (name or index) to a column name.
             for ic in ignored_columns:
                 if is_type(ic, int):
                     if not (-ncols <= ic < ncols):
                         raise H2OValueError("Column %d does not exist in the training frame" % ic)
                     ignored_columns_set.add(names[ic])
                 else:
                     if ic not in names:
                         raise H2OValueError("Column %s not in the training frame" % ic)
                     ignored_columns_set.add(ic)
         if x is None:
             # No predictors given: use every column except the response and the ignored ones.
             xset = set(names) - {y} - ignored_columns_set
         else:
             xset = set()
             if is_type(x, int, str): x = [x]
             for xi in x:
                 if is_type(xi, int):
                     if not (-ncols <= xi < ncols):
                         raise H2OValueError("Column %d does not exist in the training frame" % xi)
                     xset.add(names[xi])
                 else:
                     if xi not in names:
                         raise H2OValueError("Column %s not in the training frame" % xi)
                     xset.add(xi)
         x = list(xset)
         self._check_and_save_parm(parms, "offset_column", offset_column)
         self._check_and_save_parm(parms, "weights_column", weights_column)
         self._check_and_save_parm(parms, "fold_column", fold_column)
 
     if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs
 
     # Overwrites the model_id parameter only if model_id is passed
     if model_id is not None:
         parms["model_id"] = model_id
     if override_default_training_frame:
         parms["training_frame"] = training_frame
         # Read back the special columns; presumably populated by _check_and_save_parm above -- TODO confirm.
         offset = parms["offset_column"]
         folds = parms["fold_column"]
         weights = parms["weights_column"]
 
     if validation_frame is not None:
         parms["validation_frame"] = validation_frame
 
     if is_type(y, int):
         y = names[y]
     if y is not None:
         parms["response_column"] = y
     if not isinstance(x, (list, tuple)):
         x = [x]
     # The length check avoids indexing into an empty predictor list.
     if len(x) > 0 and is_type(x[0], int):
         x = [names[i] for i in x]
     if override_default_training_frame:
         # Every frame column that is not a predictor, the response or a special column is ignored.
         ignored_columns = list(set(names) - set(x + [y, offset, folds, weights]))
         parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
     parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                              else [quoted(col) for col in parms["interactions"]])
     parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None
                                   else [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])
 
     # internal hook allowing subclasses to extend train parms 
     if extend_parms_fn is not None:
         extend_parms_fn(parms)
 
     parms = {k: H2OEstimator._keyify(v) for k, v in parms.items()}
     # `or []` keeps the membership test valid when stopping_metric is missing or None.
     if "r2" in (parms.get('stopping_metric') or []):
         raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
     return parms
コード例 #18
0
ファイル: estimator_base.py プロジェクト: lizhaodong/h2o-3
    def _train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
              model_id=None, verbose=False, extend_parms_fn=None):
        """
        Validate the training arguments, submit a model build request and resolve the resulting model.

        :param x: Predictor column(s): a name, an index, or a list/set of names and indices.
        :param y: Response column name or index. Ignored (reset to None) for unsupervised algorithms
            and autoencoders; defaults to ``"response"`` otherwise.
        :param training_frame: Training frame. When None, ``self.training_frame`` is used if it is set.
        :param offset_column: Name or index of the offset column.
        :param fold_column: Name or index of the fold column.
        :param weights_column: Name or index of the weights column.
        :param validation_frame: Optional validation frame.
        :param max_runtime_secs: When not None, overrides ``max_runtime_secs`` in the parameters.
        :param ignored_columns: Columns to ignore; cannot be combined with ``x``.
        :param model_id: When not None, overrides ``model_id`` in the parameters.
        :param verbose: Print the scoring history while polling; only allowed for drf, gbm,
            deeplearning and xgboost.
        :param extend_parms_fn: Optional function called with the parameter dict just before it is
            sent to the backend.
        :returns: None. When ``self._future`` is set the job is stored on ``self._job`` and the method
            returns without waiting; otherwise the job is polled and the model is resolved.
        :raises H2OValueError: if verbose is requested for an unsupported algorithm, if a referenced
            column does not exist in the training frame, if ``x`` and ``ignored_columns`` are both
            given, or if ``stopping_metric`` contains ``"r2"``.
        :raises ValueError: if ``y`` is set for an autoencoder or missing for a supervised model.
        """
        has_default_training_frame = hasattr(self, 'training_frame') and self.training_frame is not None
        # An explicit training frame is only mandatory when the estimator requires one and no default is set.
        training_frame = H2OFrame._validate(training_frame, 'training_frame',
                                            required=self._requires_training_frame() and not has_default_training_frame)
        validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        assert_is_type(verbose, bool)
        assert_is_type(extend_parms_fn, None, FunctionType)

        # True when the caller passed a frame explicitly; the column bookkeeping below only runs in that case.
        override_default_training_frame = training_frame is not None
        if not override_default_training_frame:
            self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
            training_frame = self.training_frame if has_default_training_frame else None

        algo = self.algo
        if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
            raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
        # Work on a copy so the estimator's stored parameters are not modified by the assignments below.
        parms = self._parms.copy()
        if algo=="pca" and "k" not in parms.keys():
            parms["k"] = 1
        if "__class__" in parms:  # FIXME: hack for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest", "generic"})

        names = training_frame.names if training_frame is not None else []
        ncols = training_frame.ncols if training_frame is not None else 0
        types = training_frame.types if training_frame is not None else {}

        if is_supervised:
            if y is None: y = "response"
            if is_type(y, int):
                # Integer responses are positions; negative values count from the end.
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            # An "enum" response column makes this a classifier; any other type a regressor.
            self._estimator_type = "classifier" if types[y] == "enum" else "regressor"
        else:
            # If `y` is provided for an unsupervised model we'll simply ignore
            # it. This way an unsupervised model can be used as a step in
            # sklearn's pipeline.
            y = None

        if override_default_training_frame:
            assert_is_type(y, str, None)
            ignored_columns_set = set()
            # Fall back to ignored_columns stored in the estimator parameters when none were passed in.
            if ignored_columns is None and "ignored_columns" in parms:
                ignored_columns = parms['ignored_columns']
            if ignored_columns is not None:
                if x is not None:
                    raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
                # Resolve every ignored column (name or index) to a column name.
                for ic in ignored_columns:
                    if is_type(ic, int):
                        if not (-ncols <= ic < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % ic)
                        ignored_columns_set.add(names[ic])
                    else:
                        if ic not in names:
                            raise H2OValueError("Column %s not in the training frame" % ic)
                        ignored_columns_set.add(ic)
            if x is None:
                # No predictors given: use every column except the response and the ignored ones.
                xset = set(names) - {y} - ignored_columns_set
            else:
                xset = set()
                if is_type(x, int, str): x = [x]
                for xi in x:
                    if is_type(xi, int):
                        if not (-ncols <= xi < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % xi)
                        xset.add(names[xi])
                    else:
                        if xi not in names:
                            raise H2OValueError("Column %s not in the training frame" % xi)
                        xset.add(xi)
            x = list(xset)
            self._check_and_save_parm(parms, "offset_column", offset_column)
            self._check_and_save_parm(parms, "weights_column", weights_column)
            self._check_and_save_parm(parms, "fold_column", fold_column)

        if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs

        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        # Step 2
        is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
        is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest"}
        if is_auto_encoder and y is not None:
            raise ValueError("y should not be specified for autoencoder.")
        if not is_unsupervised and y is None and self.algo not in ["generic"]:
            raise ValueError("Missing response")

        # Step 3
        if override_default_training_frame:
            parms["training_frame"] = training_frame
            # Read back the special columns; presumably populated by _check_and_save_parm above -- TODO confirm.
            offset = parms["offset_column"]
            folds = parms["fold_column"]
            weights = parms["weights_column"]

        if validation_frame is not None:
            parms["validation_frame"] = validation_frame

        if is_type(y, int):
            y = names[y]
        if y is not None:
            parms["response_column"] = y
        if not isinstance(x, (list, tuple)):
            x = [x]
        # Guard against an empty predictor list (e.g. a frame holding only the response column);
        # indexing x[0] unconditionally raised IndexError in that case.
        if len(x) > 0 and is_type(x[0], int):
            x = [names[i] for i in x]
        if override_default_training_frame:
            # Every frame column that is not a predictor, the response, a special column or an
            # additionally used column is ignored.
            ignored_columns = list(set(names) - set(x + [y, offset, folds, weights] + self._additional_used_columns(parms)))
            parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                                 else [quoted(col) for col in parms["interactions"]])
        parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None
                                      else [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

        # internal hook allowing subclasses to extend train parms
        if extend_parms_fn is not None:
            extend_parms_fn(parms)

        parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
        # `or []` keeps the membership test valid when stopping_metric is present but None;
        # the bare `in` test raised TypeError for that value.
        if "r2" in (parms.get("stopping_metric") or []):
            raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
        rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

        model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
        model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

        if self._future:
            # Asynchronous mode: keep the job handle and return without waiting for it.
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll(poll_updates=self._print_model_scoring_history if verbose else None)
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
コード例 #19
0
    def train(self,
              x=None,
              y=None,
              training_frame=None,
              fold_column=None,
              weights_column=None,
              validation_frame=None,
              leaderboard_frame=None,
              blending_frame=None):
        """
        Begins an AutoML task, a background task that automatically builds a number of models
        with various algorithms and tracks their performance in a leaderboard. At any point
        in the process you may use H2O's performance or prediction functions on the resulting
        models.

        :param x: A column name or index, or a list of column names or indices, indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param fold_column: The name or index of the column in training_frame that holds per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds per-row weights.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold_column or weights_column).
        :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets
            nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used
            for early stopping of individual models and early stopping of the grid searches.  By default and
            when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.
        :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard.  This is optional and
            if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard
            rankings instead.
        :param blending_frame: H2OFrame used to train the metalearning algorithm in Stacked Ensembles (instead of relying on cross-validated predicted values).
            This is optional, but when provided, it is also recommended to disable cross validation
            by setting `nfolds=0` and to provide a leaderboard frame for scoring purposes.

        :returns: None. The job is polled until it finishes and ``self._fetch()`` is then called.
        :raises H2OValueError: if ``y`` is None, or if ``y`` or an entry of ``x`` does not refer to a
            column of ``training_frame``.
        :raises H2OResponseError: if the backend response contains no ``job`` entry.

        :examples:
        >>> # Set up an H2OAutoML object
        >>> aml = H2OAutoML(max_runtime_secs=30)
        >>> # Launch an AutoML run
        >>> aml.train(y=y, training_frame=train)
        """
        # Minimal required arguments are training_frame and y (response)
        training_frame = H2OFrame._validate(training_frame,
                                            'training_frame',
                                            required=True)
        self.input_spec['training_frame'] = training_frame.frame_id

        ncols = training_frame.ncols
        names = training_frame.names

        if y is None:
            raise H2OValueError(
                'The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.'
            )
        assert_is_type(y, int, str)
        if is_type(y, int):
            # Integer responses are positions (negative values count from the end).
            if not (-ncols <= y < ncols):
                raise H2OValueError(
                    "Column %d does not exist in the training frame" % y)
            y = names[y]
        elif y not in names:
            raise H2OValueError(
                "Column %s does not exist in the training frame" % y)
        self.input_spec['response_column'] = y

        # Optional entries are always written, None included, so values left over from an
        # earlier call are overwritten; None entries are stripped again by clean_params below.
        assert_is_type(fold_column, None, int, str)
        self.input_spec['fold_column'] = fold_column

        assert_is_type(weights_column, None, int, str)
        self.input_spec['weights_column'] = weights_column

        validation_frame = H2OFrame._validate(validation_frame,
                                              'validation_frame')
        self.input_spec[
            'validation_frame'] = validation_frame.frame_id if validation_frame is not None else None

        leaderboard_frame = H2OFrame._validate(leaderboard_frame,
                                               'leaderboard_frame')
        self.input_spec[
            'leaderboard_frame'] = leaderboard_frame.frame_id if leaderboard_frame is not None else None

        blending_frame = H2OFrame._validate(blending_frame, 'blending_frame')
        self.input_spec[
            'blending_frame'] = blending_frame.frame_id if blending_frame is not None else None

        # NOTE(review): when x is None, an 'ignored_columns' entry left in self.input_spec by an
        # earlier call is not cleared -- confirm whether that is intended.
        if x is not None:
            # Wrap a single column before the list check; the original order made this
            # normalisation unreachable because the type assertion ran first.
            if is_type(x, int, str):
                x = [x]
            assert_is_type(x, list)
            xset = set()
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError(
                            "Column %d does not exist in the training frame" %
                            xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError(
                            "Column %s not in the training frame" % xi)
                    xset.add(xi)
            # Everything that is not a predictor is ignored, except the response, fold and
            # weights columns.
            # NOTE(review): fold_column/weights_column given as integer indices never match a
            # column name here, so they would stay in the ignored list -- confirm intended.
            ignored_columns = set(names) - xset
            for col in (y, fold_column, weights_column):
                if col is not None:
                    ignored_columns.discard(col)
            self.input_spec['ignored_columns'] = list(ignored_columns)

        def clean_params(params):
            # Recursively drop None-valued entries from (nested) dicts; other values pass through.
            return {
                k: clean_params(v)
                for k, v in params.items() if v is not None
            } if isinstance(params, dict) else params

        automl_build_params = clean_params(
            dict(
                build_control=self.build_control,
                build_models=self.build_models,
                input_spec=self.input_spec,
            ))

        resp = self._build_resp = h2o.api('POST /99/AutoMLBuilder',
                                          json=automl_build_params)
        if 'job' not in resp:
            raise H2OResponseError(
                "Backend failed to build the AutoML job: {}".format(resp))

        if not self.project_name:
            self.project_name = self.build_control['project_name'] = resp[
                'build_control']['project_name']

        self._job = H2OJob(resp['job'], "AutoML")
        poll_updates = ft.partial(self._poll_training_updates,
                                  verbosity=self._verbosity,
                                  state={})
        try:
            self._job.poll(poll_updates=poll_updates)
        finally:
            # Always emit one last update, even when polling was interrupted.
            poll_updates(self._job, 1)

        self._fetch()
コード例 #20
0
 def validation_frame(self, validation_frame):
     """Run ``validation_frame`` through ``H2OFrame._validate`` and store the result in ``_parms``."""
     validated = H2OFrame._validate(validation_frame, 'validation_frame')
     self._parms["validation_frame"] = validated
コード例 #21
0
 def calibration_frame(self, calibration_frame):
     """Run ``calibration_frame`` through ``H2OFrame._validate`` and store the result in ``_parms``."""
     validated = H2OFrame._validate(calibration_frame, 'calibration_frame')
     self._parms["calibration_frame"] = validated