def train(self, x=None, y=None, training_frame=None, blending_frame=None, **kwargs):
    """Train the stacked ensemble.

    Either a training frame (passed here or preset on the estimator) or a
    blending frame must be available; when only a blending frame is given it
    is validated as required and also substituted for the training frame.

    :param x: predictor columns, forwarded to the parent ``_train``.
    :param y: response column, forwarded to the parent ``_train``.
    :param training_frame: optional H2OFrame with training data.
    :param blending_frame: optional H2OFrame used to train the metalearner.
    """
    frame_missing = training_frame is None and self.training_frame is None
    blending_frame = H2OFrame._validate(blending_frame, 'blending_frame', required=frame_missing)
    if frame_missing:
        # used to bypass default checks in super class and backend and to guarantee default metrics
        training_frame = blending_frame

    def extend_parms(parms):
        # Forward the blending frame and keep the metalearner fold column out
        # of the ignored-columns list so the backend can still use it.
        if blending_frame is not None:
            parms['blending_frame'] = blending_frame
        if self.metalearner_fold_column is not None:
            parms['ignored_columns'].remove(quoted(self.metalearner_fold_column))

    # NOTE(review): super(self.__class__, self) recurses if this class is ever
    # subclassed — kept as-is to preserve behavior; confirm no subclasses exist.
    super(self.__class__, self)._train(x, y, training_frame, extend_parms_fn=extend_parms, **kwargs)
def train(self, x=None, y=None, training_frame=None, blending_frame=None, **kwargs):
    """Train the stacked ensemble, optionally with a blending frame for the metalearner.

    :param x: predictor columns, forwarded to the parent ``_train``.
    :param y: response column, forwarded to the parent ``_train``.
    :param training_frame: H2OFrame with training data.
    :param blending_frame: optional H2OFrame used to train the metalearner.
    """
    validated_blend = H2OFrame._validate(blending_frame, 'blending_frame', required=False)

    def extend_parms(parms):
        # Forward the blending frame and keep the metalearner fold column out
        # of the ignored-columns list so the backend can still use it.
        if validated_blend is not None:
            parms['blending_frame'] = validated_blend
        if self.metalearner_fold_column is not None:
            parms['ignored_columns'].remove(quoted(self.metalearner_fold_column))

    super(self.__class__, self)._train(x, y, training_frame, extend_parms_fn=extend_parms, **kwargs)
def train(self, x=None, y=None, training_frame=None, blending_frame=None, **kwargs):
    """Train the model, optionally passing a blending frame through to the backend.

    :param x: predictor columns, forwarded to the parent ``_train``.
    :param y: response column, forwarded to the parent ``_train``.
    :param training_frame: H2OFrame with training data.
    :param blending_frame: optional H2OFrame used for blending.
    """
    validated_blend = H2OFrame._validate(blending_frame, 'blending_frame', required=False)

    def extend_parms(parms):
        # Only send the blending frame when one was actually supplied.
        if validated_blend is not None:
            parms['blending_frame'] = validated_blend

    super(self.__class__, self)._train(x, y, training_frame, extend_parms_fn=extend_parms, **kwargs)
def train(self, x=None, y=None, training_frame=None, blending_frame=None, verbose=False, **kwargs):
    """Train the stacked ensemble and verify the metalearner was actually built.

    Either a training frame (passed here or preset on the estimator) or a
    blending frame must be available; when only a blending frame is given it
    is validated as required and also substituted for the training frame.

    :param x: predictor columns, forwarded to the parent.
    :param y: response column, forwarded to the parent.
    :param training_frame: optional H2OFrame with training data.
    :param blending_frame: optional H2OFrame used to train the metalearner.
    :param verbose: print scoring history during training.
    :raises H2OResponseError: if the metalearner did not get trained.
    """
    frame_missing = training_frame is None and self.training_frame is None
    blending_frame = H2OFrame._validate(blending_frame, 'blending_frame', required=frame_missing)
    if frame_missing:
        # used to bypass default checks in super class and backend and to guarantee default metrics
        training_frame = blending_frame

    def extend_parms(parms):
        # Forward the blending frame and keep the metalearner fold column out
        # of the ignored-columns list so the backend can still use it.
        if blending_frame is not None:
            parms['blending_frame'] = blending_frame
        if self.metalearner_fold_column is not None:
            parms['ignored_columns'].remove(quoted(self.metalearner_fold_column))

    parent = super(self.__class__, self)
    train_parms = parent._make_parms(x, y, training_frame, extend_parms_fn=extend_parms, **kwargs)
    parent._train(train_parms, verbose=verbose)
    if self.metalearner() is None:
        raise H2OResponseError(
            "Meta learner didn't get to be trained in time. "
            "Try increasing max_runtime_secs or setting it to 0 (unlimited)."
        )
def blending_frame(self, blending_frame):
    """Setter for ``blending_frame``; the value is validated as an H2OFrame."""
    validated = H2OFrame._validate(blending_frame, 'blending_frame')
    self._parms["blending_frame"] = validated
def user_points(self, user_points):
    """Setter for ``user_points``; the value is validated as an H2OFrame."""
    validated = H2OFrame._validate(user_points, 'user_points')
    self._parms["user_points"] = validated
def beta_constraints(self, beta_constraints):
    """Setter for ``beta_constraints``; the value is validated as an H2OFrame."""
    validated = H2OFrame._validate(beta_constraints, 'beta_constraints')
    self._parms["beta_constraints"] = validated
def plug_values(self, plug_values):
    """Setter for ``plug_values``; the value is validated as an H2OFrame."""
    validated = H2OFrame._validate(plug_values, 'plug_values')
    self._parms["plug_values"] = validated
def model_key(self, model_key):
    """Setter for ``model_key``; the value is validated as an H2OFrame.

    NOTE(review): validating a "key" as a frame looks odd but mirrors the
    other frame-backed parameters here — confirm ``model_key`` is indeed a
    frame-backed parameter for this estimator.
    """
    validated = H2OFrame._validate(model_key, 'model_key')
    self._parms["model_key"] = validated
def train(self, x=None, y=None, training_frame=None, fold_column=None,
          weights_column=None, validation_frame=None, leaderboard_frame=None, blending_frame=None):
    """
    Begins an AutoML task, a background task that automatically builds a number of models
    with various algorithms and tracks their performance in a leaderboard. At any point
    in the process you may use H2O's performance or prediction functions on the resulting
    models.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param fold_column: The name or index of the column in training_frame that holds per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds per-row weights.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold_column or weights_column).
    :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets
        nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used
        for early stopping of individual models and early stopping of the grid searches. By default and
        when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.
    :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard. This is optional and
        if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard
        rankings instead.
    :param blending_frame: H2OFrame used to train the the metalearning algorithm in Stacked Ensembles (instead of relying on cross-validated predicted values).
        This is optional, but when provided, it is also recommended to disable cross validation
        by setting `nfolds=0` and to provide a leaderboard frame for scoring purposes.

    :returns: An H2OAutoML object.

    :examples:
    >>> # Set up an H2OAutoML object
    >>> aml = H2OAutoML(max_runtime_secs=30)
    >>> # Launch an AutoML run
    >>> aml.train(y=y, training_frame=train)
    """
    # A training frame is always mandatory for AutoML.
    training_frame = H2OFrame._validate(training_frame, 'training_frame', required=True)
    ncols = training_frame.ncols
    names = training_frame.names

    # Minimal required arguments are training_frame and y (response)
    if y is None:
        raise H2OValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')
    else:
        assert_is_type(y,int,str)
        if is_type(y, int):
            # Negative indices count from the end, as with Python sequences.
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)

    # input_spec is the payload section describing the data for the backend.
    input_spec = {
        'response_column': y,
    }
    input_spec['training_frame'] = training_frame.frame_id

    if fold_column is not None:
        assert_is_type(fold_column,int,str)
        input_spec['fold_column'] = fold_column

    if weights_column is not None:
        assert_is_type(weights_column,int,str)
        input_spec['weights_column'] = weights_column

    if validation_frame is not None:
        validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
        input_spec['validation_frame'] = validation_frame.frame_id

    if leaderboard_frame is not None:
        leaderboard_frame = H2OFrame._validate(leaderboard_frame, 'leaderboard_frame')
        input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

    if blending_frame is not None:
        blending_frame = H2OFrame._validate(blending_frame, 'blending_frame')
        input_spec['blending_frame'] = blending_frame.frame_id

    if self.sort_metric is not None:
        assert_is_type(self.sort_metric, str)
        sort_metric = self.sort_metric.lower()
        # Changed the API to use "deviance" to be consistent with stopping_metric values
        # TO DO: let's change the backend to use "deviance" since we use the term "deviance"
        # After that we can take this `if` statement out
        if sort_metric == "deviance":
            sort_metric = "mean_residual_deviance"
        input_spec['sort_metric'] = sort_metric

    if x is not None:
        assert_is_type(x,list)
        xset = set()
        # NOTE: this branch is unreachable — x was just asserted to be a list.
        if is_type(x, int, str): x = [x]
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)
        x = list(xset)
        # Everything not in x is ignored, except the response and the
        # special fold/weights columns.
        ignored_columns = set(names) - {y} - set(x)
        if fold_column is not None and fold_column in ignored_columns:
            ignored_columns.remove(fold_column)
        if weights_column is not None and weights_column in ignored_columns:
            ignored_columns.remove(weights_column)
        # ignored_columns is a set here, so this check is always true; kept as-is.
        if ignored_columns is not None:
            input_spec['ignored_columns'] = list(ignored_columns)

    automl_build_params = dict(input_spec=input_spec)

    # NOTE: if the user hasn't specified some block of parameters don't send them!
    # This lets the back end use the defaults.
    automl_build_params['build_control'] = self.build_control
    automl_build_params['build_models'] = self.build_models

    resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
    if 'job' not in resp:
        # Backend rejected the build; report and bail out without raising.
        print("Exception from the back end: ")
        print(resp)
        return

    # Adopt the backend-generated project name if the user didn't set one.
    if not self.project_name:
        self.build_control['project_name'] = self.project_name = resp['build_control']['project_name']

    self._job = H2OJob(resp['job'], "AutoML")
    poll_updates = ft.partial(self._poll_training_updates, verbosity=self._verbosity, state={})
    try:
        self._job.poll(poll_updates=poll_updates)
    finally:
        # Flush the final training update even if polling was interrupted.
        poll_updates(self._job, 1)
    self._fetch()
def pre_trained(self, pre_trained):
    """Setter for ``pre_trained``; the value is validated as an H2OFrame."""
    validated = H2OFrame._validate(pre_trained, 'pre_trained')
    self._parms["pre_trained"] = validated
def training_frame(self, training_frame):
    """Setter for ``training_frame``; the value is validated as an H2OFrame."""
    validated = H2OFrame._validate(training_frame, 'training_frame')
    self._parms["training_frame"] = validated
def __validate_frame(self, fr, name=None, required=False):
    """Delegate frame validation to :meth:`H2OFrame._validate` and return the result."""
    validated = H2OFrame._validate(fr, name, required=required)
    return validated
def user_x(self, user_x):
    """Setter for ``user_x``; the value is validated as an H2OFrame."""
    validated = H2OFrame._validate(user_x, 'user_x')
    self._parms["user_x"] = validated
def user_y(self, user_y):
    """Setter for ``user_y``; the value is validated as an H2OFrame."""
    validated = H2OFrame._validate(user_y, 'user_y')
    self._parms["user_y"] = validated
def pre_trained(self, pre_trained):
    """Setter for ``pre_trained``; also refreshes ``vec_size`` from the validated frame."""
    validated = H2OFrame._validate(pre_trained, 'pre_trained')
    self._parms["pre_trained"] = validated
    if validated is not None:
        # Keep vec_size consistent with the supplied pre-trained embeddings frame.
        self.vec_size = H2OWord2vecEstimator._determine_vec_size(validated)
def _make_parms(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
                weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
                model_id=None, verbose=False, extend_parms_fn=None):
    """
    Assemble and return the validated parameter dict to be sent to the backend model builder.

    Resolves the predictor set (x) vs. ignored columns, the response column (y), and
    the special offset/fold/weights columns against the training frame, then keyifies
    frame-valued parameters. Raises H2OValueError on invalid columns or unsupported
    options. The returned dict is consumed by ``_train``.
    """
    # A frame preset on the estimator counts as having a training frame.
    has_default_training_frame = hasattr(self, 'training_frame') and self.training_frame is not None
    training_frame = H2OFrame._validate(training_frame, 'training_frame',
                                        required=self._options_.get('requires_training_frame', True) and not has_default_training_frame)
    validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)
    assert_is_type(extend_parms_fn, None, FunctionType)

    override_default_training_frame = training_frame is not None
    if not override_default_training_frame:
        # No frame passed in: fall back to the estimator's preset frame (if any)
        # and make sure the column params were set as estimator properties instead.
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
        training_frame = self.training_frame if has_default_training_frame else None

    if verbose and not self._options_.get('verbose', False):
        raise H2OValueError("Verbose mode is not available for %s" % self.__class__.__name__)
    parms = self._parms.copy()
    # Column metadata defaults to empty when no training frame is available.
    names = training_frame.names if training_frame is not None else []
    ncols = training_frame.ncols if training_frame is not None else 0
    types = training_frame.types if training_frame is not None else {}

    if self.supervised_learning:
        if y is None: y = "response"
        if is_type(y, int):
            # Negative indices count from the end, as with Python sequences.
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if types[y] == "enum" else "regressor"
    else:
        # If `y` is provided for an unsupervised model we'll simply ignore
        # it. This way an unsupervised model can be used as a step in
        # sklearn's pipeline.
        y = None
        self._estimator_type = "unsupervised"

    if override_default_training_frame:
        assert_is_type(y, str, None)
        ignored_columns_set = set()
        if ignored_columns is None and "ignored_columns" in parms:
            ignored_columns = parms['ignored_columns']
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s not in the training frame" % ic)
                    ignored_columns_set.add(ic)
        if x is None:
            # Default predictors: every column except the response and ignored ones.
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)

    self._check_and_save_parm(parms, "offset_column", offset_column)
    self._check_and_save_parm(parms, "weights_column", weights_column)
    self._check_and_save_parm(parms, "fold_column", fold_column)

    if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs

    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    if override_default_training_frame:
        parms["training_frame"] = training_frame
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]

    if validation_frame is not None: parms["validation_frame"] = validation_frame
    if is_type(y, int): y = names[y]
    if y is not None: parms["response_column"] = y
    if not isinstance(x, (list, tuple)): x = [x]
    # Translate index-based predictors to names (guarded against empty x).
    if len(x) > 0 and is_type(x[0], int):
        x = [names[i] for i in x]
    if override_default_training_frame:
        # Backend takes ignored columns, not predictors: everything that is not a
        # predictor, the response, or one of the special columns gets ignored.
        ignored_columns = list(set(names) - set(x + [y, offset, folds, weights]))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
    parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                             else [quoted(col) for col in parms["interactions"]])
    parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None
                                  else [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

    # internal hook allowing subclasses to extend train parms
    if extend_parms_fn is not None:
        extend_parms_fn(parms)

    # Replace frame-valued parameters by their backend keys.
    parms = {k: H2OEstimator._keyify(v) for k, v in parms.items()}
    if "r2" in (parms.get('stopping_metric') or []):
        raise H2OValueError("r2 cannot be used as an early stopping_metric yet. Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
    return parms
def _train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
           weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
           model_id=None, verbose=False, extend_parms_fn=None):
    """
    Validate inputs, assemble the backend parameter dict, and launch the model build job.

    Resolves predictors (x) vs. ignored columns and the response (y) against the training
    frame, POSTs to the ModelBuilders endpoint, and either stores the job (when ``_future``
    is set) or polls it to completion and resolves the resulting model.

    :raises H2OValueError: on invalid columns, bad verbose usage, or unsupported stopping metric.
    :raises ValueError: when y is given for an autoencoder, or missing for a supervised algo.
    """
    # A frame preset on the estimator counts as having a training frame.
    has_default_training_frame = hasattr(self, 'training_frame') and self.training_frame is not None
    training_frame = H2OFrame._validate(training_frame, 'training_frame',
                                        required=self._requires_training_frame() and not has_default_training_frame)
    validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)
    assert_is_type(extend_parms_fn, None, FunctionType)

    override_default_training_frame = training_frame is not None
    if not override_default_training_frame:
        # No frame passed in: fall back to the estimator's preset frame (if any)
        # and make sure the column params were set as estimator properties instead.
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
        training_frame = self.training_frame if has_default_training_frame else None

    algo = self.algo
    if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
        raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
    parms = self._parms.copy()
    if algo=="pca" and "k" not in parms.keys():
        parms["k"] = 1
    if "__class__" in parms:  # FIXME: hackt for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest", "generic"})
    # Column metadata defaults to empty when no training frame is available.
    names = training_frame.names if training_frame is not None else []
    ncols = training_frame.ncols if training_frame is not None else 0
    types = training_frame.types if training_frame is not None else {}

    if is_supervised:
        if y is None: y = "response"
        if is_type(y, int):
            # Negative indices count from the end, as with Python sequences.
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if types[y] == "enum" else "regressor"
    else:
        # If `y` is provided for an unsupervised model we'll simply ignore
        # it. This way an unsupervised model can be used as a step in
        # sklearn's pipeline.
        y = None

    if override_default_training_frame:
        assert_is_type(y, str, None)
        ignored_columns_set = set()
        if ignored_columns is None and "ignored_columns" in parms:
            ignored_columns = parms['ignored_columns']
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s not in the training frame" % ic)
                    ignored_columns_set.add(ic)
        if x is None:
            # Default predictors: every column except the response and ignored ones.
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)

    self._check_and_save_parm(parms, "offset_column", offset_column)
    self._check_and_save_parm(parms, "weights_column", weights_column)
    self._check_and_save_parm(parms, "fold_column", fold_column)

    if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs

    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    # Step 2
    is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
    is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest"}
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if not is_unsupervised and y is None and self.algo not in ["generic"]:
        raise ValueError("Missing response")

    # Step 3
    if override_default_training_frame:
        parms["training_frame"] = training_frame
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]

    if validation_frame is not None: parms["validation_frame"] = validation_frame
    if is_type(y, int): y = names[y]
    if y is not None: parms["response_column"] = y
    if not isinstance(x, (list, tuple)): x = [x]
    # BUGFIX: guard against an empty predictor list before peeking at x[0]
    # (previously raised IndexError; mirrors the guard used in _make_parms).
    if len(x) > 0 and is_type(x[0], int):
        x = [names[i] for i in x]
    if override_default_training_frame:
        # Backend takes ignored columns, not predictors: everything that is not a
        # predictor, the response, a special column, or an algo-specific extra column.
        ignored_columns = list(set(names) - set(x + [y, offset, folds, weights] + self._additional_used_columns(parms)))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
    parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                             else [quoted(col) for col in parms["interactions"]])
    parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None
                                  else [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

    # internal hook allowing subclasses to extend train parms
    if extend_parms_fn is not None:
        extend_parms_fn(parms)

    # Replace frame-valued parameters by their backend keys.
    parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
    if ("stopping_metric" in parms.keys()) and ("r2" in parms["stopping_metric"]):
        raise H2OValueError("r2 cannot be used as an early stopping_metric yet. Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
    rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

    model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
    model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

    if self._future:
        # Asynchronous mode: stash the job and return without polling.
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll(poll_updates=self._print_model_scoring_history if verbose else None)
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def train(self, x=None, y=None, training_frame=None, fold_column=None,
          weights_column=None, validation_frame=None, leaderboard_frame=None, blending_frame=None):
    """
    Begins an AutoML task, a background task that automatically builds a number of models
    with various algorithms and tracks their performance in a leaderboard. At any point
    in the process you may use H2O's performance or prediction functions on the resulting
    models.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param fold_column: The name or index of the column in training_frame that holds per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds per-row weights.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold_column or weights_column).
    :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets
        nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used
        for early stopping of individual models and early stopping of the grid searches. By default and
        when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.
    :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard. This is optional and
        if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard
        rankings instead.
    :param blending_frame: H2OFrame used to train the the metalearning algorithm in Stacked Ensembles (instead of relying on cross-validated predicted values).
        This is optional, but when provided, it is also recommended to disable cross validation
        by setting `nfolds=0` and to provide a leaderboard frame for scoring purposes.

    :returns: An H2OAutoML object.

    :examples:
    >>> # Set up an H2OAutoML object
    >>> aml = H2OAutoML(max_runtime_secs=30)
    >>> # Launch an AutoML run
    >>> aml.train(y=y, training_frame=train)
    """
    # Minimal required arguments are training_frame and y (response)
    training_frame = H2OFrame._validate(training_frame, 'training_frame', required=True)
    self.input_spec['training_frame'] = training_frame.frame_id
    ncols = training_frame.ncols
    names = training_frame.names

    if y is None:
        raise H2OValueError(
            'The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.'
        )
    else:
        assert_is_type(y, int, str)
        if is_type(y, int):
            # Negative indices count from the end, as with Python sequences.
            if not (-ncols <= y < ncols):
                raise H2OValueError(
                    "Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError(
                    "Column %s does not exist in the training frame" % y)
    self.input_spec['response_column'] = y

    assert_is_type(fold_column, None, int, str)
    self.input_spec['fold_column'] = fold_column
    assert_is_type(weights_column, None, int, str)
    self.input_spec['weights_column'] = weights_column

    # Optional frames: validated even when None; frame ids are sent, None otherwise.
    validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
    self.input_spec['validation_frame'] = validation_frame.frame_id if validation_frame is not None else None
    leaderboard_frame = H2OFrame._validate(leaderboard_frame, 'leaderboard_frame')
    self.input_spec['leaderboard_frame'] = leaderboard_frame.frame_id if leaderboard_frame is not None else None
    blending_frame = H2OFrame._validate(blending_frame, 'blending_frame')
    self.input_spec['blending_frame'] = blending_frame.frame_id if blending_frame is not None else None

    if x is not None:
        assert_is_type(x, list)
        xset = set()
        # NOTE: this branch is unreachable — x was just asserted to be a list.
        if is_type(x, int, str): x = [x]
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError(
                        "Column %d does not exist in the training frame" % xi)
                xset.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError(
                        "Column %s not in the training frame" % xi)
                xset.add(xi)
        # Everything not in x is ignored, except the response and the
        # special fold/weights columns.
        ignored_columns = set(names) - xset
        for col in [y, fold_column, weights_column]:
            if col is not None and col in ignored_columns:
                ignored_columns.remove(col)
        # ignored_columns is a set here, so this check is always true; kept as-is.
        if ignored_columns is not None:
            self.input_spec['ignored_columns'] = list(ignored_columns)

    def clean_params(params):
        # Recursively drop None-valued entries so the backend applies its defaults.
        return {
            k: clean_params(v)
            for k, v in params.items() if v is not None
        } if isinstance(params, dict) else params

    automl_build_params = clean_params(
        dict(
            build_control=self.build_control,
            build_models=self.build_models,
            input_spec=self.input_spec,
        ))

    resp = self._build_resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
    if 'job' not in resp:
        raise H2OResponseError(
            "Backend failed to build the AutoML job: {}".format(resp))

    # Adopt the backend-generated project name if the user didn't set one.
    if not self.project_name:
        self.project_name = self.build_control['project_name'] = resp[
            'build_control']['project_name']

    self._job = H2OJob(resp['job'], "AutoML")
    poll_updates = ft.partial(self._poll_training_updates, verbosity=self._verbosity, state={})
    try:
        self._job.poll(poll_updates=poll_updates)
    finally:
        # Flush the final training update even if polling was interrupted.
        poll_updates(self._job, 1)
    self._fetch()
def validation_frame(self, validation_frame):
    """Setter for ``validation_frame``; the value is validated as an H2OFrame."""
    validated = H2OFrame._validate(validation_frame, 'validation_frame')
    self._parms["validation_frame"] = validated
def calibration_frame(self, calibration_frame):
    """Setter for ``calibration_frame``; the value is validated as an H2OFrame."""
    validated = H2OFrame._validate(calibration_frame, 'calibration_frame')
    self._parms["calibration_frame"] = validated