def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Submit a model build for this estimator and, unless in future mode, wait for it.

    :param x: predictor column(s): one name/index, or a list/tuple of names or indices.
    :param y: response column name or index; may be None.
    :param tframe: training frame.
    :param vframe: validation frame, or None.
    :param kwargs: builder parameters; must contain the keys "offset_column",
        "fold_column" and "weights_column".
    """
    kwargs['training_frame'] = tframe
    if vframe is not None:
        kwargs["validation_frame"] = vframe

    # Positional references are translated into column names.
    if isinstance(y, int):
        y = tframe.names[y]
    if y is not None:
        kwargs['response_column'] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    if isinstance(x[0], int):
        x = [tframe.names[idx] for idx in x]

    # Every frame column that is neither a predictor nor a special column is ignored.
    special = [y, kwargs["offset_column"], kwargs["fold_column"], kwargs["weights_column"]]
    unused = list(set(tframe.names) - set(x + special))
    kwargs["ignored_columns"] = [h2o.h2o._quoted(name) for name in unused] if unused else None

    interactions = kwargs.get("interactions")
    if interactions is None:
        kwargs["interactions"] = None
    else:
        kwargs["interactions"] = [h2o.h2o._quoted(name) for name in interactions]

    kwargs = {key: H2OEstimator._keyify_if_H2OFrame(value) for key, value in kwargs.items()}
    algo = self._compute_algo()
    model = H2OJob(H2OConnection.post_json("ModelBuilders/" + algo, **kwargs),
                   job_type=(algo + " Model Build"))

    if self._future:
        # Asynchronous mode: keep the job handle and let the caller poll later.
        self._job = model
        return

    model.poll()
    # Forward the REST version to the model lookup only when one was supplied.
    rest_args = {"_rest_version": kwargs['_rest_version']} if '_rest_version' in kwargs else {}
    model_json = H2OConnection.get_json("Models/" + model.dest_key, **rest_args)["models"][0]
    self._resolve_model(model.dest_key, model_json)
def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Launch a grid-search build on the backend and collect the models it produced.

    :param x: predictor column(s): one name/index, or a list/tuple of names or indices.
    :param y: response column name or index; may be None.
    :param tframe: training frame.
    :param vframe: validation frame, or None.
    :param kwargs: grid parameters; must contain the keys "offset_column",
        "fold_column" and "weights_column".
    :raises ValueError: if the finished grid contains no model.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None:
        kwargs["validation_frame"] = vframe
    if is_type(y, int):
        y = tframe.names[y]
    if y is not None:
        kwargs['response_column'] = y
    if not is_type(x, list, tuple):
        x = [x]
    if is_type(x[0], int):
        x = [tframe.names[idx] for idx in x]

    special = [y, kwargs["offset_column"], kwargs["fold_column"], kwargs["weights_column"]]
    unused = list(set(tframe.names) - set(x + special))
    kwargs["ignored_columns"] = [quoted(name) for name in unused] if unused else None

    # Drop unset parameters and refer to frames by their id.
    payload = {}
    for key, value in kwargs.items():
        if value is None:
            continue
        payload[key] = value.frame_id if isinstance(value, H2OFrame) else value
    kwargs = payload

    algo = self.model._compute_algo()  # unique to grid search
    if self.grid_id is not None:
        kwargs["grid_id"] = self.grid_id
    rest_ver = kwargs.pop("_rest_version", None)

    grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))
    if self._future:
        # Asynchronous mode: keep the job handle and let the caller poll later.
        self._job = grid
        return
    grid.poll()

    grid_json = h2o.api("GET /99/Grids/%s" % (grid.dest_key))

    # Report every failure; the messages are kept so they can be raised
    # below if the grid ends up without any model.
    failures = grid_json["failure_details"]
    collected = []
    if len(failures) > 0:
        print("Errors/Warnings building gridsearch model\n")
        for pos, detail in enumerate(failures):
            failed = grid_json["failed_params"][pos]
            if isinstance(failed, dict):
                for h_name in grid_json['hyper_names']:
                    print("Hyper-parameter: {0}, {1}".format(h_name, failed[h_name]))
            if len(grid_json["failure_stack_traces"]) > pos:
                print("failure_details: {0}\nfailure_stack_traces: "
                      "{1}\n".format(detail, grid_json['failure_stack_traces'][pos]))
                collected.append(detail + '\n')
    failure_text = "".join(collected)

    self.models = [h2o.get_model(entry['name']) for entry in grid_json['model_ids']]

    # Sometimes no model is returned due to bad parameter values provided by the user.
    if not len(grid_json['model_ids']) > 0:
        if len(failure_text) > 0:
            raise ValueError(failure_text)
        raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")

    # The first model of the grid is fetched to learn the model class
    # (binomial, multinomial, etc).
    first_name = grid_json['model_ids'][0]['name']
    first_model_json = h2o.api("GET /%d/Models/%s" % (rest_ver or 3, first_name))['models'][0]
    self._resolve_grid(grid.dest_key, grid_json, first_model_json)
def deepfeatures(self, test_data, layer):
    """
    Return hidden layer details.

    :param test_data: Data to create a feature space on
    :param layer: 0 index hidden layer
    :returns: the frame fetched with ``h2o.get_frame`` from the finished job's destination key.
    :raises ValueError: if ``test_data`` is None.
    """
    if test_data is None:
        raise ValueError("Must specify test data")
    endpoint = "Predictions/models/" + self._id + "/frames/" + test_data.frame_id
    response = h2o.H2OConnection.post_json(endpoint, deep_features_hidden_layer=layer, _rest_version=4)
    job = H2OJob(response, "deepfeatures")
    job.poll()
    return h2o.get_frame(job.dest_key)
def deepfeatures(self, test_data, layer):
    """
    Return hidden layer details.

    :param test_data: Data to create a feature space on
    :param layer: 0 index hidden layer
    :returns: the frame fetched with ``h2o.get_frame`` from the finished job's destination key.
    :raises ValueError: if ``test_data`` is None.
    """
    if test_data is None:
        raise ValueError("Must specify test data")
    endpoint = "POST /4/Predictions/models/%s/frames/%s" % (self._id, test_data.frame_id)
    response = h2o.api(endpoint, data={"deep_features_hidden_layer": layer})
    job = H2OJob(response, "deepfeatures")
    job.poll()
    return h2o.get_frame(job.dest_key)
def predict(self, test_data):
    """
    Score a dataset with this model.

    :param H2OFrame test_data: Data on which to make predictions.
    :returns: A new H2OFrame of predictions.
    :raises ValueError: if ``test_data`` is not an H2OFrame.
    """
    if not isinstance(test_data, h2o.H2OFrame):
        raise ValueError("test_data must be an instance of H2OFrame")
    endpoint = "POST /4/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id)
    job = H2OJob(h2o.api(endpoint), self._model_json['algo'] + " prediction")
    job.poll()
    return h2o.get_frame(job.dest_key)
def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Launch a grid-search build through the version-99 REST endpoint and collect its models.

    :param x: predictor column(s): one name/index, or a list/tuple of names or indices.
    :param y: response column name or index; may be None.
    :param tframe: training frame.
    :param vframe: validation frame, or None.
    :param kwargs: grid parameters; must contain the keys "offset_column",
        "fold_column" and "weights_column".
    """
    kwargs['training_frame'] = tframe
    if vframe is not None:
        kwargs["validation_frame"] = vframe
    if isinstance(y, int):
        y = tframe.names[y]
    if y is not None:
        kwargs['response_column'] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    if isinstance(x[0], int):
        x = [tframe.names[idx] for idx in x]

    special = [y, kwargs["offset_column"], kwargs["fold_column"], kwargs["weights_column"]]
    unused = list(set(tframe.names) - set(x + special))
    kwargs["ignored_columns"] = [h2o.h2o._quoted(name) for name in unused] if unused else None

    # Drop unset parameters and refer to frames by their id.
    payload = {}
    for key, value in kwargs.items():
        if value is None:
            continue
        payload[key] = value.frame_id if isinstance(value, H2OFrame) else value
    kwargs = payload

    algo = self.model._compute_algo()  # unique to grid search
    kwargs["_rest_version"] = 99  # unique to grid search
    if self.grid_id is not None:
        kwargs["grid_id"] = self.grid_id

    grid = H2OJob(H2OConnection.post_json("Grid/" + algo, **kwargs), job_type=(algo + " Grid Build"))
    if self._future:
        # Asynchronous mode: keep the job handle and let the caller poll later.
        self._job = grid
        return
    grid.poll()

    grids_path = "Grids/" + grid.dest_key
    if '_rest_version' in kwargs:
        grid_json = H2OConnection.get_json(grids_path, _rest_version=kwargs['_rest_version'])
        for error_message in grid_json["failure_details"]:
            print(error_message)
    else:
        grid_json = H2OConnection.get_json(grids_path)

    self.models = [h2o.get_model(entry['name']) for entry in grid_json['model_ids']]

    # The first model of the grid is fetched to learn the model class
    # (binomial, multinomial, etc).
    first_name = grid_json['model_ids'][0]['name']
    first_model_json = H2OConnection.get_json("Models/" + first_name,
                                              _rest_version=kwargs['_rest_version'])['models'][0]
    self._resolve_grid(grid.dest_key, grid_json, first_model_json)
def _train(self, parms, verbose=False):
    """
    Post a model-build request and, unless in future mode, wait for it and load the model.

    :param parms: builder parameters sent as the request data.
    :param bool verbose: when True, ``self._print_model_scoring_history`` is passed to the
        job as its poll-update callback.
    """
    assert_is_type(verbose, bool)

    rest_ver = self._get_rest_version(parms)
    build_endpoint = "POST /%d/ModelBuilders/%s" % (rest_ver, self.algo)
    job = H2OJob(h2o.api(build_endpoint, data=parms), job_type=(self.algo + " Model Build"))

    if self._future:
        # Asynchronous mode: remember the job and REST version, then hand control back.
        self._job = job
        self._rest_version = rest_ver
        return

    on_update = self._print_model_scoring_history if verbose else None
    job.poll(poll_updates=on_update)
    models = h2o.api("GET /%d/Models/%s" % (rest_ver, job.dest_key))["models"]
    self._resolve_model(job.dest_key, models[0])
def predict(self, test_data):
    """
    Score a dataset with this model.

    Parameters
    ----------
    test_data: H2OFrame
      Data on which to make predictions.

    Returns
    -------
      A new H2OFrame of predictions.

    Raises
    ------
    ValueError
      If ``test_data`` is not an H2OFrame.
    """
    if not isinstance(test_data, h2o.H2OFrame):
        raise ValueError("test_data must be an instance of H2OFrame")
    endpoint = "Predictions/models/" + self.model_id + "/frames/" + test_data.frame_id
    response = h2o.H2OConnection.post_json(endpoint, _rest_version=4)
    job = H2OJob(response, self._model_json['algo'] + " prediction")
    job.poll()
    return h2o.get_frame(job.dest_key)
def _train(self, parms, verbose=False):
    """
    Post a model-build request, surface backend warnings, and load the finished model.

    Messages of type "WARN" in the builder response are re-emitted as ``RuntimeWarning``.
    In future mode the job is only stored on the instance and not waited for.

    :param parms: builder parameters sent as the request data.
    :param bool verbose: when True, ``self._print_model_scoring_history`` is passed to the
        job as its poll-update callback.
    """
    assert_is_type(verbose, bool)

    rest_ver = self._get_rest_version(parms)
    build_endpoint = "POST /%d/ModelBuilders/%s" % (rest_ver, self.algo)
    model_builder_json = h2o.api(build_endpoint, data=parms)
    job = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

    backend_messages = model_builder_json["messages"]
    if backend_messages is not None:
        for mesg in backend_messages:
            if mesg["message_type"] == "WARN":
                warnings.warn(mesg["message"], RuntimeWarning)

    if self._future:
        # Asynchronous mode: remember the job and REST version, then hand control back.
        self._job = job
        self._rest_version = rest_ver
        return

    on_update = self._print_model_scoring_history if verbose else None
    job.poll(poll_updates=on_update)
    models = h2o.api("GET /%d/Models/%s" % (rest_ver, job.dest_key))["models"]
    self._resolve_model(job.dest_key, models[0])
def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Submit a model build for this estimator and, unless in future mode, wait for it.

    :param x: predictor column(s): one name/index, or a list/tuple of names or indices.
    :param y: response column name or index; may be None.
    :param tframe: training frame.
    :param vframe: validation frame, or None.
    :param kwargs: builder parameters; must contain the keys "offset_column",
        "fold_column" and "weights_column". An optional "_rest_version" entry selects
        the REST API version (3 when absent).
    """
    kwargs["training_frame"] = tframe
    if vframe is not None:
        kwargs["validation_frame"] = vframe
    if is_type(y, int):
        y = tframe.names[y]
    if y is not None:
        kwargs["response_column"] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    if is_type(x[0], int):
        x = [tframe.names[idx] for idx in x]

    # Every frame column that is neither a predictor nor a special column is ignored.
    special = [y, kwargs["offset_column"], kwargs["fold_column"], kwargs["weights_column"]]
    unused = list(set(tframe.names) - set(x + special))
    kwargs["ignored_columns"] = [quoted(name) for name in unused] if unused else None

    interactions = kwargs.get("interactions")
    if interactions is None:
        kwargs["interactions"] = None
    else:
        kwargs["interactions"] = [quoted(name) for name in interactions]

    kwargs = {key: H2OEstimator._keyify_if_h2oframe(value) for key, value in kwargs.items()}
    rest_ver = kwargs.pop("_rest_version", 3)

    build_endpoint = "POST /%d/ModelBuilders/%s" % (rest_ver, self.algo)
    model = H2OJob(h2o.api(build_endpoint, data=kwargs), job_type=(self.algo + " Model Build"))

    if self._future:
        # Asynchronous mode: keep the job handle and let the caller poll later.
        self._job = model
        return

    model.poll()
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def store_h2o_frame(data, directory, filename, force=False, parts=1):
    """
    Export an H2OFrame to a file path built from ``directory`` and ``filename``.

    The directory is created if it does not exist yet.

    :param data: the Frame to save to disk.
    :param directory: the directory to the save point on disk.
    :param filename: the name to save the frame to.
    :param force: if True, overwrite any preexisting file with the same path
    :param parts: enables export to multiple 'part' files instead of just a single file.
        Convenient for large datasets that take too long to store in a single file.
        Use parts=-1 to instruct H2O to determine the optimal number of part files or
        specify your desired maximum number of part files. Path needs to be a directory
        when exporting to multiple files, also that directory must be empty.
        Default is ``parts = 1``, which is to export to a single file.
    :return string filepath: the path to which the file was stored.
    """
    if not os.path.isdir(directory):
        os.makedirs(directory)
    filepath = _make_local_path(os.path.join(directory, filename))

    # Imported at call time, exactly as in the original implementation.
    from h2o.job import H2OJob
    from h2o.utils.typechecks import assert_is_type
    from h2o.frame import H2OFrame
    from h2o import api

    assert_is_type(data, H2OFrame)
    assert_is_type(filepath, str)
    assert_is_type(force, bool)
    assert_is_type(parts, int)

    export_request = {"path": filepath, "num_parts": parts, "force": force}
    export_job = H2OJob(api("POST /3/Frames/%s/export" % (data.frame_id), data=export_request),
                        "Export File")
    export_job.poll()
    return filepath
def _train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None, model_id=None, verbose=False, extend_parms_fn=None):
    """
    Validate the training arguments, assemble the builder parameters and start a model build.

    :param x: predictor column(s): a name, an index, or a list/set of names and indices.
    :param y: response column name or index. Defaults to "response" for supervised
        algorithms; reset to None for unsupervised ones.
    :param training_frame: frame to train on. When omitted, ``self.training_frame`` is used
        if the instance has one.
    :param offset_column: offset column, passed to ``self._check_and_save_parm``.
    :param fold_column: fold column, passed to ``self._check_and_save_parm``.
    :param weights_column: weights column, passed to ``self._check_and_save_parm``.
    :param validation_frame: optional validation frame.
    :param max_runtime_secs: stored in the parameters only when not None.
    :param ignored_columns: columns to exclude; cannot be combined with ``x``.
    :param model_id: overrides the ``model_id`` parameter only when passed.
    :param bool verbose: only allowed for drf, gbm, deeplearning and xgboost; when True,
        ``self._print_model_scoring_history`` is used as the poll-update callback.
    :param extend_parms_fn: optional function called with the parameter dict before submission.
    :raises H2OValueError: for unknown or out-of-range columns, for ``x`` combined with
        ``ignored_columns``, for ``verbose`` on an unsupported algo, or for an "r2" stopping metric.
    :raises ValueError: if a response is missing for a supervised algo, or given for an autoencoder.
    """
    has_default_training_frame = hasattr(self, 'training_frame') and self.training_frame is not None
    training_frame = H2OFrame._validate(training_frame, 'training_frame', required=self._requires_training_frame() and not has_default_training_frame)
    validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)
    assert_is_type(extend_parms_fn, None, FunctionType)

    # An explicitly passed frame takes precedence over the instance's own training frame.
    override_default_training_frame = training_frame is not None
    if not override_default_training_frame:
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
        training_frame = self.training_frame if has_default_training_frame else None

    algo = self.algo
    if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
        raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
    parms = self._parms.copy()
    # PCA gets k=1 when the caller did not set k.
    if algo=="pca" and "k" not in parms.keys():
        parms["k"] = 1
    if "__class__" in parms:  # FIXME: hackt for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest", "generic"})

    # Without any training frame these fall back to empty containers.
    names = training_frame.names if training_frame is not None else []
    ncols = training_frame.ncols if training_frame is not None else 0
    types = training_frame.types if training_frame is not None else {}

    if is_supervised:
        if y is None: y = "response"
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        # An "enum" response makes this a classifier; any other type a regressor.
        self._estimator_type = "classifier" if types[y] == "enum" else "regressor"
    else:
        # If `y` is provided for an unsupervised model we'll simply ignore
        # it. This way an unsupervised model can be used as a step in
        # sklearn's pipeline.
        y = None

    # Predictors and ignored columns are only resolved when a frame was passed explicitly.
    if override_default_training_frame:
        assert_is_type(y, str, None)
        ignored_columns_set = set()
        if ignored_columns is None and "ignored_columns" in parms:
            ignored_columns = parms['ignored_columns']
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s not in the training frame" % ic)
                    ignored_columns_set.add(ic)
        if x is None:
            # No explicit predictors: use every column except the response and the ignored ones.
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)

    self._check_and_save_parm(parms, "offset_column", offset_column)
    self._check_and_save_parm(parms, "weights_column", weights_column)
    self._check_and_save_parm(parms, "fold_column", fold_column)

    if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs

    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    # Step 2: check that a response is present exactly when the algo needs one.
    is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
    is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest"}
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if not is_unsupervised and y is None and self.algo not in ["generic"]:
        raise ValueError("Missing response")

    # Step 3: fill in the frame and column parameters.
    if override_default_training_frame:
        parms["training_frame"] = training_frame
    offset = parms["offset_column"]
    folds = parms["fold_column"]
    weights = parms["weights_column"]

    if validation_frame is not None: parms["validation_frame"] = validation_frame
    if is_type(y, int): y = names[y]
    if y is not None: parms["response_column"] = y
    if not isinstance(x, (list, tuple)): x = [x]
    if is_type(x[0], int):
        x = [names[i] for i in x]
    if override_default_training_frame:
        # Ignore every column that is not a predictor, a special column, or otherwise
        # reported as used by self._additional_used_columns.
        ignored_columns = list(set(names) - set(x + [y, offset, folds, weights] + self._additional_used_columns(parms)))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
    parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else [quoted(col) for col in parms["interactions"]])
    parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None else [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

    # internal hook allowing subclasses to extend train parms
    if extend_parms_fn is not None:
        extend_parms_fn(parms)

    parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
    if ("stopping_metric" in parms.keys()) and ("r2" in parms["stopping_metric"]):
        raise H2OValueError("r2 cannot be used as an early stopping_metric yet. Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
    # "_rest_version" is not sent to the builder; it selects the API version (3 when absent).
    rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

    model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
    model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

    if self._future:
        # Asynchronous mode: remember the job and REST version and return without waiting.
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll(poll_updates=self._print_model_scoring_history if verbose else None)
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Launch a grid-search build through the version-99 REST endpoint and collect its models.

    Failures reported by the grid are printed together with the hyper-parameters that
    caused them.

    :param x: predictor column(s): one name/index, or a list/tuple of names or indices.
    :param y: response column name or index; may be None.
    :param tframe: training frame.
    :param vframe: validation frame, or None.
    :param kwargs: grid parameters; must contain the keys "offset_column",
        "fold_column" and "weights_column".
    :raises ValueError: if the finished grid contains no model.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None:
        kwargs["validation_frame"] = vframe
    if isinstance(y, int):
        y = tframe.names[y]
    if y is not None:
        kwargs['response_column'] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    if isinstance(x[0], int):
        x = [tframe.names[idx] for idx in x]

    special = [y, kwargs["offset_column"], kwargs["fold_column"], kwargs["weights_column"]]
    unused = list(set(tframe.names) - set(x + special))
    kwargs["ignored_columns"] = [h2o.h2o._quoted(name) for name in unused] if unused else None

    # Drop unset parameters and refer to frames by their id.
    payload = {}
    for key, value in kwargs.items():
        if value is None:
            continue
        payload[key] = value.frame_id if isinstance(value, H2OFrame) else value
    kwargs = payload

    algo = self.model._compute_algo()  # unique to grid search
    kwargs["_rest_version"] = 99  # unique to grid search
    if self.grid_id is not None:
        kwargs["grid_id"] = self.grid_id

    grid = H2OJob(H2OConnection.post_json("Grid/" + algo, **kwargs), job_type=(algo + " Grid Build"))
    if self._future:
        # Asynchronous mode: keep the job handle and let the caller poll later.
        self._job = grid
        return
    grid.poll()

    grids_path = "Grids/" + grid.dest_key
    if '_rest_version' in kwargs:
        grid_json = H2OConnection.get_json(grids_path, _rest_version=kwargs['_rest_version'])
        failures = grid_json["failure_details"]
        if len(failures) > 0:
            print("Errors/Warnings building gridsearch model\n")
            for pos, detail in enumerate(failures):
                failed = grid_json["failed_params"][pos]
                if isinstance(failed, dict):
                    for h_name in grid_json['hyper_names']:
                        print("Hyper-parameter: {0}, {1}".format(h_name, failed[h_name]))
                print("failure_details: {0}\nfailure_stack_traces: "
                      "{1}\n".format(detail, grid_json['failure_stack_traces'][pos]))
    else:
        grid_json = H2OConnection.get_json(grids_path)

    self.models = [h2o.get_model(entry['name']) for entry in grid_json['model_ids']]

    # Sometimes no model is returned due to bad parameter values provided by the user.
    if not len(grid_json['model_ids']) > 0:
        raise ValueError(
            "Gridsearch returns no model due to bad parameter values or other reasons...."
        )

    # The first model of the grid is fetched to learn the model class
    # (binomial, multinomial, etc).
    first_name = grid_json['model_ids'][0]['name']
    first_model_json = H2OConnection.get_json("Models/" + first_name,
                                              _rest_version=kwargs['_rest_version'])['models'][0]
    self._resolve_grid(grid.dest_key, grid_json, first_model_json)
def train(self, x=None, y=None, training_frame=None, fold_column=None, weights_column=None, validation_frame=None, leaderboard_frame=None, blending_frame=None):
    """
    Begins an AutoML task, a background task that automatically builds a number of models
    with various algorithms and tracks their performance in a leaderboard. At any point
    in the process you may use H2O's performance or prediction functions on the resulting
    models.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param fold_column: The name or index of the column in training_frame that holds per-row
        fold assignments. An index is translated to the column name before it is sent.
    :param weights_column: The name or index of the column in training_frame that holds per-row
        weights. An index is translated to the column name before it is sent.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold_column or weights_column).
    :param validation_frame: H2OFrame with validation data. This argument is ignored unless the
        user sets nfolds = 0. If cross-validation is turned off, then a validation frame can be
        specified and used for early stopping of individual models and early stopping of the
        grid searches. By default and when nfolds > 1, cross-validation metrics will be used
        for early stopping and thus validation_frame will be ignored.
    :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard. This is
        optional and if this is set to None (the default), then cross-validation metrics will
        be used to generate the leaderboard rankings instead.
    :param blending_frame: H2OFrame used to train the metalearning algorithm in Stacked
        Ensembles (instead of relying on cross-validated predicted values). This is optional,
        but when provided, it is also recommended to disable cross validation by setting
        `nfolds=0` and to provide a leaderboard frame for scoring purposes.

    :returns: None. The job is polled to completion and ``self._fetch()`` is called afterwards;
        if the backend response contains no job, the response is printed and nothing else is done.
    :raises H2OValueError: if y is missing, or if a column name or index does not exist in the
        training frame.

    :examples:
    >>> # Set up an H2OAutoML object
    >>> aml = H2OAutoML(max_runtime_secs=30)
    >>> # Launch an AutoML run
    >>> aml.train(y=y, training_frame=train)
    """
    training_frame = H2OFrame._validate(training_frame, 'training_frame', required=True)
    ncols = training_frame.ncols
    names = training_frame.names

    def index_to_name(col):
        # Translate an integer column index into its column name; names pass through untouched.
        if is_type(col, int):
            if not (-ncols <= col < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % col)
            return names[col]
        return col

    # Minimal required arguments are training_frame and y (response)
    if y is None:
        raise H2OValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')
    assert_is_type(y,int,str)
    y = index_to_name(y)
    if y not in names:
        raise H2OValueError("Column %s does not exist in the training frame" % y)

    input_spec = {
        'response_column': y,
    }
    input_spec['training_frame'] = training_frame.frame_id

    # Indices are resolved to names so that the backend receives a column name and
    # the column can be recognised when it is removed from ignored_columns below.
    if fold_column is not None:
        assert_is_type(fold_column,int,str)
        fold_column = index_to_name(fold_column)
        input_spec['fold_column'] = fold_column

    if weights_column is not None:
        assert_is_type(weights_column,int,str)
        weights_column = index_to_name(weights_column)
        input_spec['weights_column'] = weights_column

    if validation_frame is not None:
        validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
        input_spec['validation_frame'] = validation_frame.frame_id

    if leaderboard_frame is not None:
        leaderboard_frame = H2OFrame._validate(leaderboard_frame, 'leaderboard_frame')
        input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

    if blending_frame is not None:
        blending_frame = H2OFrame._validate(blending_frame, 'blending_frame')
        input_spec['blending_frame'] = blending_frame.frame_id

    if self.sort_metric is not None:
        assert_is_type(self.sort_metric, str)
        sort_metric = self.sort_metric.lower()
        # Changed the API to use "deviance" to be consistent with stopping_metric values
        # TO DO: let's change the backend to use "deviance" since we use the term "deviance"
        # After that we can take this `if` statement out
        if sort_metric == "deviance":
            sort_metric = "mean_residual_deviance"
        input_spec['sort_metric'] = sort_metric

    if x is not None:
        assert_is_type(x,list)
        xset = set()
        for xi in x:
            name = index_to_name(xi)
            if name not in names:
                raise H2OValueError("Column %s not in the training frame" % xi)
            xset.add(name)
        # Everything that is not a predictor, the response, the fold column or the
        # weights column is ignored by the backend.
        ignored_columns = set(names) - {y} - xset
        ignored_columns.discard(fold_column)
        ignored_columns.discard(weights_column)
        input_spec['ignored_columns'] = list(ignored_columns)

    automl_build_params = dict(input_spec=input_spec)

    # NOTE: if the user hasn't specified some block of parameters don't send them!
    # This lets the back end use the defaults.
    automl_build_params['build_control'] = self.build_control
    automl_build_params['build_models'] = self.build_models

    resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
    if 'job' not in resp:
        print("Exception from the back end: ")
        print(resp)
        return

    # Adopt the project name reported by the backend when none was set locally.
    if not self.project_name:
        self.build_control['project_name'] = self.project_name = resp['build_control']['project_name']

    self._job = H2OJob(resp['job'], "AutoML")
    poll_updates = ft.partial(self._poll_training_updates, verbosity=self._verbosity, state={})
    try:
        self._job.poll(poll_updates=poll_updates)
    finally:
        # One last update call, made even when polling raised.
        poll_updates(self._job, 1)

    self._fetch()
def partial_plot(self, data, cols, destination_key=None, nbins=20, plot=True, figsize=(7,10), server=False):
    """
    Create partial dependence plot which gives a graphical depiction of the marginal effect
    of a variable on the response. The effect of a variable is measured in change in the
    mean response.

    :param H2OFrame data: An H2OFrame object used for scoring and constructing the plot.
    :param cols: Feature(s) for which partial dependence will be calculated.
    :param destination_key: A key reference to the created partial dependence tables in H2O.
    :param nbins: Number of bins used.
    :param plot: A boolean specifying whether to plot partial dependence table.
    :param figsize: Dimension/size of the returning plots, adjust to fit your output cells.
    :param server: Forwarded to ``_get_matplotlib_pyplot``; presumably selects a
        non-interactive backend -- TODO confirm.
    :return: List of calculated mean response tables for each feature requested (the plots
        are drawn as a side effect). Returns None when plotting was requested but
        ``_get_matplotlib_pyplot`` yields nothing.
    :raises ValueError: if ``data`` is not an H2OFrame.
    :raises H2OValueError: if a requested column is not in ``data``.
    """
    if not isinstance(data, h2o.H2OFrame):
        raise ValueError("data must be an instance of H2OFrame")
    assert_is_type(cols, [str])
    assert_is_type(destination_key, None, str)
    assert_is_type(nbins, int)
    assert_is_type(plot, bool)
    assert_is_type(figsize, (int,int))

    # Check that the requested columns exist in the frame.
    for xi in cols:
        if xi not in data.names:
            raise H2OValueError("Column %s does not exist in the training frame" % xi)

    kwargs = {}
    kwargs['cols'] = cols
    kwargs['model_id'] = self.model_id
    kwargs['frame_id'] = data.frame_id
    kwargs['nbins'] = nbins
    kwargs['destination_key'] = destination_key

    job = H2OJob(h2o.api("POST /3/PartialDependence/", data=kwargs), job_type="PartialDependencePlot").poll()
    response = h2o.api("GET /3/PartialDependence/%s" % job.dest_key)

    # Extract the partial dependence tables from the response.
    pps = response['partial_dependence_data']

    # Plot the partial dependence tables using matplotlib.
    if plot:
        plt = _get_matplotlib_pyplot(server)
        if not plt:
            return

        fig, axs = plt.subplots(len(cols), squeeze=False, figsize=figsize)
        for i, pp in enumerate(pps):
            # Check whether the column is categorical or numeric.
            col = cols[i]
            cat = data[col].isfactor()[0]
            if cat:
                labels = pp[0]
                x = range(len(labels))
                y = pp[1]
                axs[i,0].plot(x, y, 'o')
                axs[i,0].set_xticks(x)
                axs[i,0].set_xticklabels(labels)
                axs[i,0].margins(0.2)
            else:
                axs[i,0].plot(pp[0], pp[1])
                axs[i,0].set_xlim(min(pp[0]), max(pp[0]))

            axs[i,0].set_title('Partial Dependence Plot For {}'.format(col))
            axs[i,0].set_xlabel(pp.col_header[0])
            axs[i,0].set_ylabel(pp.col_header[1])
            axs[i,0].xaxis.grid()
            axs[i,0].yaxis.grid()
        # Space out the subplots when several features are shown. This tests the number
        # of requested columns, not the length of the last column's name.
        if len(cols) > 1:
            fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

    return pps
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None, model_id=None):
    """
    Train the H2O model.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
    :param ignored_columns: Names or indices of columns to exclude; cannot be combined with x.
    :param model_id: Overrides the model_id parameter only when passed.
    """
    assert_is_type(training_frame, H2OFrame)
    assert_is_type(validation_frame, None, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)

    algo = self.algo
    parms = self._parms.copy()
    if "__class__" in parms:  # FIXME: hackt for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not (is_auto_encoder or algo in {"pca", "svd", "kmeans", "glrm", "word2vec"})
    ncols = training_frame.ncols
    names = training_frame.names

    def to_name(col, unknown_name_msg):
        # Resolve a column reference (index or name) to a validated column name.
        if is_type(col, int):
            if not (-ncols <= col < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % col)
            return names[col]
        if col not in names:
            raise H2OValueError(unknown_name_msg % col)
        return col

    if is_supervised:
        if y is None:
            y = "response"
        y = to_name(y, "Column %s does not exist in the training frame")
        if training_frame.types[y] == "enum":
            self._estimator_type = "classifier"
        else:
            self._estimator_type = "regressor"
    elif y is not None:
        raise H2OValueError("y should not be provided for an unsupervised model")
    assert_is_type(y, str, None)

    ignored_columns_set = set()
    if ignored_columns is not None:
        if x is not None:
            raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
        for ic in ignored_columns:
            ignored_columns_set.add(to_name(ic, "Column %s not in the training frame"))

    if x is None:
        # No explicit predictors: every column except the response and the ignored ones.
        xset = set(names) - {y} - ignored_columns_set
    else:
        xset = set()
        if is_type(x, int, str):
            x = [x]
        for xi in x:
            xset.add(to_name(xi, "Column %s not in the training frame"))
    x = list(xset)

    parms["offset_column"] = offset_column
    parms["fold_column"] = fold_column
    parms["weights_column"] = weights_column
    parms["max_runtime_secs"] = max_runtime_secs
    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    # Step 2: a response must be present exactly when the algo needs one.
    is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
    is_unsupervised = is_auto_encoder or self.algo in {"pca", "svd", "kmeans", "glrm", "word2vec"}
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if not is_unsupervised and y is None:
        raise ValueError("Missing response")

    # Step 3: fill in the frame and column parameters.
    parms["training_frame"] = training_frame
    if validation_frame is not None:
        parms["validation_frame"] = validation_frame
    if is_type(y, int):
        y = training_frame.names[y]
    if y is not None:
        parms["response_column"] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    if is_type(x[0], int):
        x = [training_frame.names[idx] for idx in x]

    special = [y, parms["offset_column"], parms["fold_column"], parms["weights_column"]]
    unused = list(set(training_frame.names) - set(x + special))
    parms["ignored_columns"] = [quoted(name) for name in unused] if unused else None

    interactions = parms.get("interactions")
    if interactions is None:
        parms["interactions"] = None
    else:
        parms["interactions"] = [quoted(name) for name in interactions]

    parms = {key: H2OEstimator._keyify_if_h2oframe(value) for key, value in parms.items()}
    rest_ver = parms.pop("_rest_version", 3)

    build_endpoint = "POST /%d/ModelBuilders/%s" % (rest_ver, self.algo)
    model = H2OJob(h2o.api(build_endpoint, data=parms), job_type=(self.algo + " Model Build"))

    if self._future:
        # Asynchronous mode: remember the job and REST version and return without waiting.
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll()
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def train(self, x=None, y=None, training_frame=None, fold_column=None,
          weights_column=None, validation_frame=None, leaderboard_frame=None,
          blending_frame=None):
    """
    Begin an AutoML task and block until it finishes.

    An AutoML task is a background task that automatically builds a number of models with
    various algorithms and tracks their performance in a leaderboard. At any point in the
    process you may use H2O's performance or prediction functions on the resulting models.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column. May be omitted
        only if ``self.response_column`` is already set.
    :param fold_column: The name or index of the column in training_frame that holds per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds per-row weights.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold_column or weights_column).
    :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets
        nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used
        for early stopping of individual models and early stopping of the grid searches. By default and
        when nfolds > 1, cross-validation metrics will be used for early stopping and thus
        validation_frame will be ignored.
    :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard. This is optional and
        if this is set to None (the default), then cross-validation metrics will be used to generate the
        leaderboard rankings instead.
    :param blending_frame: H2OFrame used to train the metalearning algorithm in Stacked Ensembles (instead
        of relying on cross-validated predicted values). This is optional, but when provided, it is also
        recommended to disable cross validation by setting `nfolds=0` and to provide a leaderboard frame
        for scoring purposes.

    :returns: The value of ``self.leader`` after the job has been polled to completion.
    :raises H2OValueError: if no response column is available, or if ``y`` or an entry of ``x``
        does not refer to a column of the training frame.
    :raises H2OResponseError: if the backend response does not contain a ``job`` entry.

    :examples:
    >>> # Set up an H2OAutoML object
    >>> aml = H2OAutoML(max_runtime_secs=30)
    >>> # Launch an AutoML run
    >>> aml.train(y=y, training_frame=train)
    """
    # Minimal required arguments are training_frame and y (response)
    self.training_frame = training_frame
    ncols = self.training_frame.ncols
    names = self.training_frame.names

    if y is None and self.response_column is None:
        raise H2OValueError(
            'The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.'
        )
    elif y is not None:
        assert_is_type(y, int, str)
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        elif y not in names:
            raise H2OValueError("Column %s does not exist in the training frame" % y)
        self.response_column = y

    self.fold_column = fold_column
    self.weights_column = weights_column
    self.validation_frame = validation_frame
    self.leaderboard_frame = leaderboard_frame
    self.blending_frame = blending_frame

    if x is not None:
        # x must be a list, so no scalar-to-list wrapping is needed afterwards.
        assert_is_type(x, list)
        xset = set()
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)

        # When y was not passed, the response comes from self.response_column; it must
        # be kept out of the ignored set just like an explicitly passed y.
        response = y if y is not None else self.response_column
        ignored_columns = set(names) - xset
        for col in (response, fold_column, weights_column):
            if col is not None:
                ignored_columns.discard(col)
        self.input_spec['ignored_columns'] = list(ignored_columns)

    def clean_params(params):
        # Recursively drop None-valued entries from dicts; leaves go through _keyify.
        if isinstance(params, dict):
            return {k: clean_params(v) for k, v in params.items() if v is not None}
        return H2OEstimator._keyify(params)

    automl_build_params = clean_params(
        dict(
            build_control=self.build_control,
            build_models=self.build_models,
            input_spec=self.input_spec,
        ))

    resp = self._build_resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
    if 'job' not in resp:
        raise H2OResponseError("Backend failed to build the AutoML job: {}".format(resp))

    if not self.project_name:
        self.project_name = resp['build_control']['project_name']
    self.__frozen = True

    self._job = H2OJob(resp['job'], "AutoML")
    poll_updates = ft.partial(self._poll_training_updates, verbosity=self._verbosity, state={})
    try:
        self._job.poll(poll_updates=poll_updates)
    finally:
        # Always issue one last update call, even if polling raised.
        poll_updates(self._job, 1)

    self._fetch()
    return self.leader
def train(self, x=None, y=None, training_frame=None, fold_column=None,
          weights_column=None, validation_frame=None, leaderboard_frame=None):
    """
    Begin an AutoML task and block until it finishes.

    An AutoML task is a background task that automatically builds a number of models with
    various algorithms and tracks their performance in a leaderboard. At any point in the
    process you may use H2O's performance or prediction functions on the resulting models.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column. Required.
    :param fold_column: The name or index of the column in training_frame that holds per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds per-row weights.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold and weights). Required.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param leaderboard_frame: H2OFrame with test data to be scored on in the leaderboard.

    :returns: None. On success the job is polled and ``self._fetch()`` is called; if the
        backend response has no ``job`` entry the response is printed and the method returns early.
    :raises ValueError: if ``y`` or ``training_frame`` is None.
    :raises H2OValueError: if ``y`` or an entry of ``x`` does not refer to a column of the
        training frame.

    :examples:
    >>> # Set up an H2OAutoML object
    >>> aml = H2OAutoML(max_runtime_secs=30)
    >>> # Launch H2OAutoML
    >>> aml.train(y=y, training_frame=training_frame)
    """
    # Minimal required arguments are training_frame and y (response). Check both for
    # None before touching any attribute of the frame.
    if y is None:
        raise ValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')
    if training_frame is None:
        raise ValueError('The training frame is not set!')
    assert_is_type(y, int, str)
    assert_is_type(training_frame, H2OFrame)

    ncols = training_frame.ncols
    names = training_frame.names
    if is_type(y, int):
        if not (-ncols <= y < ncols):
            raise H2OValueError("Column %d does not exist in the training frame" % y)
        y = names[y]
    elif y not in names:
        raise H2OValueError("Column %s does not exist in the training frame" % y)

    input_spec = {
        'response_column': y,
        'training_frame': training_frame.frame_id,
    }

    if fold_column is not None:
        assert_is_type(fold_column, int, str)
        input_spec['fold_column'] = fold_column

    if weights_column is not None:
        assert_is_type(weights_column, int, str)
        input_spec['weights_column'] = weights_column

    if validation_frame is not None:
        assert_is_type(validation_frame, H2OFrame)
        input_spec['validation_frame'] = validation_frame.frame_id

    if leaderboard_frame is not None:
        assert_is_type(leaderboard_frame, H2OFrame)
        input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

    if x is not None:
        # x must be a list, so no scalar-to-list wrapping is needed afterwards.
        assert_is_type(x, list)
        xset = set()
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)

        ignored_columns = set(names) - {y} - xset
        # set.remove() returns None and raises KeyError on a missing element, so use
        # discard() and keep the set bound to its own name.
        for special in (fold_column, weights_column):
            if special is not None:
                ignored_columns.discard(special)
        input_spec['ignored_columns'] = list(ignored_columns)

    # NOTE: if the user hasn't specified some block of parameters don't send them!
    # This lets the back end use the defaults.
    automl_build_params = dict(input_spec=input_spec)
    automl_build_params['build_control'] = self.build_control

    resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
    if 'job' not in resp:
        print("Exception from the back end: ")
        print(resp)
        return

    self._job = H2OJob(resp['job'], "AutoML")
    self._automl_key = self._job.dest_key
    self._job.poll()
    self._fetch()
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
          model_id=None, verbose=False):
    """
    Train the H2O model.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
    :param ignored_columns: Column names or indices to leave out of the predictors. Cannot be
        combined with ``x``.
    :param model_id: If given, overwrites the ``model_id`` build parameter.
    :param bool verbose: Print scoring history to stdout. Defaults to False.

    :returns: None. When ``self._future`` is set, the job is stored in ``self._job`` and the
        method returns without polling; otherwise the job is polled and the resulting model
        JSON is handed to ``self._resolve_model``.
    :raises H2OValueError: for a missing required training frame, an unsupported ``verbose``
        request, unknown columns, ``x`` together with ``ignored_columns``, or an ``r2``
        stopping metric.
    :raises ValueError: if a supervised algorithm ends up without a response column.
    """
    assert_is_type(training_frame, None, H2OFrame)
    assert_is_type(validation_frame, None, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)

    if self._requires_training_frame() and training_frame is None:
        # The algorithm name must be interpolated into the message with %, not
        # passed as a second positional argument to the exception.
        raise H2OValueError("Training frame required for %s algorithm, but none was given." % self.algo)

    frame_missing = training_frame is None
    if frame_missing:
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

    algo = self.algo
    if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
        raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")

    parms = self._parms.copy()
    if "__class__" in parms:  # FIXME: hack for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not (is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"})

    if not frame_missing:
        names = training_frame.names
        ncols = training_frame.ncols

    if not is_supervised:
        # If `y` is provided for an unsupervised model we'll simply ignore
        # it. This way an unsupervised model can be used as a step in
        # sklearn's pipeline.
        y = None
    elif not frame_missing:
        # Resolving y needs the frame's column names; without a frame y is left as given.
        if y is None:
            y = "response"
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        elif y not in names:
            raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"

    if not frame_missing:
        assert_is_type(y, str, None)
        ignored_columns_set = set()
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s not in the training frame" % ic)
                    ignored_columns_set.add(ic)
        if x is None:
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str):
                x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)

    parms["offset_column"] = offset_column
    parms["fold_column"] = fold_column
    parms["weights_column"] = weights_column

    if max_runtime_secs is not None:
        parms["max_runtime_secs"] = max_runtime_secs

    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    # Step 2: a supervised algorithm cannot proceed without a response.
    if is_supervised and y is None:
        raise ValueError("Missing response")

    # Step 3: assemble the request parameters.
    if not frame_missing:
        parms["training_frame"] = training_frame
    if validation_frame is not None:
        parms["validation_frame"] = validation_frame
    # NOTE(review): y and x are already resolved to names whenever a frame was given,
    # so the two index lookups below only trigger without a frame, where
    # training_frame.names is unavailable -- confirm the intended handling.
    if is_type(y, int):
        y = training_frame.names[y]
    if y is not None:
        parms["response_column"] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    if is_type(x[0], int):
        x = [training_frame.names[i] for i in x]
    if not frame_missing:
        used_columns = set(x + [y, offset_column, fold_column, weights_column])
        ignored_columns = list(set(training_frame.names) - used_columns)
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
    parms["interactions"] = (None if parms.get("interactions") is None else
                             [quoted(col) for col in parms["interactions"]])
    parms["interaction_pairs"] = (None if parms.get("interaction_pairs") is None else
                                  [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

    parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}

    # Guard against an explicit None, which does not support the `in` operator.
    stopping_metric = parms.get("stopping_metric")
    if stopping_metric is not None and "r2" in stopping_metric:
        raise H2OValueError("r2 cannot be used as an early stopping_metric yet. Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")

    rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

    model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
    model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

    if self._future:
        # Deferred mode: remember the job and return without polling.
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll(verbose_model_scoring_history=verbose)
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def train(self, x=None, y=None, training_frame=None, validation_frame=None, test_frame=None):
    """
    Begin the AutoML task and block until it finishes.

    The AutoML task is a background task that incrementally improves over time. At any
    point, the user may use "predict"/"performance" to inspect the incremental results.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column. Required.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights). Required.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param test_frame: H2OFrame with test data to be scored on in the leaderboard.

    :returns: None. On success the job is polled and ``self._fetch()`` is called; if the
        backend response has no ``job`` entry the response is printed and the method returns early.
    :raises ValueError: if ``y`` or ``training_frame`` is None.
    :raises H2OValueError: if ``y`` or an entry of ``x`` does not refer to a column of the
        training frame.

    :examples:
    >>> # Setting up an H2OAutoML object
    >>> build_control = {
    >>>     'stopping_criteria': {
    >>>         'stopping_rounds': 3,
    >>>         'stopping_tolerance': 0.001
    >>>     }
    >>> }
    >>> aml = H2OAutoML(max_runtime_secs=30, build_control=build_control)
    >>> # Launch H2OAutoML
    >>> aml.train(y=y, training_frame=training_frame)
    """
    # Minimal required arguments are training_frame and y (response). Check both for
    # None before touching any attribute of the frame.
    if y is None:
        raise ValueError(
            'The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.'
        )
    if training_frame is None:
        raise ValueError('The training frame is not set!')
    assert_is_type(y, int, str)
    assert_is_type(training_frame, H2OFrame)

    ncols = training_frame.ncols
    names = training_frame.names
    if is_type(y, int):
        if not (-ncols <= y < ncols):
            raise H2OValueError("Column %d does not exist in the training frame" % y)
        y = names[y]
    elif y not in names:
        raise H2OValueError("Column %s does not exist in the training frame" % y)

    input_spec = {
        'response_column': y,
        'training_frame': training_frame.frame_id,
    }

    if validation_frame is not None:
        assert_is_type(validation_frame, H2OFrame)
        input_spec['validation_frame'] = validation_frame.frame_id

    if test_frame is not None:
        assert_is_type(test_frame, H2OFrame)
        input_spec['test_frame'] = test_frame.frame_id

    if x is not None:
        # x must be a list, so no scalar-to-list wrapping is needed afterwards.
        assert_is_type(x, list)
        xset = set()
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)
        # Everything that is neither the response nor a chosen predictor is ignored.
        input_spec['ignored_columns'] = list(set(names) - {y} - xset)

    # NOTE: if the user hasn't specified some block of parameters don't send them!
    # This lets the back end use the defaults.
    automl_build_params = dict(input_spec=input_spec)
    automl_build_params['build_control'] = self.build_control

    resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
    if 'job' not in resp:
        print("Exception from the back end: ")
        print(resp)
        return

    self._job = H2OJob(resp['job'], "AutoML")
    self._automl_key = self._job.dest_key
    self._job.poll()
    self._fetch()
    if self.project_name is None:
        self.project_name = "automl_" + training_frame.frame_id
def train_segments(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
                   weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
                   segments=None, segment_models_id=None, parallelism=1, verbose=False):
    """
    Build one H2O model per segment (subpopulation) of the training dataset.

    The model-level arguments (``x`` through ``ignored_columns`` and ``verbose``) are forwarded
    to ``self._make_parms``; the segment-specific ones are added to the resulting parameters
    before the request is posted to the ``SegmentModelsBuilders`` endpoint.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param float max_runtime_secs: Maximum allowed runtime in seconds for each model training. Use 0 to
        disable. Please note that regardless of how this parameter is set, a model will be built for
        each input segment. This parameter only affects individual model training.
    :param ignored_columns: Forwarded unchanged to ``self._make_parms``.
    :param segments: A list of columns to segment-by. H2O will group the training (and validation)
        dataset by the segment-by columns and train a separate model for each segment (group of rows).
        As an alternative to providing a list of columns, users can also supply an explicit enumeration
        of segments to build the models for. This enumeration needs to be represented as H2OFrame.
    :param segment_models_id: Identifier for the returned collection of Segment Models. If not specified
        it will be automatically generated.
    :param parallelism: Level of parallelism of the bulk segment models building, it is the maximum
        number of models each H2O node will be building in parallel.
    :param bool verbose: Enable to print additional information during model building. Defaults to False.

    :returns: An ``H2OSegmentModels`` built from the finished job's ``dest_key``.
    :raises H2OValueError: if ``segments`` is None.

    :examples:

    >>> response = "survived"
    >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
    >>> titanic[response] = titanic[response].asfactor()
    >>> predictors = ["survived","name","sex","age","sibsp","parch","ticket","fare","cabin"]
    >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> titanic_gbm = H2OGradientBoostingEstimator(seed=1234)
    >>> titanic_models = titanic_gbm.train_segments(segments=["pclass"],
    ...                                             x=predictors,
    ...                                             y=response,
    ...                                             training_frame=train,
    ...                                             validation_frame=valid)
    >>> titanic_models.as_frame()
    """
    assert_is_type(segments, None, H2OFrame, [str])
    assert_is_type(verbose, bool)
    assert_is_type(segment_models_id, None, str)
    assert_is_type(parallelism, int)

    # Segments are mandatory even though the signature defaults them to None.
    if segments is None:
        missing_segments_msg = ("Parameter segments was not specified. Please provide either a list of columns to "
                                "segment-by or an explicit list of segments to build models for.")
        raise H2OValueError(missing_segments_msg)

    build_parms = self._make_parms(
        x=x,
        y=y,
        training_frame=training_frame,
        offset_column=offset_column,
        fold_column=fold_column,
        weights_column=weights_column,
        validation_frame=validation_frame,
        max_runtime_secs=max_runtime_secs,
        ignored_columns=ignored_columns,
        model_id=None,
        verbose=verbose,
    )

    # An H2OFrame is an explicit enumeration of segments; anything else is the
    # list of segment-by column names.
    segments_given_as_frame = isinstance(segments, H2OFrame)
    if not segments_given_as_frame:
        build_parms["segment_columns"] = segments
    else:
        build_parms["segments"] = H2OEstimator._keyify(segments)

    if segment_models_id:
        build_parms["segment_models_id"] = segment_models_id
    build_parms["parallelism"] = parallelism

    rest_ver = self._get_rest_version(build_parms)
    endpoint = "POST /%d/SegmentModelsBuilders/%s" % (rest_ver, self.algo)
    builder_response = h2o.api(endpoint, data=build_parms)

    segments_job = H2OJob(builder_response, job_type=(self.algo + " Segment Models Build"))
    segments_job.poll()
    return H2OSegmentModels(segments_job.dest_key)