def set_s3_credentials(secret_key_id, secret_access_key):
    """
    Send Amazon S3 credentials to the H2O backend.

    The values are only checked for being present and non-empty; they are then posted to the
    ``/3/PersistS3`` endpoint as-is. Per the original documentation, incorrect credentials
    only reveal themselves on the first S3 import call.

    :param secret_key_id: Amazon S3 Secret Key ID (provided by Amazon).
    :param secret_access_key: Amazon S3 Secret Access Key (provided by Amazon).
    :raises H2OValueError: if either value is None or empty.
    """
    # All "is None" checks run before any emptiness check, so a missing value is always
    # reported in preference to an empty one.
    for value, message in ((secret_key_id, "Secret key ID must be specified"),
                           (secret_access_key, "Secret access key must be specified")):
        if value is None:
            raise H2OValueError(message)
    for value, message in ((secret_key_id, "Secret key ID must not be empty"),
                           (secret_access_key, "Secret access key must not be empty")):
        if not value:
            raise H2OValueError(message)

    payload = {"secret_key_id": secret_key_id, "secret_access_key": secret_access_key}
    h2o.api(endpoint="POST /3/PersistS3", data=payload)
    print("Credentials successfully set.")
def poll(self):
    """
    Block until the job finishes, showing a progress bar while waiting.

    The progress bar repeatedly invokes ``self._refresh_job_status``. It is hidden when
    ``H2OJob.__PROGRESS_BAR__`` is falsy. A ``StopIteration("cancelled")`` coming out of the
    progress bar is turned into a cancel request for the job on the server.

    :returns: this job object.
    :raises H2OJobCancelled: if the job ended with status "CANCELLED".
    :raises EnvironmentError: if the job ended with status "FAILED".
    """
    try:
        bar = ProgressBar(title=self._job_type + " progress", hidden=not H2OJob.__PROGRESS_BAR__)
        bar.execute(self._refresh_job_status)
    except StopIteration as stop:
        # Any other StopIteration is swallowed on purpose; potentially we may want to
        # re-raise the exception here.
        if str(stop) == "cancelled":
            h2o.api("POST /3/Jobs/%s/cancel" % self.job_key)
            self.status = "CANCELLED"

    assert self.status in {"DONE", "CANCELLED", "FAILED"} or self._poll_count <= 0, \
        "Polling finished while the job has status %s" % self.status

    # Surface server-side warnings through the standard warnings machinery.
    for warning_text in (self.warnings or ()):
        warnings.warn(warning_text)

    # Check for failure and report it with as much detail as the job carries.
    if self.status == "CANCELLED":
        raise H2OJobCancelled("Job<%s> was cancelled by the user." % self.job_key)
    if self.status == "FAILED":
        job_info = self.job
        if isinstance(job_info, dict) and "stacktrace" in job_info:
            failure = "Job with key {} failed with an exception: {}\nstacktrace: " \
                      "\n{}".format(self.job_key, self.exception, job_info["stacktrace"])
        else:
            failure = "Job with key %s failed with an exception: %s" % (self.job_key, self.exception)
        raise EnvironmentError(failure)
    return self
def poll(self):
    """
    Block until the job finishes, showing a progress bar while waiting.

    The progress bar repeatedly invokes ``self._refresh_job_status``. A
    ``StopIteration("cancelled")`` coming out of the progress bar marks the job as cancelled,
    asks the server to cancel it and prints a notice.

    :returns: this job object.
    :raises EnvironmentError: if the job ended with status "CANCELLED" or "FAILED".
    """
    try:
        bar = ProgressBar(self._job_type + " progress")
        bar.execute(self._refresh_job_status)
    except StopIteration as stop:
        # Any other StopIteration is swallowed on purpose; potentially we may want to
        # re-raise the exception here.
        if str(stop) == "cancelled":
            self.status = "CANCELLED"
            h2o.api("POST /3/Jobs/%s/cancel" % self.job_key)
            print("Job {} was cancelled.".format(self.job_key))

    # Surface server-side warnings through the standard warnings machinery.
    for warning_text in (self.warnings or ()):
        warnings.warn(warning_text)

    # TODO: this needs to be thought through more carefully.
    # Check for failure and report it with as much detail as the job carries.
    if self.status == "CANCELLED":
        raise EnvironmentError("Job with key {} was cancelled by the user.".format(self.job_key))
    if self.status == "FAILED":
        job_info = self.job
        if isinstance(job_info, dict) and "stacktrace" in job_info:
            failure = "Job with key {} failed with an exception: {}\nstacktrace: " \
                      "\n{}".format(self.job_key, self.exception, job_info["stacktrace"])
        else:
            failure = "Job with key %s failed with an exception: %s" % (self.job_key, self.exception)
        raise EnvironmentError(failure)
    return self
def signal_handler(self, signum, stackframe):
    """
    (internal) Handle an interrupt signal received by the client.

    While the job is being polled (``self._polling`` is truthy) the signal is interpreted as a
    request to cancel the job on the server. Otherwise the call is delegated to Python's default
    SIGINT handler, which raises ``KeyboardInterrupt``.

    :param signum: number of the signal being handled.
    :param stackframe: stack frame that was active when the signal arrived.
    """
    if self._polling:
        h2o.api("POST /3/Jobs/%s/cancel" % self.job_key)
        print("Job {} was cancelled.".format(self.job_key))
    else:
        # The default handler must receive the handler arguments: newer Python versions
        # reject a zero-argument call with TypeError instead of raising KeyboardInterrupt.
        signal.default_int_handler(signum, stackframe)
def get_grid(self, sort_by=None, decreasing=None):
    """
    Retrieve an H2OGridSearch instance, optionally sorted by a metric.

    Parameters
    ----------
    sort_by : str, optional
      A metric by which to sort the models in the grid space. Choices are "logloss",
      "residual_deviance", "mse", "auc", "r2", "accuracy", "precision", "recall", "f1", etc.
    decreasing : bool, optional
      Sort the models in decreasing order of metric if true, otherwise sort in increasing
      order (default).

    Returns
    -------
      This very object when neither argument is given; otherwise a new H2OGridSearch instance
      built from the server's response for the requested ordering.
    """
    if sort_by is None and decreasing is None:
        return self

    sort_spec = {"sort_by": sort_by, "decreasing": decreasing}
    grid_json = h2o.api("GET /99/Grids/%s" % self._id, data=sort_spec)
    model_keys = grid_json['model_ids']

    sorted_grid = H2OGridSearch(self.model, self.hyper_params, self._id)
    # Models are re-fetched in the order reported by the server.
    sorted_grid.models = [h2o.get_model(key['name']) for key in model_keys]

    # The first model determines which metrics class gets mixed into the grid.
    first_model_json = h2o.api("GET /99/Models/%s" % model_keys[0]['name'])['models'][0]
    model_class = H2OGridSearch._metrics_class(first_model_json)
    template = model_class()
    template._id = self._id
    template._grid_json = grid_json
    template._parms = sorted_grid._parms

    H2OEstimator.mixin(sorted_grid, model_class)
    sorted_grid.__dict__.update(template.__dict__.copy())
    return sorted_grid
def get_grid(self, sort_by=None, decreasing=None):
    """
    Retrieve an H2OGridSearch instance, optionally sorted by a metric.

    Note that if neither cross-validation nor a validation frame is used in the grid search, then the
    training metrics will display in the "get grid" output. If a validation frame is passed to the grid, and
    ``nfolds = 0``, then the validation metrics will display. However, if ``nfolds`` > 1, then cross-validation
    metrics will display even if a validation frame is provided.

    :param str sort_by: A metric by which to sort the models in the grid space. Choices are: ``"logloss"``,
        ``"residual_deviance"``, ``"mse"``, ``"auc"``, ``"r2"``, ``"accuracy"``, ``"precision"``,
        ``"recall"``, ``"f1"``, etc.
    :param bool decreasing: Sort the models in decreasing order of metric if true, otherwise sort in
        increasing order (default).

    :returns: This very object when neither argument is given; otherwise a new H2OGridSearch instance
        built from the server's response for the requested ordering.
    """
    if sort_by is None and decreasing is None:
        return self

    sort_spec = {"sort_by": sort_by, "decreasing": decreasing}
    grid_json = h2o.api("GET /99/Grids/%s" % self._id, data=sort_spec)
    model_keys = grid_json['model_ids']

    sorted_grid = H2OGridSearch(self.model, self.hyper_params, self._id)
    # Models are re-fetched in the order reported by the server.
    sorted_grid.models = [h2o.get_model(key['name']) for key in model_keys]

    # The first model determines which metrics class gets mixed into the grid.
    first_model_json = h2o.api("GET /99/Models/%s" % model_keys[0]['name'])['models'][0]
    model_class = H2OGridSearch._metrics_class(first_model_json)
    template = model_class()
    template._id = self._id
    template._grid_json = grid_json
    template._parms = sorted_grid._parms

    H2OEstimator.mixin(sorted_grid, model_class)
    sorted_grid.__dict__.update(template.__dict__.copy())
    return sorted_grid
def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Assemble the model-builder parameters, submit the build job and resolve the model.

    :param x: predictor column name(s) or index(es); a scalar is wrapped into a list.
    :param y: response column name or index (may be None).
    :param tframe: training frame; its ``names`` are used to translate indexes to names.
    :param vframe: validation frame, or None.
    :param kwargs: dict of builder parameters; it is updated in place with the frames, the
        response column, ``ignored_columns`` and ``interactions``. It must already contain
        the keys ``offset_column``, ``fold_column`` and ``weights_column``.

    When ``self._future`` is truthy the job is stored in ``self._job`` and the method returns
    without waiting; otherwise it polls the job and resolves the finished model.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None:
        kwargs["validation_frame"] = vframe

    # Integer references are translated to column names of the training frame.
    if is_int(y):
        y = tframe.names[y]
    if y is not None:
        kwargs['response_column'] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    if is_int(x[0]):
        x = [tframe.names[idx] for idx in x]

    # Every column that is neither a predictor nor one of the special columns is ignored.
    special_columns = [y, kwargs["offset_column"], kwargs["fold_column"], kwargs["weights_column"]]
    unused = list(set(tframe.names) - set(x + special_columns))
    kwargs["ignored_columns"] = [quoted(col) for col in unused] if unused else None

    interactions = kwargs.get("interactions")
    kwargs["interactions"] = None if interactions is None else [quoted(col) for col in interactions]

    kwargs = {name: H2OEstimator._keyify_if_H2OFrame(value) for name, value in kwargs.items()}
    rest_ver = kwargs.pop("_rest_version", 3)
    algo = self._compute_algo()

    response = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, algo), data=kwargs)
    job = H2OJob(response, job_type=(algo + " Model Build"))
    if self._future:
        self._job = job
        return

    job.poll()
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, job.dest_key))["models"][0]
    self._resolve_model(job.dest_key, model_json)
def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Assemble the grid-search parameters, submit the grid build job and resolve the grid.

    :param x: predictor column name(s) or index(es); a scalar is wrapped into a list.
    :param y: response column name or index (may be None).
    :param tframe: training frame; its ``names`` are used to translate indexes to names.
    :param vframe: validation frame, or None.
    :param kwargs: dict of builder parameters; it is updated in place with the frames, the
        response column and ``ignored_columns``. It must already contain the keys
        ``offset_column``, ``fold_column`` and ``weights_column``.

    When ``self._future`` is truthy the job is stored in ``self._job`` and the method returns
    without waiting. Otherwise the job is polled, failure details reported by the server are
    printed, ``self.models`` is populated and the grid is resolved from its first model.

    :raises ValueError: if the grid search produced no model at all.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None:
        kwargs["validation_frame"] = vframe
    # Integer references are translated to column names of the training frame.
    if is_type(y, int):
        y = tframe.names[y]
    if y is not None:
        kwargs['response_column'] = y
    if not is_type(x, list, tuple):
        x = [x]
    if is_type(x[0], int):
        x = [tframe.names[i] for i in x]
    offset = kwargs["offset_column"]
    folds = kwargs["fold_column"]
    weights = kwargs["weights_column"]
    # Every column that is neither a predictor nor one of the special columns is ignored.
    ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
    kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
    # Drop None-valued parameters and replace H2OFrame values by their frame ids.
    kwargs = dict([(k, kwargs[k].frame_id if isinstance(kwargs[k], H2OFrame) else kwargs[k])
                   for k in kwargs if kwargs[k] is not None])  # gruesome one-liner
    algo = self.model._compute_algo()  # unique to grid search
    if self.grid_id is not None:
        kwargs["grid_id"] = self.grid_id

    # The REST version is only used below when fetching the first model (falling back to 3);
    # the grid endpoints themselves are always addressed through /99.
    rest_ver = kwargs.pop("_rest_version") if "_rest_version" in kwargs else None

    grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))

    if self._future:
        self._job = grid
        return

    grid.poll()

    grid_json = h2o.api("GET /99/Grids/%s" % (grid.dest_key))
    failure_messages_stacks = ""
    error_index = 0
    if len(grid_json["failure_details"]) > 0:
        print("Errors/Warnings building gridsearch model\n")
        # will raise error if no grid model is returned, store error messages here
        # NOTE(review): the nesting of this loop was reconstructed from flattened source;
        # confirm against the upstream file.
        for error_message in grid_json["failure_details"]:
            if isinstance(grid_json["failed_params"][error_index], dict):
                for h_name in grid_json['hyper_names']:
                    print("Hyper-parameter: {0}, {1}".format(h_name,
                                                             grid_json['failed_params'][error_index][h_name]))

            if len(grid_json["failure_stack_traces"]) > error_index:
                print("failure_details: {0}\nfailure_stack_traces: "
                      "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))
                failure_messages_stacks += error_message+'\n'
            error_index += 1

    self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

    # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
    # sometimes no model is returned due to bad parameter values provided by the user.
    if len(grid_json['model_ids']) > 0:
        first_model_json = h2o.api("GET /%d/Models/%s" %
                                   (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
        self._resolve_grid(grid.dest_key, grid_json, first_model_json)
    else:
        if len(failure_messages_stacks)>0:
            raise ValueError(failure_messages_stacks)
        else:
            raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
def is_running(self):
    """
    Determine if the H2O cluster is running or not.

    A local server that reports it is not running short-circuits the check; otherwise the
    cluster is probed with a ``GET /`` request.

    :returns: True if the cluster is up; False otherwise (including when the probe raises
        H2OConnectionError or H2OServerError).
    """
    try:
        server = h2o.connection().local_server
        if server and not server.is_running():
            return False
        h2o.api("GET /")
    except (H2OConnectionError, H2OServerError):
        return False
    return True
def test_na_omits():
    """Check that ``na_omit`` removes the single row holding a missing value."""

    def total_missing(frame):
        # Sum the per-column missing counts reported by the frame summary endpoint.
        endpoint = "GET /3/Frames/%s/summary" % urllib.parse.quote(frame.frame_id)
        columns = h2o.api(endpoint)["frames"][0]["columns"]
        return sum([col["missing_count"] for col in columns])

    hf = h2o.H2OFrame({'A': [1, 'NA', 2], 'B': [1, 2, 3], 'C': [4, 5, 6]})
    hf.summary()
    assert total_missing(hf) == 1  # we have one NAN here

    # attempt to remove it by calling na_omit
    hf_naomit = hf.na_omit()
    hf_naomit.summary()
    assert total_missing(hf_naomit) == 0  # we have removed the NAN row
def _refresh_job_status(self):
    """
    Fetch the current state of the job from the server and cache it on this object.

    Updates ``job``, ``status``, ``progress`` (capped at 1), ``exception`` and ``warnings``
    (None when the server response carries no warnings).
    """
    response = h2o.api("GET /3/Jobs/%s" % self.job_key)
    # The payload is found under either "jobs" or "job", depending on the response.
    job_entries = response["jobs"] if "jobs" in response else response["job"]
    self.job = job_entries[0]
    self.status = self.job["status"]
    self.progress = min(self.job["progress"], 1)
    self.exception = self.job["exception"]
    if "warnings" in self.job:
        self.warnings = self.job["warnings"]
    else:
        self.warnings = None
def test_api_timestamp():
    """Check the model timestamp reported by the REST API and the estimator's timing fields."""
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    train["CAPSULE"] = train["CAPSULE"].asfactor()

    # Build H2O GBM classification model:
    gbm = H2OGradientBoostingEstimator(ntrees=1,
                                       learn_rate=0.1,
                                       max_depth=5,
                                       min_rows=10,
                                       distribution="bernoulli",
                                       model_id="test_timestamp")
    predictors = list(range(1, train.ncol))
    gbm.train(x=predictors, y="CAPSULE", training_frame=train)

    fetched = h2o.get_model(model_id="test_timestamp")
    listing = h2o.api("GET /3/Models")
    assert fetched._model_json['timestamp'] == listing["models"][0]["timestamp"], "Timestamp should be the same."

    assert gbm.start_time is not None and gbm.start_time > 0
    assert gbm.end_time is not None and gbm.end_time > 0
    assert gbm.run_time is not None and gbm.run_time > 0
    assert gbm.end_time - gbm.start_time == gbm.run_time
def _resolve_model(future_model, **kwargs):
    """
    Wait for a model build to finish and wrap the result in the matching model class.

    :param future_model: object exposing ``poll()`` and ``job.dest_key``.
    :param kwargs: only ``_rest_version`` is read (default 3); it selects the REST version
        used to fetch the model.
    :returns: a model object whose class is chosen by the model's ``model_category``.
    :raises NotImplementedError: if the model category is not one of the handled ones.
    """
    future_model.poll()
    rest_ver = kwargs.get("_rest_version", 3)
    dest_key = future_model.job.dest_key
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, dest_key))["models"][0]
    category = model_json["output"]["model_category"]

    # Model category reported by the server -> client-side wrapper class.
    wrappers = {
        "Binomial": H2OBinomialModel,
        "Clustering": H2OClusteringModel,
        "Regression": H2ORegressionModel,
        "Multinomial": H2OMultinomialModel,
        "Ordinal": H2OOrdinalModel,
        "AutoEncoder": H2OAutoEncoderModel,
        "DimReduction": H2ODimReductionModel,
        "WordEmbedding": H2OWordEmbeddingModel,
    }
    if category not in wrappers:
        raise NotImplementedError(category)
    return wrappers[category](dest_key, model_json)
def get_automl(project_name):
    """
    Retrieve information about an AutoML instance.

    :param str project_name: A string indicating the project_name of the automl instance to retrieve.
    :returns: A dictionary containing the project_name, leader model, and leaderboard. The leader is
        None when the leaderboard holds no models.
    """
    automl_json = h2o.api("GET /99/AutoML/%s" % project_name)
    project_name = automl_json["project_name"]
    leaderboard_list = [key["name"] for key in automl_json['leaderboard']['models']]

    # An empty leaderboard means there is no leader yet; do not ask the server for a model
    # with a None id in that case.
    leader_id = leaderboard_list[0] if leaderboard_list else None
    leader = h2o.get_model(leader_id) if leader_id is not None else None

    # Intentionally mask the progress bar here since showing multiple progress bars is confusing to users.
    # Whatever happens, the user's original progress setting is restored afterwards; any error
    # raised while building the frame propagates unchanged.
    is_progress = H2OJob.__PROGRESS_BAR__
    h2o.no_progress()
    try:
        # Parse leaderboard H2OTwoDimTable & return as an H2OFrame
        table = automl_json["leaderboard_table"]
        leaderboard = h2o.H2OFrame(table.cell_values, column_names=table.col_header)
    finally:
        if is_progress is True:
            h2o.show_progress()

    leaderboard = leaderboard[1:]
    return {'project_name': project_name, "leader": leader, "leaderboard": leaderboard}
def confusion_matrix(self, data):
    """
    Return a confusion matrix based on H2O's default prediction threshold for a dataset.

    :param data: an H2OFrame; it is scored against this model through the Predictions endpoint.
    :returns: the ``cm`` table of the first model-metrics entry in the server response.
    """
    assert_is_type(data, H2OFrame)
    endpoint = "POST /3/Predictions/models/%s/frames/%s" % (self._id, data.frame_id)
    response = h2o.api(endpoint)
    first_metrics = response["model_metrics"][0]
    return first_metrics["cm"]["table"]
def _fetch(self):
    """
    Refresh the leader id and the leaderboard from the server.

    :returns: True when the leaderboard contains at least one model, False otherwise.
    """
    res = h2o.api("GET /99/AutoML/" + self._automl_key)
    model_names = [key["name"] for key in res['leaderboard']['models']]
    self._leader_id = model_names[0] if model_names else None

    # Intentionally mask the progress bar here since showing multiple progress bars is confusing to users.
    # Whatever happens, the user's original progress setting is restored afterwards; any error
    # raised while building the frame propagates to the caller.
    progress_was_on = H2OJob.__PROGRESS_BAR__
    h2o.no_progress()
    try:
        # Parse leaderboard H2OTwoDimTable & return as an H2OFrame
        table = res["leaderboard_table"]
        leaderboard = h2o.H2OFrame(table.cell_values, column_names=table.col_header)
    finally:
        if progress_was_on is True:
            h2o.show_progress()

    self._leaderboard = leaderboard[1:]
    return self._leader_id is not None
def download_mojo(self, path=".", get_genmodel_jar=False):
    """
    Download the model in MOJO format.

    :param path: the path where MOJO file should be saved.
    :param get_genmodel_jar: if True, then also download h2o-genmodel.jar and store it in folder ``path``.
    :returns: name of the MOJO file written.
    :raises H2OValueError: if this model's algorithm is not one of drf, gbm, deepwater or glrm.
    """
    assert_is_type(path, str)
    assert_is_type(get_genmodel_jar, bool)

    mojo_algos = {"drf", "gbm", "deepwater", "glrm"}
    if self.algo not in mojo_algos:
        raise H2OValueError("MOJOs are currently supported for Distributed Random Forest, "
                            "Gradient Boosting Machine, Deep Water and GLRM models only.")

    if get_genmodel_jar:
        jar_target = os.path.join(path, "h2o-genmodel.jar")
        h2o.api("GET /3/h2o-genmodel.jar", save_to=jar_target)

    return h2o.api("GET /3/Models/%s/mojo" % self.model_id, save_to=path)
def _model_build(x, y, tframe, vframe, algo, kwargs):
    """
    Submit a model build for ``algo`` and return the resolved model.

    :param x: passed through to ``H2OModelFuture`` together with the build job.
    :param y: response column selector; when given, the name of ``tframe[y]`` becomes the
        ``response_column`` parameter.
    :param tframe: training frame.
    :param vframe: validation frame, or None.
    :param algo: algorithm name used in the ModelBuilders endpoint.
    :param kwargs: dict of builder parameters; the frames and the response column are written
        into it in place before the request parameters are derived from it.
    :returns: whatever ``_resolve_model`` returns for the finished build.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None:
        kwargs["validation_frame"] = vframe
    if y is not None:
        kwargs['response_column'] = tframe[y].names[0]

    # Request parameters: None values are dropped, H2OFrame values are replaced by frame ids.
    params = {}
    for name in kwargs:
        value = kwargs[name]
        if value is None:
            continue
        params[name] = value._frame().frame_id if isinstance(value, H2OFrame) else value

    rest_ver = params.pop("_rest_version", 3)
    response = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, algo), data=params)
    job = H2OJob(response, job_type=(algo + " Model Build"))
    future_model = H2OModelFuture(job, x)
    return _resolve_model(future_model, _rest_version=rest_ver, **params)
def fit(self, fr, **fit_params):
    """
    Run the assembly's steps on a frame.

    :param fr: frame to process; its ``frame_id`` is sent to the server.
    :param fit_params: accepted but not used by this method.
    :returns: the H2OFrame named in the server's ``result`` entry.
    """
    # Each step is a (name, operation) pair; the operation serialises itself under that name.
    rest_steps = [step[1].to_rest(step[0]) for step in self.steps]
    # Double quotes are swapped for single quotes before each step is quoted.
    quoted_steps = [quoted(text.replace('"', "'")) for text in rest_steps]
    payload = "[" + ",".join(quoted_steps) + "]"

    response = h2o.api("POST /99/Assembly", data={"steps": payload, "frame": fr.frame_id})
    self.id = response["assembly"]["name"]
    return H2OFrame.get_frame(response["result"]["name"])
def join(self):
    """
    Wait until job's completion.

    Clears the future flag, polls the pending job, then fetches the finished model's JSON and
    resolves it on this object.
    """
    self._future = False
    pending = self._job
    pending.poll()
    model_key = pending.dest_key
    self._job = None

    endpoint = "GET /%d/Models/%s" % (self._rest_version, model_key)
    model_json = h2o.api(endpoint)["models"][0]
    self._resolve_model(model_key, model_json)
def confusion_matrix(self, data):
    """
    Return a confusion matrix based on H2O's default prediction threshold for a dataset.

    :param H2OFrame data: the frame with the prediction results for which the confusion matrix should be
        extracted.
    :returns: the ``cm`` table of the first model-metrics entry in the server response.
    """
    assert_is_type(data, H2OFrame)
    endpoint = "POST /3/Predictions/models/%s/frames/%s" % (self._id, data.frame_id)
    response = h2o.api(endpoint)
    first_metrics = response["model_metrics"][0]
    return first_metrics["cm"]["table"]
def rapids(expr):
    """
    Execute a Rapids expression.

    The expression is posted together with the current connection's session id.

    :param expr: The rapids expression (ascii string).

    :returns: The JSON response (as a python dictionary) of the Rapids execution
    """
    request = {"ast": expr, "session_id": h2o.connection().session_id}
    return h2o.api("POST /99/Rapids", data=request)
def available():
    """
    Tell whether a deep water model can be built.

    The decision is based on the ``visibility`` the server reports for the deepwater model
    builder: "Experimental" is treated as "no backend found".

    :returns: True if a deep water model can be built, or False otherwise.
    """
    builder_json = h2o.api("GET /3/ModelBuilders", data={"algo": "deepwater"})
    deepwater_info = builder_json["model_builders"]["deepwater"]
    if deepwater_info["visibility"] != "Experimental":
        return True
    print("Cannot build a Deep Water model - no backend found.")
    return False
def shutdown(self, prompt=False):
    """
    Shut down the server.

    This method checks if the H2O cluster is still running, and if it does shuts it down (via a REST API call).
    Nothing happens when the cluster is not running, or when the user does not confirm the prompt
    with "y" or "yes" (case-insensitive).

    :param prompt: A logical value indicating whether to prompt the user before shutting down the H2O server.
    """
    if not self.is_running():
        return
    assert_is_type(prompt, bool)

    answer = "Y"
    if prompt:
        question = "Are you sure you want to shutdown the H2O instance running at %s (Y/N)? " \
                   % h2o.connection().base_url
        answer = input(question)  # works in Py2 & Py3 because redefined in h2o.utils.compatibility module

    if answer.lower() not in {"y", "yes"}:
        return
    h2o.api("POST /3/Shutdown")
    h2o.connection().close()
def _refresh_job_status(self): if self._poll_count <= 0: raise StopIteration("") jobs = h2o.api("GET /3/Jobs/%s" % self.job_key) self.job = jobs["jobs"][0] if "jobs" in jobs else jobs["job"][0] self.status = self.job["status"] self.progress = min(self.job["progress"], 1) self.exception = self.job["exception"] self.warnings = self.job["warnings"] if "warnings" in self.job else None self._poll_count -= 1 return self.progress
def fill(self, rows=10):
    """
    Populate the local cache with frame metadata and data fetched from the server.

    Nothing is fetched when data is already cached and ``rows`` does not exceed ``len(self)``.

    :param rows: value sent to the server as ``row_count``; also recorded in ``self._l``.
    """
    assert self._id is not None
    if self._data is not None and rows <= len(self):
        return

    frame_json = h2o.api("GET /3/Frames/%s" % self._id, data={"row_count": rows})["frames"][0]
    columns = frame_json["columns"]
    self._l = rows
    self._nrows = frame_json["rows"]
    self._ncols = frame_json["total_column_count"]
    self._names = [col["label"] for col in columns]
    # Map each column label to the type the server reports for it.
    self._types = dict(zip(self._names, [col["type"] for col in columns]))
    self._fill_data(frame_json)
def fit(self, fr):
    """
    To perform the munging operations on a frame specified in steps on the frame fr.

    :param fr: H2OFrame where munging operations are to be performed on.
    :return: H2OFrame after munging operations are completed.
    """
    assert_is_type(fr, H2OFrame)

    # Each step is a (name, operation) pair; the operation serialises itself under that name,
    # double quotes are swapped for single quotes and the result is quoted.
    quoted_steps = []
    for step in self.steps:
        rest_text = step[1].to_rest(step[0])
        quoted_steps.append(quoted(rest_text.replace('"', "'")))
    payload = "[%s]" % ",".join(quoted_steps)

    response = h2o.api("POST /99/Assembly", data={"steps": payload, "frame": fr.frame_id})
    self.id = response["assembly"]["name"]
    return H2OFrame.get_frame(response["result"]["name"])
def deepfeatures(self, test_data, layer):
    """
    Return hidden layer details.

    A prediction job is started with ``deep_features_hidden_layer`` set to ``layer``; once it
    finishes, the frame stored under the job's destination key is returned.

    :param test_data: Data to create a feature space on
    :param layer: 0 index hidden layer
    :raises ValueError: if ``test_data`` is None.
    """
    if test_data is None:
        raise ValueError("Must specify test data")

    endpoint = "POST /4/Predictions/models/%s/frames/%s" % (self._id, test_data.frame_id)
    response = h2o.api(endpoint, data={"deep_features_hidden_layer": layer})
    job = H2OJob(response, "deepfeatures")
    job.poll()
    return h2o.get_frame(job.dest_key)
def predict_leaf_node_assignment(self, test_data):
    """
    Predict on a dataset and return the leaf node assignment (only for tree-based models).

    :param H2OFrame test_data: Data on which to make predictions.

    :returns: A new H2OFrame of predictions.
    :raises ValueError: if ``test_data`` is not an H2OFrame.
    """
    if not isinstance(test_data, h2o.H2OFrame):
        raise ValueError("test_data must be an instance of H2OFrame")

    endpoint = "POST /3/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id)
    response = h2o.api(endpoint, data={"leaf_node_assignment": True})
    predictions_name = response["predictions_frame"]["name"]
    return h2o.get_frame(predictions_name)
def predict(self, test_data):
    """
    Predict on a dataset.

    A prediction job is started on the server; once it finishes, the frame stored under the
    job's destination key is returned.

    :param H2OFrame test_data: Data on which to make predictions.

    :returns: A new H2OFrame of predictions.
    :raises ValueError: if ``test_data`` is not an H2OFrame.
    """
    if not isinstance(test_data, h2o.H2OFrame):
        raise ValueError("test_data must be an instance of H2OFrame")

    endpoint = "POST /4/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id)
    response = h2o.api(endpoint)
    job = H2OJob(response, self._model_json['algo'] + " prediction")
    job.poll()
    return h2o.get_frame(job.dest_key)
def fit(self, fr):
    """
    To perform the munging operations on a frame specified in steps on the frame fr.

    :param fr: H2OFrame where munging operations are to be performed on.
    :return: H2OFrame after munging operations are completed.

    :examples:

    >>> iris = h2o.load_dataset("iris")
    >>> assembly = H2OAssembly(steps=[("col_select",
    ...                        H2OColSelect(["Sepal.Length",
    ...                        "Petal.Length", "Species"])),
    ...                       ("cos_Sepal.Length",
    ...                        H2OColOp(op=H2OFrame.cos,
    ...                        col="Sepal.Length",
    ...                        inplace=True)),
    ...                       ("str_cnt_Species",
    ...                        H2OColOp(op=H2OFrame.countmatches,
    ...                        col="Species",
    ...                        inplace=False,
    ...                        pattern="s"))])
    >>> fit = assembly.fit(iris)
    >>> fit
    """
    assert_is_type(fr, H2OFrame)

    # Each step is a (name, operation) pair; the operation serialises itself under that name,
    # double quotes are swapped for single quotes and the result is quoted.
    quoted_steps = []
    for step in self.steps:
        rest_text = step[1].to_rest(step[0])
        quoted_steps.append(quoted(rest_text.replace('"', "'")))
    payload = "[%s]" % ",".join(quoted_steps)

    response = h2o.api("POST /99/Assembly", data={"steps": payload, "frame": fr.frame_id})
    self.id = response["assembly"]["name"]
    return H2OFrame.get_frame(response["result"]["name"])
def transform(self, frame, data_leakage_handling="None", noise=-1, seed=-1):
    """
    Apply target-encoding transformations to a frame through the TargetEncoderTransform endpoint.

    All arguments are forwarded to the server together with this model's id.

    :param H2OFrame frame: to which frame we are applying target encoding transformations.
    :param str data_leakage_handling: Supported options:

        1) "KFold" - encodings for a fold are generated based on out-of-fold data.
        2) "LeaveOneOut" - leave one out. Current row's response value is subtracted from the
           pre-calculated per-level frequencies.
        3) "None" - we do not holdout anything. Using whole frame for training

    :param float noise: the amount of random noise added to the target encoding. This helps prevent
        overfitting. The client-side default is -1, which is sent to the server unchanged
        (presumably the backend then chooses the noise level itself - confirm).
    :param int seed: a random seed used to generate draws from the uniform distribution for random noise.
        Defaults to -1.
    :returns: the frame whose name the server reports in its response.

    :example:

    >>> targetEncoder = TargetEncoder(encoded_columns=te_columns, target_column=responseColumnName,
    ...                               blended_avg=True, inflection_point=10, smoothing=20)
    >>> encodedTrain = targetEncoder.transform(frame=trainFrame, data_leakage_handling="None", seed=1234)
    """
    request = {'model': self.model_id,
               'frame': frame.key,
               'data_leakage_handling': data_leakage_handling,
               'noise': noise,
               'seed': seed}
    output = h2o.api("GET /3/TargetEncoderTransform", data=request)
    return h2o.get_frame(output["name"])
def transform(self, words, aggregate_method):
    """
    Transform words (or sequences of words) to vectors using a word2vec model.

    :param words: An H2OFrame made of a single column containing source words; its ``frame_id``
        is sent to the server.
    :param str aggregate_method: Specifies how to aggregate sequences of words. If method is `NONE`
        then no aggregation is performed and each input word is mapped to a single word-vector.
        If method is 'AVERAGE' then input is treated as sequences of words delimited by NA.
        Each word of a sequences is internally mapped to a vector and vectors belonging to
        the same sentence are averaged and returned in the result.

    :returns: the frame the server reports as ``vectors_frame``.

    :examples:

    >>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"),
    ...                               col_names = ["category", "jobtitle"],
    ...                               col_types = ["string", "string"],
    ...                               header = 1)
    >>> STOP_WORDS = ["ax","i","you","edu","s","t","m","subject","can","lines","re","what",
    ...               "there","all","we","one","the","a","an","of","or","in","for","by","on",
    ...               "but","is","in","a","not","with","as","was","if","they","are","this","and","it","have",
    ...               "from","at","my","be","by","not","that","to","from","com","org","like","likes","so"]
    >>> words = job_titles.tokenize(" ")
    >>> words = words[(words.isna()) | (~ words.isin(STOP_WORDS)),:]
    >>> w2v_model = H2OWord2vecEstimator(epochs = 10)
    >>> w2v_model.train(training_frame=words)
    >>> job_title_vecs = w2v_model.transform(words, aggregate_method = "AVERAGE")
    """
    request = {'model': self.model_id,
               'words_frame': words.frame_id,
               'aggregate_method': aggregate_method}
    response = h2o.api("GET /3/Word2VecTransform", data=request)
    vectors_name = response["vectors_frame"]["name"]
    return h2o.get_frame(vectors_name)
def _resolve_model(future_model, **kwargs):
    """
    Wait for a model build to finish and wrap the result in the matching model class.

    :param future_model: object exposing ``poll()`` and ``job.dest_key``.
    :param kwargs: only ``_rest_version`` is read (default 3); it selects the REST version
        used to fetch the model.
    :returns: a model object whose class is chosen by the model's ``model_category``.
    :raises NotImplementedError: if the model category is not one of the handled ones.
    """
    future_model.poll()
    rest_ver = kwargs.get("_rest_version", 3)
    dest_key = future_model.job.dest_key
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, dest_key))["models"][0]
    category = model_json["output"]["model_category"]

    # Model category reported by the server -> client-side wrapper class.
    wrappers = {
        "Binomial": H2OBinomialModel,
        "Clustering": H2OClusteringModel,
        "Regression": H2ORegressionModel,
        "Multinomial": H2OMultinomialModel,
        "AutoEncoder": H2OAutoEncoderModel,
        "DimReduction": H2ODimReductionModel,
        "WordEmbedding": H2OWordEmbeddingModel,
    }
    if category not in wrappers:
        raise NotImplementedError(category)
    return wrappers[category](dest_key, model_json)
def __init__(self, model, tree_number, tree_class=None):
    """
    Fetch one tree of a model from the server and cache its structure on this object.

    :param model: model owning the tree; its ``model_id`` is sent to the server and the model
        itself is handed to the categorical-level decoding helper.
    :param tree_number: number of the tree to fetch, forwarded to the ``/3/Tree`` endpoint.
    :param tree_class: class of the tree to fetch, forwarded to the ``/3/Tree`` endpoint
        (None by default).
    """
    params = {"model": model.model_id,
              "tree_number": tree_number,
              "tree_class": tree_class}
    response = h2o.api(endpoint="GET /3/Tree", data=params)

    # NOTE(review): the private helpers called below presumably read attributes assigned on
    # earlier lines (children, thresholds, features, ...) - confirm before reordering.
    self._left_children = response['left_children']
    self._right_children = response['right_children']
    self._node_ids = self.__extract_internal_ids(response['root_node_id'])
    self._descriptions = response['descriptions']
    self._model_id = model.model_id
    # Tree number and class are taken from the response rather than from the arguments.
    self._tree_number = response['tree_number']
    self._tree_class = response['tree_class']
    self._thresholds = self.__convert_threshold_nans(response['thresholds'])
    self._features = response['features']
    self._levels = self.__decode_categoricals(model, response['levels'])
    self._nas = response['nas']
    self._predictions = response['predictions']
    self._root_node = self.__assemble_tree(0)
    self._tree_decision_path = response['tree_decision_path']
    self._decision_paths = response['decision_paths']
    # Categorical splits are derived per node and stored separately for each side.
    (left, right) = self.__per_node_cat_splits()
    self._left_cat_split = left
    self._right_cat_split = right
def model_performance(self, test_data=None, train=False, valid=False, xval=False):
    """
    Generate model metrics for this model on test_data.

    Parameters
    ----------
    test_data: H2OFrame, optional
      Data set for which model metrics shall be computed against. All three of train, valid and xval
      arguments are ignored if test_data is not None.
    train: boolean, optional
      Report the training metrics for the model. This is also what is reported when none of
      train, valid and xval is set.
    valid: boolean, optional
      Report the validation metrics for the model (only when train is not set).
    xval: boolean, optional
      Report the cross-validation metrics for the model (only when neither train nor valid is set).

    :returns: The stored metrics of the requested kind when test_data is None; otherwise a metrics
        object built by ``self._metrics_class`` from the server's metrics for test_data.
    :raises ValueError: if test_data is given but is not an H2OFrame.
    """
    if test_data is None:
        if not train and not valid and not xval:
            train = True  # default to train
        output = self._model_json["output"]
        # Precedence when several flags are set: train, then valid, then xval.
        if train:
            return output["training_metrics"]
        if valid:
            return output["validation_metrics"]
        if xval:
            return output["cross_validation_metrics"]
    else:  # cases dealing with test_data not None
        if not isinstance(test_data, h2o.H2OFrame):
            # The type must be formatted into the message: concatenating it to a str would
            # raise TypeError and mask the intended ValueError.
            raise ValueError("`test_data` must be of type H2OFrame.  Got: %s" % type(test_data))
        res = h2o.api("POST /3/ModelMetrics/models/%s/frames/%s" % (self.model_id, test_data.frame_id))

        # FIXME need to do the client-side filtering...  (PUBDEV-874)
        # Pick the first metrics entry computed on the requested frame.
        raw_metrics = None
        for mm in res["model_metrics"]:
            if mm["frame"] is not None and mm["frame"]["name"] == test_data.frame_id:
                raw_metrics = mm
                break
        return self._metrics_class(raw_metrics, algo=self._model_json["algo"])
def __init__(self, nfolds=5, balance_classes=False, class_sampling_factors=None, max_after_balance_size=5.0,
             max_runtime_secs=3600, max_models=None, stopping_metric="AUTO", stopping_tolerance=None,
             stopping_rounds=3, seed=None, project_name=None, exclude_algos=None,
             keep_cross_validation_predictions=True, keep_cross_validation_models=True,
             sort_metric="AUTO"):
    """
    Validate the AutoML settings and store them for a later ``train()`` call.

    The settings end up in ``self.build_control`` and ``self.build_models``; the only backend call
    made here is a probe for the AutoML schema.

    :param nfolds: non-negative int other than 1; ``0`` disables cross-validation.
    :param balance_classes: forwarded to the backend only when it is ``True``.
    :param class_sampling_factors: forwarded to the backend when not ``None``.
    :param max_after_balance_size: float, forwarded to the backend when different from ``5.0``.
    :param max_runtime_secs: int, stored as a stopping criterion.
    :param max_models: optional int, stored as a stopping criterion when given.
    :param stopping_metric: str, stored as a stopping criterion.
    :param stopping_tolerance: optional float, stored as a stopping criterion when given.
    :param stopping_rounds: int, stored as a stopping criterion.
    :param seed: optional int, stored among the stopping criteria when given.
    :param project_name: optional str; when ``None`` the name is left unset here.
    :param exclude_algos: optional list of str stored in ``self.build_models``.
    :param keep_cross_validation_predictions: bool forwarded to the backend.
    :param keep_cross_validation_models: bool forwarded to the backend.
    :param sort_metric: ``"AUTO"`` is stored as ``None``, any other value is stored unchanged.
    """
    # Check if H2O jar contains AutoML
    try:
        h2o.api("GET /3/Metadata/schemas/AutoMLV99")
    except h2o.exceptions.H2OResponseError as e:
        # Best effort only: the problem is reported and construction continues.
        print(e)
        print("*******************************************************************\n" \
              "*Please verify that your H2O jar has the proper AutoML extensions.*\n" \
              "*******************************************************************\n" \
              "\nVerbose Error Message:")

    # Make bare minimum build_control (if max_runtimes_secs is an invalid value, it will catch below)
    self.build_control = {
        'stopping_criteria': {
            'max_runtime_secs': max_runtime_secs,
        }
    }

    # Make bare minimum build_models
    self.build_models = {
        'exclude_algos': None  # [ "GLM", "DRF", "GBM", "DeepLearning", "StackedEnsemble"]
    }

    # The values below used to be compared against their defaults with `is` / `is not`, which
    # tests object identity and is unreliable for int and str literals.  Every default passes
    # its own type check, so the checks are simply applied unconditionally.

    # nfolds must be an non-negative integer and not equal to 1:
    assert_is_type(nfolds, int)
    assert nfolds >= 0, "nfolds set to " + str(nfolds) + "; nfolds cannot be negative. Use nfolds >=2 if you want cross-valiated metrics and Stacked Ensembles or use nfolds = 0 to disable."
    assert nfolds != 1, "nfolds set to " + str(nfolds) + "; nfolds = 1 is an invalid value. Use nfolds >=2 if you want cross-valiated metrics and Stacked Ensembles or use nfolds = 0 to disable."
    self.build_control["nfolds"] = nfolds
    self.nfolds = nfolds

    # Pass through to all algorithms
    if balance_classes is True:
        self.build_control["balance_classes"] = balance_classes
        self.balance_classes = balance_classes
    if class_sampling_factors is not None:
        self.build_control["class_sampling_factors"] = class_sampling_factors
        self.class_sampling_factors = class_sampling_factors
    if max_after_balance_size != 5.0:
        assert_is_type(max_after_balance_size, float)
        self.build_control["max_after_balance_size"] = max_after_balance_size
        self.max_after_balance_size = max_after_balance_size

    # If max_runtime_secs is not provided, then it is set to default (3600 secs)
    assert_is_type(max_runtime_secs, int)
    self.max_runtime_secs = max_runtime_secs

    # Add other parameters to build_control if available
    stopping_criteria = self.build_control["stopping_criteria"]
    if max_models is not None:
        assert_is_type(max_models, int)
        stopping_criteria["max_models"] = max_models
    self.max_models = max_models

    assert_is_type(stopping_metric, str)
    stopping_criteria["stopping_metric"] = stopping_metric
    self.stopping_metric = stopping_metric

    if stopping_tolerance is not None:
        assert_is_type(stopping_tolerance, float)
        stopping_criteria["stopping_tolerance"] = stopping_tolerance
    self.stopping_tolerance = stopping_tolerance
    # Misspelled name the attribute was originally published under; kept for existing readers.
    self.stopping_tolerence = stopping_tolerance

    assert_is_type(stopping_rounds, int)
    stopping_criteria["stopping_rounds"] = stopping_rounds
    self.stopping_rounds = stopping_rounds

    if seed is not None:
        assert_is_type(seed, int)
        stopping_criteria["seed"] = seed
    self.seed = seed

    # Set project name if provided. If None, then we set in .train() to "automl_" + training_frame.frame_id
    if project_name is not None:
        assert_is_type(project_name, str)
        self.build_control["project_name"] = project_name
        self.project_name = project_name
    else:
        self.project_name = None

    if exclude_algos is not None:
        assert_is_type(exclude_algos, list)
        for elem in exclude_algos:
            assert_is_type(elem, str)
        self.build_models['exclude_algos'] = exclude_algos

    assert_is_type(keep_cross_validation_predictions, bool)
    self.build_control["keep_cross_validation_predictions"] = keep_cross_validation_predictions

    assert_is_type(keep_cross_validation_models, bool)
    self.build_control["keep_cross_validation_models"] = keep_cross_validation_models

    self._job = None
    self._leader_id = None
    self._leaderboard = None
    # "AUTO" is represented internally as "no explicit sort metric".
    if sort_metric == "AUTO":
        self.sort_metric = None
    else:
        self.sort_metric = sort_metric
def train(self, x=None, y=None, training_frame=None, validation_frame=None, test_frame=None):
    """
    Submit the AutoML build to the backend and wait for the job to finish.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param test_frame: H2OFrame with test data to be scored on in the leaderboard.

    :returns: None. The job is stored on the instance and the results are loaded through
        ``self._fetch()`` once polling has finished.
    :raises ValueError: if ``training_frame`` or ``y`` is missing.
    :raises H2OValueError: if a column given in ``x`` or ``y`` does not exist in the training frame.

    :examples:
    >>> # Set up an H2OAutoML object
    >>> aml = H2OAutoML(max_runtime_secs=30)
    >>> # Launch H2OAutoML
    >>> aml.train(y=y, training_frame=training_frame)
    """
    # Minimal required arguments are training_frame and y (response).  Both are checked before
    # the frame is touched, so that a missing frame is reported as such rather than as an
    # AttributeError on None.
    if training_frame is None:
        raise ValueError('The training frame is not set!')
    if y is None:
        raise ValueError(
            'The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.'
        )
    assert_is_type(training_frame, H2OFrame)
    ncols = training_frame.ncols
    names = training_frame.names

    assert_is_type(y, int, str)
    if is_type(y, int):
        if not (-ncols <= y < ncols):
            raise H2OValueError("Column %d does not exist in the training frame" % y)
        y = names[y]
    else:
        if y not in names:
            raise H2OValueError("Column %s does not exist in the training frame" % y)
    input_spec = {
        'response_column': y,
        'training_frame': training_frame.frame_id,
    }

    # Each optional frame is type-checked itself (the checks used to re-test training_frame).
    if validation_frame is not None:
        assert_is_type(validation_frame, H2OFrame)
        input_spec['validation_frame'] = validation_frame.frame_id

    if test_frame is not None:
        assert_is_type(test_frame, H2OFrame)
        input_spec['test_frame'] = test_frame.frame_id

    if x is not None:
        assert_is_type(x, list)
        xset = set()
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)
        # Everything that is neither the response nor a predictor is ignored by the backend.
        input_spec['ignored_columns'] = list(set(names) - {y} - xset)

    automl_build_params = dict(input_spec=input_spec)

    # NOTE: if the user hasn't specified some block of parameters don't send them!
    # This lets the back end use the defaults.
    automl_build_params['build_control'] = self.build_control
    # NOTE(review): self.build_models is not part of the request here - confirm this is intended.

    resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
    if 'job' not in resp:
        print("Exception from the back end: ")
        print(resp)
        return

    self._job = H2OJob(resp['job'], "AutoML")
    self._automl_key = self._job.dest_key
    self._job.poll()
    self._fetch()
    if self.project_name is None:
        self.project_name = "automl_" + training_frame.frame_id
def _train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
           weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
           model_id=None, verbose=False, extend_parms_fn=None):
    """
    Validate the training arguments, assemble the builder parameters and start the model build.

    :param x: predictor column(s): a name, an index, or a list/set of names and indices.
    :param y: response column name or index; ignored for unsupervised algorithms.
    :param training_frame: frame to train on; when omitted, ``self.training_frame`` is used if set.
    :param offset_column: optional offset column name or index.
    :param fold_column: optional fold column name or index.
    :param weights_column: optional weights column name or index.
    :param validation_frame: optional validation frame.
    :param max_runtime_secs: optional numeric value stored as the ``max_runtime_secs`` parameter.
    :param ignored_columns: columns to exclude; cannot be combined with ``x``.
    :param model_id: optional id overriding the ``model_id`` parameter.
    :param verbose: print scoring history while polling; only accepted for drf, gbm,
        deeplearning and xgboost.
    :param extend_parms_fn: optional function called with the parameter dict before it is sent.

    :returns: None. With ``self._future`` set the job is only stored on the instance; otherwise
        the job is polled and the finished model is resolved into this estimator.
    :raises H2OValueError: for unknown columns, for ``x`` combined with ``ignored_columns``, for
        ``verbose`` on an unsupported algorithm, or for an ``r2`` stopping metric.
    :raises ValueError: if ``y`` is given for an autoencoder or missing for a supervised model.
    """
    has_default_training_frame = hasattr(self, 'training_frame') and self.training_frame is not None
    training_frame = H2OFrame._validate(training_frame, 'training_frame',
                                        required=self._requires_training_frame() and not has_default_training_frame)
    validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)
    assert_is_type(extend_parms_fn, None, FunctionType)

    # A frame passed to this call takes precedence over the one stored on the estimator.
    override_default_training_frame = training_frame is not None
    if not override_default_training_frame:
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
        training_frame = self.training_frame if has_default_training_frame else None

    algo = self.algo
    if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
        raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
    parms = self._parms.copy()
    if algo == "pca" and "k" not in parms.keys():
        parms["k"] = 1
    if "__class__" in parms:  # FIXME: hackt for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest", "generic"})

    names = training_frame.names if training_frame is not None else []
    ncols = training_frame.ncols if training_frame is not None else 0
    types = training_frame.types if training_frame is not None else {}

    if is_supervised:
        if y is None: y = "response"
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if types[y] == "enum" else "regressor"
    else:
        # If `y` is provided for an unsupervised model we'll simply ignore
        # it. This way an unsupervised model can be used as a step in
        # sklearn's pipeline.
        y = None

    if override_default_training_frame:
        assert_is_type(y, str, None)
        ignored_columns_set = set()
        if ignored_columns is None and "ignored_columns" in parms:
            ignored_columns = parms['ignored_columns']
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s not in the training frame" % ic)
                    ignored_columns_set.add(ic)
        if x is None:
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)
        self._check_and_save_parm(parms, "offset_column", offset_column)
        self._check_and_save_parm(parms, "weights_column", weights_column)
        self._check_and_save_parm(parms, "fold_column", fold_column)

    if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs

    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    # Step 2
    is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
    is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest"}
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if not is_unsupervised and y is None and self.algo not in ["generic"]:
        raise ValueError("Missing response")

    # Step 3
    if override_default_training_frame:
        parms["training_frame"] = training_frame
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]

    if validation_frame is not None: parms["validation_frame"] = validation_frame
    if is_type(y, int): y = names[y]
    if y is not None: parms["response_column"] = y
    if not isinstance(x, (list, tuple)): x = [x]
    # An empty predictor list is legal at this point (e.g. x=[] or a frame holding only the
    # response); indexing x[0] unguarded raised an IndexError for it.
    if x and is_type(x[0], int):
        x = [names[i] for i in x]
    if override_default_training_frame:
        ignored_columns = list(set(names) - set(x + [y, offset, folds, weights] + self._additional_used_columns(parms)))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
    parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                             else [quoted(col) for col in parms["interactions"]])
    parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None
                                  else [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

    # internal hook allowing subclasses to extend train parms
    if extend_parms_fn is not None:
        extend_parms_fn(parms)

    parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
    # A stopping_metric that is present but None must not reach the `in` test (TypeError).
    stopping_metric = parms.get("stopping_metric")
    if stopping_metric is not None and "r2" in stopping_metric:
        raise H2OValueError("r2 cannot be used as an early stopping_metric yet. "
                            "Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
    rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

    model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
    model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

    if self._future:
        # Asynchronous mode: keep the job around and let the caller wait for it later.
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll(poll_updates=self._print_model_scoring_history if verbose else None)
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def network_test(self):
    """Request the backend network test and display the table it returns."""
    response = h2o.api("GET /3/NetworkTest")
    table = response["table"]
    table.show()
def _list_extensions(self, endpoint):
    """
    Return the ``name`` of every capability listed by ``GET /3/<endpoint>``.

    :param endpoint: path suffix appended to ``/3/`` to form the request.
    :returns: list with the ``name`` entry of each item under the response's ``capabilities`` key.
    """
    capabilities = h2o.api("GET /3/" + endpoint)["capabilities"]
    return [capability["name"] for capability in capabilities]
def train(self, x=None, y=None, training_frame=None, fold_column=None,
          weights_column=None, validation_frame=None, leaderboard_frame=None, blending_frame=None):
    """
    Begins an AutoML task, a background task that automatically builds a number of models
    with various algorithms and tracks their performance in a leaderboard. At any point
    in the process you may use H2O's performance or prediction functions on the resulting
    models.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param fold_column: The name or index of the column in training_frame that holds per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds per-row weights.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold_column or weights_column).
    :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets
        nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used
        for early stopping of individual models and early stopping of the grid searches.  By default and
        when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.
    :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard.  This is optional and
        if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard
        rankings instead.
    :param blending_frame: H2OFrame used to train the metalearning algorithm in Stacked Ensembles (instead of relying on cross-validated predicted values).
        This is optional, but when provided, it is also recommended to disable cross validation
        by setting `nfolds=0` and to provide a leaderboard frame for scoring purposes.

    :returns: None. The job is stored on the instance and the results are loaded through
        ``self._fetch()`` once polling has finished.
    :raises H2OValueError: if ``y`` is missing or a referenced column does not exist in the training frame.

    :examples:
    >>> # Set up an H2OAutoML object
    >>> aml = H2OAutoML(max_runtime_secs=30)
    >>> # Launch an AutoML run
    >>> aml.train(y=y, training_frame=train)
    """
    training_frame = H2OFrame._validate(training_frame, 'training_frame', required=True)
    ncols = training_frame.ncols
    names = training_frame.names

    def _name_at(index):
        """Translate an integer column index (negative values allowed) into the column name."""
        if not (-ncols <= index < ncols):
            raise H2OValueError("Column %d does not exist in the training frame" % index)
        return names[index]

    # Minimal required arguments are training_frame and y (response)
    if y is None:
        raise H2OValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')
    assert_is_type(y, int, str)
    if is_type(y, int):
        y = _name_at(y)
    elif y not in names:
        raise H2OValueError("Column %s does not exist in the training frame" % y)
    input_spec = {
        'response_column': y,
        'training_frame': training_frame.frame_id,
    }

    # fold/weights columns may be given by index; they are resolved to names here so that the
    # request carries a column name and the ignored-columns bookkeeping below can match them.
    if fold_column is not None:
        assert_is_type(fold_column, int, str)
        if is_type(fold_column, int):
            fold_column = _name_at(fold_column)
        input_spec['fold_column'] = fold_column

    if weights_column is not None:
        assert_is_type(weights_column, int, str)
        if is_type(weights_column, int):
            weights_column = _name_at(weights_column)
        input_spec['weights_column'] = weights_column

    if validation_frame is not None:
        validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
        input_spec['validation_frame'] = validation_frame.frame_id

    if leaderboard_frame is not None:
        leaderboard_frame = H2OFrame._validate(leaderboard_frame, 'leaderboard_frame')
        input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

    if blending_frame is not None:
        blending_frame = H2OFrame._validate(blending_frame, 'blending_frame')
        input_spec['blending_frame'] = blending_frame.frame_id

    if self.sort_metric is not None:
        assert_is_type(self.sort_metric, str)
        sort_metric = self.sort_metric.lower()
        # Changed the API to use "deviance" to be consistent with stopping_metric values
        # TO DO: let's change the backend to use "deviance" since we use the term "deviance"
        # After that we can take this `if` statement out
        if sort_metric == "deviance":
            sort_metric = "mean_residual_deviance"
        input_spec['sort_metric'] = sort_metric

    if x is not None:
        assert_is_type(x, list)
        xset = set()
        for xi in x:
            if is_type(xi, int):
                xset.add(_name_at(xi))
            else:
                if xi not in names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)
        # Everything that is not the response, a predictor, or a fold/weights column is ignored.
        ignored_columns = set(names) - {y} - xset
        if fold_column is not None:
            ignored_columns.discard(fold_column)
        if weights_column is not None:
            ignored_columns.discard(weights_column)
        input_spec['ignored_columns'] = list(ignored_columns)

    automl_build_params = dict(input_spec=input_spec)

    # NOTE: if the user hasn't specified some block of parameters don't send them!
    # This lets the back end use the defaults.
    automl_build_params['build_control'] = self.build_control
    automl_build_params['build_models'] = self.build_models

    resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
    if 'job' not in resp:
        print("Exception from the back end: ")
        print(resp)
        return

    # Adopt the project name chosen by the backend when none was set on this object.
    if not self.project_name:
        self.build_control['project_name'] = self.project_name = resp['build_control']['project_name']

    self._job = H2OJob(resp['job'], "AutoML")
    poll_updates = ft.partial(self._poll_training_updates, verbosity=self._verbosity, state={})
    try:
        self._job.poll(poll_updates=poll_updates)
    finally:
        # Report one last update even when polling was interrupted.
        poll_updates(self._job, 1)

    self._fetch()
def test_zipped_rf_model():
    """
    Test the correctness of the "zipped" model format.

    This test will create a random dataset, split into training/testing part, train a DRF model on it,
    download the model's data, score the model remotely and fetch the predictions, score the model locally by
    running the genmodel jar, and finally compare the prediction results.

    The same procedure is repeated for GBM models and for binomial, multinomial and regression
    problems; the POJO of each model is scored as well unless it exceeds the size limit below.
    """
    genmodel_jar = os.path.abspath("../../../h2o-genmodel/build/libs/h2o-genmodel-all.jar")
    assert os.path.exists(genmodel_jar), "Cannot find " + genmodel_jar

    if sys.platform == "win32":
        target_dir = tempfile.mkdtemp()
    else:
        target_dir = os.path.expanduser("~/Downloads/")

    report = []
    for estimator in [H2ORandomForestEstimator, H2OGradientBoostingEstimator]:
        print(colorama.Fore.LIGHTYELLOW_EX + "\n#================================================")
        print("#  Estimator: " + estimator.__name__)
        print("#================================================\n" + colorama.Fore.RESET)
        estimator_name = "GBM" if estimator == H2OGradientBoostingEstimator else "DRF"
        for problem in ["binomial", "multinomial", "regression"]:
            print("========================")
            print("%s problem" % problem.capitalize())
            print("========================")
            df = random_dataset(problem, verbose=False)
            print("Created dataset with %d rows x %d columns" % (df.nrow, df.ncol))
            test = df[:NTESTROWS, :]
            train = df[NTESTROWS:, :]
            test2 = test.rbind(test)

            time0 = time.time()
            print("\n\nTraining Random Forest model...")
            model = estimator(ntrees=NTREES, max_depth=DEPTH)
            model.train(training_frame=train)
            print(model.summary())
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nSaving the model...")
            time0 = time.time()
            model_file = h2o.api("GET /3/Models/%s/data" % model.model_id, save_to=target_dir)
            # Check for the file before it is stat-ed, so a missing download fails the assertion.
            assert os.path.exists(model_file)
            print("    => %s  (%d bytes)" % (model_file, os.stat(model_file).st_size))
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nDownloading POJO...")
            time0 = time.time()
            pojo_file = h2o.download_pojo(model, target_dir, get_jar=False)
            pojo_size = os.stat(pojo_file).st_size
            pojo_name = os.path.splitext(os.path.basename(pojo_file))[0]
            print("    => %s  (%d bytes)" % (pojo_file, pojo_size))
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nDownloading the test datasets for local use: ", end="")
            time0 = time.time()
            test_file = os.path.join(target_dir, "test_%s.csv" % test.frame_id)
            test2_file = os.path.join(target_dir, "test2_%s.csv" % test2.frame_id)
            print(test_file)
            h2o.download_csv(test, test_file)
            h2o.download_csv(test2, test2_file)
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nScoring the model remotely and downloading to file ", end="")
            times = [time.time()]
            h2o_pred_file = os.path.join(target_dir, "predR_%s.csv" % test.frame_id)
            h2o_pred_file2 = os.path.join(target_dir, "predR_%s.csv" % test2.frame_id)
            print(h2o_pred_file)
            for testframe, outfile in [(test, h2o_pred_file), (test2, h2o_pred_file2)]:
                predictions = model.predict(testframe)
                h2o.download_csv(predictions, outfile)
                times.append(time.time())
            # The headline figure is (2nd run - 1st run), i.e. the extra time of the doubled frame.
            print("Time taken = %.3fs   (1st run: %.3f, 2nd run: %.3f)" %
                  (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
            report.append((estimator_name, problem, "Server", times[1] - times[0], times[2] - times[1]))

            print("\nScoring the model locally and saving to file ", end="")
            times = [time.time()]
            local_pred_file = os.path.join(target_dir, "predL_%s.csv" % test.frame_id)
            local_pred_file2 = os.path.join(target_dir, "predL_%s.csv" % test2.frame_id)
            print(local_pred_file)
            for inpfile, outfile in [(test_file, local_pred_file), (test2_file, local_pred_file2)]:
                load_csv(inpfile)
                ret = subprocess.call(["java", "-cp", genmodel_jar,
                                       "-ea", "-Xmx12g", "-XX:ReservedCodeCacheSize=256m",
                                       "hex.genmodel.tools.PredictCsv",
                                       "--input", inpfile, "--output", outfile, "--model", model_file, "--decimal"])
                assert ret == 0, "GenModel finished with return code %d" % ret
                times.append(time.time())
            print("Time taken = %.3fs   (1st run: %.3f, 2nd run: %.3f)" %
                  (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
            report.append((estimator_name, problem, "Zipped", times[1] - times[0], times[2] - times[1]))

            # Reset for every model: without this the comparison below hit a NameError when the
            # first POJO was too large, or silently reused the file of a previous iteration.
            pojo_pred_file = None
            if pojo_size <= 1000 << 20:  # 1000 Mb
                time0 = time.time()
                print("\nCompiling Java Pojo")
                javac_cmd = ["javac", "-cp", genmodel_jar, "-J-Xmx12g", pojo_file]
                subprocess.check_call(javac_cmd)
                print("Time taken = %.3fs" % (time.time() - time0))

                pojo_pred_file = os.path.join(target_dir, "predP_%s.csv" % test.frame_id)
                pojo_pred_file2 = os.path.join(target_dir, "predP_%s.csv" % test2.frame_id)
                print("Scoring POJO and saving to file %s" % pojo_pred_file)
                times = [time.time()]
                cp_sep = ";" if sys.platform == "win32" else ":"
                for inpfile, outfile in [(test_file, pojo_pred_file), (test2_file, pojo_pred_file2)]:
                    load_csv(inpfile)
                    java_cmd = ["java", "-cp", cp_sep.join([genmodel_jar, target_dir]),
                                "-ea", "-Xmx12g", "-XX:ReservedCodeCacheSize=256m", "-XX:MaxPermSize=256m",
                                "hex.genmodel.tools.PredictCsv",
                                "--pojo", pojo_name, "--input", inpfile, "--output", outfile, "--decimal"]
                    ret = subprocess.call(java_cmd)
                    assert ret == 0, "GenModel finished with return code %d" % ret
                    times.append(time.time())
                print("Time taken = %.3fs   (1st run: %.3f, 2nd run: %.3f)" %
                      (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
                report.append((estimator_name, problem, "POJO", times[1] - times[0], times[2] - times[1]))

            print("\nChecking whether the predictions coincide...")
            time0 = time.time()
            local_pred = load_csv(local_pred_file)
            server_pred = load_csv(h2o_pred_file)
            # When the POJO was skipped the local predictions stand in for it.
            pojo_pred = load_csv(pojo_pred_file) if pojo_pred_file else local_pred
            assert len(local_pred) == len(server_pred) == len(pojo_pred) == test.nrow, \
                "Number of rows in prediction files do not match: %d vs %d vs %d vs %d" % \
                (len(local_pred), len(server_pred), len(pojo_pred), test.nrow)
            for i in range(test.nrow):
                lpred = local_pred[i]
                rpred = server_pred[i]
                ppred = pojo_pred[i]
                assert type(lpred) == type(rpred) == type(ppred), \
                    "Types of predictions do not match: %r / %r / %r" % (lpred, rpred, ppred)
                if isinstance(lpred, float):
                    same = abs(lpred - rpred) + abs(lpred - ppred) < 1e-8
                else:
                    same = lpred == rpred == ppred
                assert same, \
                    "Predictions are different for row %d: local=%r, pojo=%r, bomo=%r" % (i + 1, lpred, ppred, rpred)
            print("Time taken = %.3fs" % (time.time() - time0))
            print(colorama.Fore.LIGHTGREEN_EX + "\nPredictions match!\n" + colorama.Fore.RESET)

    print(colorama.Fore.LIGHTYELLOW_EX + "\n\n#================================================")
    print("#  Timing report")
    print("#================================================\n" + colorama.Fore.RESET)
    print(tabulate.tabulate(report,
                            headers=["Model", "Problem type", "Scorer", "10000 rows", "20000 rows"],
                            floatfmt=".3f"), end="\n\n\n")
def __init__(self,
             nfolds=5,
             balance_classes=False,
             class_sampling_factors=None,
             max_after_balance_size=5.0,
             max_runtime_secs=None,
             max_runtime_secs_per_model=None,
             max_models=None,
             stopping_metric="AUTO",
             stopping_tolerance=None,
             stopping_rounds=3,
             seed=None,
             project_name=None,
             exclude_algos=None,
             include_algos=None,
             exploitation_ratio=-1,
             modeling_plan=None,
             preprocessing=None,
             monotone_constraints=None,
             keep_cross_validation_predictions=False,
             keep_cross_validation_models=False,
             keep_cross_validation_fold_assignment=False,
             sort_metric="AUTO",
             export_checkpoints_dir=None,
             verbosity="warn",
             **kwargs):
    """
    Create a new H2OAutoML instance.

    :param int nfolds: Number of folds for k-fold cross-validation.
        Use ``0`` to disable cross-validation; this will also disable Stacked Ensemble (thus decreasing the overall model performance).
        Defaults to ``5``.
    :param bool balance_classes: Specify whether to oversample the minority classes to balance the class distribution.
        This option can increase the data frame size. This option is only applicable for classification.
        If the oversampled size of the dataset exceeds the maximum size calculated using the ``max_after_balance_size``
        parameter, then the majority classes will be undersampled to satisfy the size limit.
        Defaults to ``False``.
    :param class_sampling_factors: Desired over/under-sampling ratios per class (in lexicographic order).
        If not specified, sampling factors will be automatically computed to obtain class balance during training.
        Requires ``balance_classes`` set to ``True``.
    :param float max_after_balance_size: Maximum relative size of the training data after balancing class counts
        (can be less than 1.0). Requires ``balance_classes``.
        Defaults to ``5.0``.
    :param int max_runtime_secs: Specify the maximum time that the AutoML process will run for.
        If neither ``max_runtime_secs`` nor ``max_models`` are specified by the user, then ``max_runtime_secs``
        dynamically defaults to 3600 seconds (1 hour). Otherwise, defaults to ``0`` (no limit).
    :param int max_runtime_secs_per_model: Controls the max time the AutoML run will dedicate to each individual model.
        Defaults to ``0`` (disabled: no time limit).
    :param int max_models: Specify the maximum number of models to build in an AutoML run, excluding the
        Stacked Ensemble models.
        Defaults to ``None`` (disabled: no limitation).
    :param str stopping_metric: Specifies the metric to use for early stopping.
        The available options are:
        ``"AUTO"`` (This defaults to ``"logloss"`` for classification, ``"deviance"`` for regression),
        ``"deviance"``, ``"logloss"``, ``"mse"``, ``"rmse"``, ``"mae"``, ``"rmsle"``, ``"auc"``, ``"aucpr"``,
        ``"lift_top_group"``, ``"misclassification"``, ``"mean_per_class_error"``, ``"r2"``.
        Defaults to ``"AUTO"``.
    :param float stopping_tolerance: Specify the relative tolerance for the metric-based stopping criterion
        to stop a grid search and the training of individual models within the AutoML run.
        Defaults to ``0.001`` if the dataset is at least 1 million rows;
        otherwise it defaults to a value determined by the size of the dataset and the non-NA-rate,
        in which case the value is computed as 1/sqrt(nrows * non-NA-rate).
    :param int stopping_rounds: Stop training new models in the AutoML run when the option selected for
        ``stopping_metric`` doesn't improve for the specified number of models, based on a simple moving average.
        To disable this feature, set it to ``0``.
        Defaults to ``3`` and must be a non-negative integer.
    :param int seed: Set a seed for reproducibility.
        AutoML can only guarantee reproducibility if ``max_models`` or early stopping is used because
        ``max_runtime_secs`` is resource limited, meaning that if the resources are not the same between runs,
        AutoML may be able to train more models on one run vs another.
        In addition, H2O Deep Learning models are not reproducible by default for performance reasons,
        so ``exclude_algos`` must contain ``DeepLearning``.
        Defaults to ``None``.
    :param str project_name: Character string to identify an AutoML project.
        Defaults to ``None``, which means a project name will be auto-generated based on the training frame ID.
        More models can be trained on an existing AutoML project by specifying the same project name in multiple
        calls to the AutoML function (as long as the same training frame, or a sample, is used in subsequent runs).
    :param exclude_algos: List the algorithms to skip during the model-building phase.
        The full list of options is:

            - ``"DRF"`` (Random Forest and Extremely-Randomized Trees)
            - ``"GLM"``
            - ``"XGBoost"``
            - ``"GBM"``
            - ``"DeepLearning"``
            - ``"StackedEnsemble"``

        Defaults to ``None``, which means that all appropriate H2O algorithms will be used, if the search stopping
        criteria allow. Optional.
        Usage example::

            exclude_algos = ["GLM", "DeepLearning", "DRF"]

    :param include_algos: List the algorithms to restrict to during the model-building phase.
        This can't be used in combination with ``exclude_algos`` param.
        Defaults to ``None``, which means that all appropriate H2O algorithms will be used, if the search stopping
        criteria allow. Optional.
        Usage example::

            include_algos = ["GLM", "DeepLearning", "DRF"]

    :param exploitation_ratio: The budget ratio (between 0 and 1) dedicated to the exploitation (vs exploration) phase.
        By default, the exploitation phase is ``0`` (disabled) as this is still experimental;
        to activate it, it is recommended to try a ratio around 0.1.
        Note that the current exploitation phase only tries to fine-tune the best XGBoost and the best GBM found
        during exploration.
    :param modeling_plan: List of modeling steps to be used by the AutoML engine
        (they may not all get executed, depending on other constraints).
        Defaults to ``None`` (Expert usage only).
    :param preprocessing: List of preprocessing steps to run. Only ``["target_encoding"]`` is currently supported.
        Experimental.
    :param monotone_constraints: A mapping that represents monotonic constraints.
        Use ``+1`` to enforce an increasing constraint and ``-1`` to specify a decreasing constraint.
    :param keep_cross_validation_predictions: Whether to keep the predictions of the cross-validation models.
        This needs to be set to ``True`` if running the same AutoML object for repeated runs because CV predictions
        are required to build additional Stacked Ensemble models in AutoML.
        Defaults to ``False``.
    :param keep_cross_validation_models: Whether to keep the cross-validated models.
        Keeping cross-validation models may consume significantly more memory in the H2O cluster.
        Defaults to ``False``.
    :param keep_cross_validation_fold_assignment: Whether to keep fold assignments in the models.
        Deleting them will save memory in the H2O cluster.
        Defaults to ``False``.
    :param sort_metric: Metric to sort the leaderboard by at the end of an AutoML run.
        For binomial classification choose between ``"auc"``, ``"aucpr"``, ``"logloss"``,
        ``"mean_per_class_error"``, ``"rmse"``, ``"mse"``.
        For multinomial classification choose between ``"mean_per_class_error"``, ``"logloss"``, ``"rmse"``, ``"mse"``.
        For regression choose between ``"deviance"``, ``"rmse"``, ``"mse"``, ``"mae"``, ``"rmsle"``.
        Defaults to ``"AUTO"`` (This translates to ``"auc"`` for binomial classification,
        ``"mean_per_class_error"`` for multinomial classification, ``"deviance"`` for regression).
    :param export_checkpoints_dir: Path to a directory where every model will be stored in binary form.
    :param verbosity: Verbosity of the backend messages printed during training.
        Available options are ``None`` (live log disabled), ``"debug"``, ``"info"``, ``"warn"`` or ``"error"``.
        Defaults to ``"warn"``.
    :param kwargs: Only the hidden ``algo_parameters`` keyword is accepted; any other keyword raises ``TypeError``.
    """
    # Validate kwargs early, extracting the hidden parameters.
    algo_parameters = {}
    for k in kwargs:
        if k == 'algo_parameters':
            # Work on a copy: ``monotone_constraints`` is merged into this dict below,
            # and the caller's own dict must not be modified as a side effect.
            algo_parameters = dict(kwargs[k] or {})
        else:
            raise TypeError("H2OAutoML got an unexpected keyword argument '%s'" % k)

    # Check if the H2O jar contains AutoML. A failure is only reported on stdout
    # (best effort); construction continues regardless.
    try:
        h2o.api("GET /3/Metadata/schemas/AutoMLV99")
    except h2o.exceptions.H2OResponseError as e:
        print(e)
        print("*******************************************************************\n"
              "*Please verify that your H2O jar has the proper AutoML extensions.*\n"
              "*******************************************************************\n"
              "\nVerbose Error Message:")

    self._job = None
    self._leader_id = None
    self._leaderboard = None
    self._verbosity = verbosity
    self._event_log = None
    self._training_info = None
    self._state_json = None
    self._build_resp = None  # contains all the actual parameters used on backend

    self.__frozen = False
    self.__input = dict()  # contains all the input params as entered by the user

    # Make bare minimum params containers. They are created before the public
    # attributes below are assigned, in the same order as before.
    self.build_control = dict()
    self.build_models = dict()
    self.input_spec = dict()

    self.project_name = project_name
    self.nfolds = nfolds
    self.balance_classes = balance_classes
    self.class_sampling_factors = class_sampling_factors
    self.max_after_balance_size = max_after_balance_size
    self.keep_cross_validation_models = keep_cross_validation_models
    self.keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment
    self.keep_cross_validation_predictions = keep_cross_validation_predictions
    self.export_checkpoints_dir = export_checkpoints_dir

    self.max_runtime_secs = max_runtime_secs
    self.max_runtime_secs_per_model = max_runtime_secs_per_model
    self.max_models = max_models
    self.stopping_metric = stopping_metric
    self.stopping_tolerance = stopping_tolerance
    self.stopping_rounds = stopping_rounds
    self.seed = seed
    self.exclude_algos = exclude_algos
    self.include_algos = include_algos
    self.exploitation_ratio = exploitation_ratio
    self.modeling_plan = modeling_plan
    self.preprocessing = preprocessing
    if monotone_constraints is not None:
        algo_parameters['monotone_constraints'] = monotone_constraints
    self._algo_parameters = algo_parameters

    self.sort_metric = sort_metric
def transform(self, frame, blending=None, inflection_point=None, smoothing=None, noise=None, as_training=False, **kwargs):
    """
    Encode `te_columns` of a frame using the encoding maps produced by the `train()` call.

    :param H2OFrame frame: the frame on which to apply the target encoding transformations.
    :param boolean blending: If provided, this overrides the `blending` parameter on the model.
    :param float inflection_point: If provided, this overrides the `inflection_point` parameter on the model.
    :param float smoothing: If provided, this overrides the `smoothing` parameter on the model.
    :param float noise: If provided, this overrides the amount of random noise added to the target encoding defined on the model, this helps prevent overfitting.
    :param boolean as_training: Must be set to True when encoding the training frame. Defaults to False.
    :param kwargs: only the deprecated ``seed`` and ``data_leakage_handling`` keywords are tolerated
        (with a deprecation warning); any other keyword raises ``TypeError``.
    :returns: the frame fetched with ``h2o.get_frame`` under the name returned by the backend.

    :example:

    >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
    >>> predictors = ["home.dest", "cabin", "embarked"]
    >>> response = "survived"
    >>> titanic[response] = titanic[response].asfactor()
    >>> fold_col = "kfold_column"
    >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
    >>> titanic_te = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out",
    ...                                        inflection_point=35,
    ...                                        smoothing=25,
    ...                                        blending=True,
    ...                                        seed=1234)
    >>> titanic_te.train(x=predictors,
    ...                  y=response,
    ...                  training_frame=titanic)
    >>> transformed = titanic_te.transform(frame=titanic)
    """
    deprecated_options = ('seed', 'data_leakage_handling')
    for option in kwargs:
        # Reject anything that is not a known deprecated keyword.
        if option not in deprecated_options:
            raise TypeError("transform() got an unexpected keyword argument '%s'" % option)
        warnings.warn(
            "`%s` is deprecated in `transform` method and will be ignored. "
            "Instead, please ensure that it was set before training on the H2OTargetEncoderEstimator model." % option,
            H2ODeprecationWarning
        )

    if 'data_leakage_handling' in kwargs:
        leakage_mode = kwargs['data_leakage_handling']
        assert_is_type(leakage_mode, None, Enum("leave_one_out", "k_fold", "none"))
        leakage_requested = leakage_mode is not None and leakage_mode.lower() != "none"
        if leakage_requested:
            # A real leakage strategy on the deprecated keyword maps to ``as_training=True``.
            warnings.warn(
                "Deprecated `data_leakage_handling=%s` is replaced by `as_training=True`. "
                "Please update your code." % leakage_mode,
                H2ODeprecationWarning
            )
            as_training = True

    request = {
        "model": self.model_id,
        "frame": frame.key,
        # blending is always sent because an unset value cannot be represented in the request
        "blending": self.blending if blending is None else blending,
        "inflection_point": inflection_point,
        "smoothing": smoothing,
        "noise": noise,
        "as_training": as_training,
    }

    response = h2o.api("GET /3/TargetEncoderTransform", data=request)
    return h2o.get_frame(response["name"])
def __init__(self, max_runtime_secs=3600, max_models=None, stopping_metric="AUTO", stopping_tolerance=0.001, stopping_rounds=3, seed=None, project_name=None):
    """
    Create a new H2OAutoML instance and assemble its ``build_control`` request section.

    :param int max_runtime_secs: value stored as the ``max_runtime_secs`` stopping criterion. Defaults to ``3600``.
    :param int max_models: if not ``None``, stored as the ``max_models`` stopping criterion.
    :param str stopping_metric: value stored as the ``stopping_metric`` stopping criterion. Defaults to ``"AUTO"``.
    :param float stopping_tolerance: value stored as the ``stopping_tolerance`` stopping criterion.
        Defaults to ``0.001``.
    :param int stopping_rounds: value stored as the ``stopping_rounds`` stopping criterion. Defaults to ``3``.
    :param int seed: if not ``None``, stored as ``seed`` inside the stopping criteria.
    :param str project_name: if not ``None``, stored as ``project_name`` in ``build_control``.
    """
    # Check if H2O jar contains AutoML. A failure is only reported on stdout
    # (best effort); construction continues regardless.
    try:
        h2o.api("GET /3/Metadata/schemas/AutoMLV99")
    except h2o.exceptions.H2OResponseError as e:
        print(e)
        print("*******************************************************************\n"
              "*Please verify that your H2O jar has the proper AutoML extensions.*\n"
              "*******************************************************************\n"
              "\nVerbose Error Message:")

    # Every argument is validated unconditionally. The previous code compared
    # against the defaults with ``is not`` (identity on int/str/float literals),
    # which is unreliable; since the defaults satisfy the type checks anyway,
    # always validating yields the same outcome for every input.
    assert_is_type(max_runtime_secs, int)
    self.max_runtime_secs = max_runtime_secs

    # Make bare minimum build_control
    self.build_control = {
        'stopping_criteria': {
            'max_runtime_secs': self.max_runtime_secs,
        }
    }
    stopping_criteria = self.build_control["stopping_criteria"]

    # Add other parameters to build_control if available
    if max_models is not None:
        assert_is_type(max_models, int)
        stopping_criteria["max_models"] = max_models
    # Always define the attribute so reading it never raises AttributeError.
    self.max_models = max_models

    assert_is_type(stopping_metric, str)
    stopping_criteria["stopping_metric"] = stopping_metric
    self.stopping_metric = stopping_metric

    assert_is_type(stopping_tolerance, float)
    stopping_criteria["stopping_tolerance"] = stopping_tolerance
    self.stopping_tolerance = stopping_tolerance
    # Misspelled attribute retained for code that already reads it.
    self.stopping_tolerence = stopping_tolerance

    assert_is_type(stopping_rounds, int)
    stopping_criteria["stopping_rounds"] = stopping_rounds
    self.stopping_rounds = stopping_rounds

    if seed is not None:
        assert_is_type(seed, int)
        stopping_criteria["seed"] = seed
    self.seed = seed

    # Set project name if provided. If None, then we set in .train() to "automl_" + training_frame.frame_id
    if project_name is not None:
        assert_is_type(project_name, str)
        self.build_control["project_name"] = project_name
    self.project_name = project_name

    self._job = None
    self._automl_key = None
    self._leader_id = None
    self._leaderboard = None
def _get_params(self):
    """
    Query the backend for this AutoML run.

    :returns: the response of ``GET /99/AutoML/<self._automl_key>`` exactly as returned by ``h2o.api``.
    """
    return h2o.api("GET /99/AutoML/" + self._automl_key)
def train(self, x=None, y=None, training_frame=None, fold_column=None,
          weights_column=None, validation_frame=None, leaderboard_frame=None):
    """
    Begins an AutoML task, a background task that automatically builds a number of models
    with various algorithms and tracks their performance in a leaderboard. At any point
    in the process you may use H2O's performance or prediction functions on the resulting
    models.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param fold_column: The name or index of the column in training_frame that holds per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param leaderboard_frame: H2OFrame with test data to be scored on in the leaderboard.

    :returns: ``None``. The job is stored on this instance and polled until completion; if the
        backend response contains no ``job`` entry, the response is printed and the method returns early.
    :raises ValueError: if ``y`` or ``training_frame`` is not provided.
    :raises H2OValueError: if ``y`` or an element of ``x`` does not refer to a column of the training frame.

    :examples:
    >>> # Set up an H2OAutoML object
    >>> aml = H2OAutoML(max_runtime_secs=30)
    >>> # Launch H2OAutoML
    >>> aml.train(y=y, training_frame=training_frame)
    """
    # Minimal required arguments are training_frame and y (response). Both are checked
    # before the frame is touched so that a missing argument produces a clear ValueError.
    if y is None:
        raise ValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')
    if training_frame is None:
        raise ValueError('The training frame is not set!')
    assert_is_type(training_frame, H2OFrame)
    assert_is_type(y, int, str)

    ncols = training_frame.ncols
    names = training_frame.names

    if is_type(y, int):
        if not (-ncols <= y < ncols):
            raise H2OValueError("Column %d does not exist in the training frame" % y)
        y = names[y]
    elif y not in names:
        raise H2OValueError("Column %s does not exist in the training frame" % y)

    input_spec = {
        'response_column': y,
        'training_frame': training_frame.frame_id,
    }

    if fold_column is not None:
        assert_is_type(fold_column, int, str)
        input_spec['fold_column'] = fold_column

    if weights_column is not None:
        assert_is_type(weights_column, int, str)
        input_spec['weights_column'] = weights_column

    if validation_frame is not None:
        assert_is_type(validation_frame, H2OFrame)
        input_spec['validation_frame'] = validation_frame.frame_id

    if leaderboard_frame is not None:
        assert_is_type(leaderboard_frame, H2OFrame)
        input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

    if x is not None:
        assert_is_type(x, list)
        xset = set()
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)

        # Everything that is neither the response nor a predictor is ignored...
        ignored_columns = set(names) - {y} - xset
        # ...except the fold and weights columns. ``discard`` is used because
        # ``set.remove`` returns None (the old code rebound the set to None) and
        # raises KeyError when the column is already absent from the set.
        for special in (fold_column, weights_column):
            if special is None:
                continue
            if is_type(special, int):
                if -ncols <= special < ncols:
                    ignored_columns.discard(names[special])
            else:
                ignored_columns.discard(special)
        # Sorted only to make the request payload deterministic.
        input_spec['ignored_columns'] = sorted(ignored_columns)

    automl_build_params = dict(input_spec=input_spec)

    # NOTE: if the user hasn't specified some block of parameters don't send them!
    # This lets the back end use the defaults.
    automl_build_params['build_control'] = self.build_control

    resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
    if 'job' not in resp:
        print("Exception from the back end: ")
        print(resp)
        return

    self._job = H2OJob(resp['job'], "AutoML")
    self._automl_key = self._job.dest_key
    self._job.poll()
    self._fetch()
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
          model_id=None, verbose=False):
    """
    Train the H2O model.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
    :param ignored_columns: Column names or indices to exclude from the predictors; cannot be combined with ``x``.
    :param str model_id: If given, overrides the ``model_id`` parameter of the estimator.
    :param bool verbose: Print scoring history to stdout. Defaults to False.

    :raises H2OValueError: if a required training frame is missing, if ``verbose`` is used with an
        unsupported algorithm, if a referenced column does not exist, if both ``x`` and
        ``ignored_columns`` are given, or if ``r2`` is requested as ``stopping_metric``.
    :raises ValueError: if ``y`` is given for an autoencoder or missing for a supervised algorithm.
    """
    assert_is_type(training_frame, None, H2OFrame)
    assert_is_type(validation_frame, None, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)

    if self._requires_training_frame() and training_frame is None:
        # The algorithm name is interpolated with ``%``; it used to be passed as a
        # second positional argument, leaving a literal "%s" in the message.
        raise H2OValueError("Training frame required for %s algorithm, but none was given." % self.algo)

    # True when NO training frame was supplied (the flag used to be named
    # ``training_frame_exists`` although it held the opposite meaning).
    training_frame_missing = training_frame is None
    if training_frame_missing:
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

    algo = self.algo
    if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
        raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
    parms = self._parms.copy()
    if "__class__" in parms:  # FIXME: hack for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not (is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"})

    if not training_frame_missing:
        names = training_frame.names
        ncols = training_frame.ncols

    if is_supervised:
        # NOTE(review): this branch reads ``names``/``ncols``/``training_frame`` and therefore
        # presumes a supervised algorithm always comes with a training frame - confirm.
        if y is None:
            y = "response"
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
    else:
        # If `y` is provided for an unsupervised model we'll simply ignore
        # it. This way an unsupervised model can be used as a step in
        # sklearn's pipeline.
        y = None

    if not training_frame_missing:
        assert_is_type(y, str, None)
        ignored_columns_set = set()
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s not in the training frame" % ic)
                    ignored_columns_set.add(ic)
        if x is None:
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str):
                x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)

    parms["offset_column"] = offset_column
    parms["fold_column"] = fold_column
    parms["weights_column"] = weights_column

    if max_runtime_secs is not None:
        parms["max_runtime_secs"] = max_runtime_secs

    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    # Step 2: response sanity checks (reusing the flags computed above)
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if is_supervised and y is None:
        raise ValueError("Missing response")

    # Step 3: assemble the request parameters
    if not training_frame_missing:
        parms["training_frame"] = training_frame
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]

    if validation_frame is not None:
        parms["validation_frame"] = validation_frame
    if is_type(y, int):
        y = training_frame.names[y]
    if y is not None:
        parms["response_column"] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    # ``x`` may be empty (e.g. no predictor left); guard before peeking at x[0].
    if x and is_type(x[0], int):
        x = [training_frame.names[i] for i in x]
    if not training_frame_missing:
        # Every frame column that is not a predictor, the response or a special column is ignored.
        ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
    parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else
                             [quoted(col) for col in parms["interactions"]])
    parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None else
                                  [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

    parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
    # ``stopping_metric`` may be present but None; only inspect it when it has a value.
    stopping_metric = parms.get("stopping_metric")
    if stopping_metric is not None and "r2" in stopping_metric:
        raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
    rest_ver = parms.pop("_rest_version", 3)

    model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
    model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

    if self._future:
        # Asynchronous mode: keep the job handle and return without waiting.
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll(verbose_model_scoring_history=verbose)
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Submit the grid build request, wait for it and load the resulting models.

    :param x: predictor column(s), given as names or indices into ``tframe``.
    :param y: response column name or index, or ``None``.
    :param tframe: training frame; stored in the request as ``training_frame``.
    :param vframe: validation frame, or ``None``.
    :param kwargs: request parameters; must contain ``offset_column``, ``fold_column`` and
        ``weights_column``. The dict passed in receives the frame/column entries before being filtered.
    :raises ValueError: if the finished grid contains no model.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None:
        kwargs["validation_frame"] = vframe
    if is_type(y, int):
        y = tframe.names[y]
    if y is not None:
        kwargs['response_column'] = y
    if not is_type(x, list, tuple):
        x = [x]
    if is_type(x[0], int):
        x = [tframe.names[idx] for idx in x]

    # Columns that are neither predictors, response nor special columns get ignored.
    special_columns = [y, kwargs["offset_column"], kwargs["fold_column"], kwargs["weights_column"]]
    ignored_columns = list(set(tframe.names) - set(x + special_columns))
    if ignored_columns:
        kwargs["ignored_columns"] = [quoted(col) for col in ignored_columns]
    else:
        kwargs["ignored_columns"] = None

    # Drop unset parameters and replace frames by their ids.
    kwargs = {
        key: (kwargs[key].frame_id if isinstance(kwargs[key], H2OFrame) else kwargs[key])
        for key in kwargs
        if kwargs[key] is not None
    }
    algo = self.model._compute_algo()  # unique to grid search
    if self.grid_id is not None:
        kwargs["grid_id"] = self.grid_id
    rest_ver = kwargs.pop("_rest_version", None)

    grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))

    if self._future:
        self._job = grid
        return

    grid.poll()

    grid_json = h2o.api("GET /99/Grids/%s" % (grid.dest_key))
    failure_lines = []

    if len(grid_json["failure_details"]) > 0:
        print("Errors/Warnings building gridsearch model\n")
        # will raise error if no grid model is returned, store error messages here
        for position, error_message in enumerate(grid_json["failure_details"]):
            failed = grid_json["failed_params"][position]
            if isinstance(failed, dict):
                for h_name in grid_json['hyper_names']:
                    print("Hyper-parameter: {0}, {1}".format(h_name, grid_json['failed_params'][position][h_name]))

            if len(grid_json["failure_stack_traces"]) > position:
                print("failure_details: {0}\nfailure_stack_traces: "
                      "{1}\n".format(error_message, grid_json['failure_stack_traces'][position]))
                failure_lines.append(error_message + '\n')
    failure_messages_stacks = "".join(failure_lines)

    self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]
    for built_model in self.models:
        built_model._estimator_type = self.model._estimator_type

    # sometimes no model is returned due to bad parameter values provided by the user.
    if len(grid_json['model_ids']) == 0:
        if len(failure_messages_stacks) > 0:
            raise ValueError(failure_messages_stacks)
        raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")

    # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
    first_model_json = h2o.api("GET /%d/Models/%s" %
                               (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
    self._resolve_grid(grid.dest_key, grid_json, first_model_json)
def __init__(self,
             nfolds=5,
             balance_classes=False,
             class_sampling_factors=None,
             max_after_balance_size=5.0,
             max_runtime_secs=3600,
             max_runtime_secs_per_model=None,
             max_models=None,
             stopping_metric="AUTO",
             stopping_tolerance=None,
             stopping_rounds=3,
             seed=None,
             project_name=None,
             exclude_algos=None,
             include_algos=None,
             modeling_plan=None,
             keep_cross_validation_predictions=False,
             keep_cross_validation_models=False,
             keep_cross_validation_fold_assignment=False,
             sort_metric="AUTO",
             export_checkpoints_dir=None,
             verbosity="warn"):
    """
    Create a new H2OAutoML instance.

    :param int nfolds: Number of folds for k-fold cross-validation. Defaults to ``5``. Use ``0`` to disable
        cross-validation; this will also disable Stacked Ensemble (thus decreasing the overall model performance).
    :param bool balance_classes: Balance training data class counts via over/under-sampling (for imbalanced data).
        Defaults to ``False``.
    :param class_sampling_factors: Desired over/under-sampling ratios per class (in lexicographic order).
        If not specified, sampling factors will be automatically computed to obtain class balance during training.
        Requires ``balance_classes``.
    :param float max_after_balance_size: Maximum relative size of the training data after balancing class counts
        (can be less than 1.0). Requires ``balance_classes``. Defaults to ``5.0``.
    :param int max_runtime_secs: This argument controls how long the AutoML run will execute.
        Defaults to ``3600`` seconds (1 hour).
    :param int max_runtime_secs_per_model: This argument controls the max time the AutoML run will dedicate to
        each individual model. Defaults to `0` (disabled).
    :param int max_models: Specify the maximum number of models to build in an AutoML run.
        (Does not include the Stacked Ensemble models.)
    :param str stopping_metric: Specifies the metric to use for early stopping. Defaults to ``"AUTO"``.
        The available options are: ``"AUTO"`` (This defaults to ``"logloss"`` for classification, ``"deviance"``
        for regression), ``"deviance"``, ``"logloss"``, ``"mse"``, ``"rmse"``, ``"mae"``, ``"rmsle"``, ``"auc"``,
        ``"lift_top_group"``, ``"misclassification"``, ``"mean_per_class_error"``, ``"r2"``.
    :param float stopping_tolerance: This option specifies the relative tolerance for the metric-based stopping
        to stop the AutoML run if the improvement is less than this value. This value defaults to ``0.001``
        if the dataset is at least 1 million rows; otherwise it defaults to a value determined by the size of the
        dataset and the non-NA-rate. In that case, the value is computed as 1/sqrt(nrows * non-NA-rate).
    :param int stopping_rounds: This argument stops training new models in the AutoML run when the option selected
        for stopping_metric doesn't improve for the specified number of models, based on a simple moving average.
        To disable this feature, set it to ``0``. Defaults to ``3`` and must be a non-negative integer.
    :param int seed: Set a seed for reproducibility. AutoML can only guarantee reproducibility if ``max_models`` or
        early stopping is used because ``max_runtime_secs`` is resource limited, meaning that if the resources are
        not the same between runs, AutoML may be able to train more models on one run vs another.
        Defaults to ``None``.
    :param str project_name: Character string to identify an AutoML project. Defaults to ``None``, which means
        a project name will be auto-generated based on the training frame ID. More models can be trained on an
        existing AutoML project by specifying the same project name in multiple calls to the AutoML function
        (as long as the same training frame is used in subsequent runs).
    :param exclude_algos: List of character strings naming the algorithms to skip during the model-building phase.
        An example use is ``exclude_algos = ["GLM", "DeepLearning", "DRF"]``, and the full list of options is:
        ``"DRF"`` (Random Forest and Extremely-Randomized Trees), ``"GLM"``, ``"XGBoost"``, ``"GBM"``,
        ``"DeepLearning"`` and ``"StackedEnsemble"``. Defaults to ``None``, which means that all appropriate
        H2O algorithms will be used, if the search stopping criteria allow. Optional.
    :param include_algos: List of character strings naming the algorithms to restrict to during the model-building
        phase. This can't be used in combination with `exclude_algos` param. Defaults to ``None``, which means
        that all appropriate H2O algorithms will be used, if the search stopping criteria allow. Optional.
    :param modeling_plan: Optional list of step definitions. Each entry is a name string, a dict with a
        ``name`` key plus an optional ``alias`` or ``steps`` key, or a tuple ``(name,)`` / ``(name, alias)`` /
        ``(name, [step ids])``.
    :param keep_cross_validation_predictions: Whether to keep the predictions of the cross-validation predictions.
        This needs to be set to ``True`` if running the same AutoML object for repeated runs because CV predictions
        are required to build additional Stacked Ensemble models in AutoML. This option defaults to ``False``.
    :param keep_cross_validation_models: Whether to keep the cross-validated models. Keeping cross-validation
        models may consume significantly more memory in the H2O cluster. Defaults to ``False``.
    :param keep_cross_validation_fold_assignment: Whether to keep fold assignments in the models. Deleting them
        will save memory in the H2O cluster. This option defaults to ``False``.
    :param sort_metric: Metric to sort the leaderboard by. Defaults to ``"AUTO"`` (This defaults to ``auc`` for
        binomial classification, ``mean_per_class_error`` for multinomial classification, ``deviance`` for
        regression). For binomial classification choose between ``auc``, ``"logloss"``,
        ``"mean_per_class_error"``, ``"rmse"``, ``"mse"``. For regression choose between ``"deviance"``,
        ``"rmse"``, ``"mse"``, ``"mae"``, ``"rmsle"``. For multinomial classification choose between
        ``"mean_per_class_error"``, ``"logloss"``, ``"rmse"``, ``"mse"``.
    :param export_checkpoints_dir: Path to a directory where every model will be stored in binary form.
    :param verbosity: Verbosity of the backend messages printed during training.
        Available options are None (live log disabled), 'debug', 'info' or 'warn'. Defaults to 'warn'.
    """
    # Check if H2O jar contains AutoML. A failure is only reported, not raised.
    try:
        h2o.api("GET /3/Metadata/schemas/AutoMLV99")
    except h2o.exceptions.H2OResponseError as e:
        print(e)
        print("*******************************************************************\n"
              "*Please verify that your H2O jar has the proper AutoML extensions.*\n"
              "*******************************************************************\n"
              "\nVerbose Error Message:")

    # Make bare minimum build_control (an invalid max_runtime_secs is caught below)
    self.build_control = {
        'stopping_criteria': {
            'max_runtime_secs': max_runtime_secs,
        }
    }

    # Make bare minimum build_models
    self.build_models = {
        'exclude_algos': None
    }

    # nfolds must be a non-negative integer and not equal to 1.
    # Values are compared with ==/!= here: identity tests (`is`) against int/str
    # literals are implementation-dependent and warn on Python 3.8+.
    assert_is_type(nfolds, int)
    assert nfolds >= 0, "nfolds set to " + str(nfolds) + "; nfolds cannot be negative. Use nfolds >=2 if you want cross-valiated metrics and Stacked Ensembles or use nfolds = 0 to disable."
    assert nfolds != 1, "nfolds set to " + str(nfolds) + "; nfolds = 1 is an invalid value. Use nfolds >=2 if you want cross-valiated metrics and Stacked Ensembles or use nfolds = 0 to disable."
    self.build_control["nfolds"] = nfolds
    self.nfolds = nfolds

    # Pass through to all algorithms. Only non-default values are sent to the
    # backend, but the instance attributes are always defined.
    if balance_classes is True:
        self.build_control["balance_classes"] = balance_classes
    self.balance_classes = balance_classes
    if class_sampling_factors is not None:
        self.build_control["class_sampling_factors"] = class_sampling_factors
    self.class_sampling_factors = class_sampling_factors
    if max_after_balance_size != 5.0:
        assert_is_type(max_after_balance_size, float)
        self.build_control["max_after_balance_size"] = max_after_balance_size
    self.max_after_balance_size = max_after_balance_size

    # If max_runtime_secs is not provided, then it is set to default (3600 secs)
    assert_is_type(max_runtime_secs, int)
    self.max_runtime_secs = max_runtime_secs

    assert_is_type(max_runtime_secs_per_model, None, int)
    self.max_runtime_secs_per_model = max_runtime_secs_per_model
    if self.max_runtime_secs_per_model is not None:
        self.build_control["stopping_criteria"]["max_runtime_secs_per_model"] = self.max_runtime_secs_per_model

    # Add other parameters to build_control if available
    if max_models is not None:
        assert_is_type(max_models, int)
        self.build_control["stopping_criteria"]["max_models"] = max_models
    self.max_models = max_models

    assert_is_type(stopping_metric, str)
    self.build_control["stopping_criteria"]["stopping_metric"] = stopping_metric
    self.stopping_metric = stopping_metric

    if stopping_tolerance is not None:
        assert_is_type(stopping_tolerance, float)
        self.build_control["stopping_criteria"]["stopping_tolerance"] = stopping_tolerance
    self.stopping_tolerance = stopping_tolerance
    # Misspelled attribute kept as an alias for code that still reads it.
    self.stopping_tolerence = stopping_tolerance

    assert_is_type(stopping_rounds, int)
    self.build_control["stopping_criteria"]["stopping_rounds"] = stopping_rounds
    self.stopping_rounds = stopping_rounds

    if seed is not None:
        assert_is_type(seed, int)
        self.build_control["stopping_criteria"]["seed"] = seed
    self.seed = seed

    # Set project name if provided. If None, then we set in .train() to
    # "automl_" + training_frame.frame_id
    if project_name is not None:
        assert_is_type(project_name, str)
        check_id(project_name, "H2OAutoML")
        self.build_control["project_name"] = project_name
    self.project_name = project_name

    if exclude_algos is not None:
        assert_is_type(exclude_algos, list)
        for elem in exclude_algos:
            assert_is_type(elem, str)
        self.build_models['exclude_algos'] = exclude_algos

    if include_algos is not None:
        assert exclude_algos is None, "Use either include_algos or exclude_algos, not both."
        assert_is_type(include_algos, list)
        for elem in include_algos:
            assert_is_type(elem, str)
        self.build_models['include_algos'] = include_algos

    if modeling_plan is not None:
        assert_is_type(modeling_plan, list)
        supported_aliases = ['all', 'defaults', 'grids']

        def assert_is_step_def(sd):
            # Validate a dict-style step definition.
            assert 'name' in sd, "each definition must have a 'name' key"
            assert 0 < len(sd) < 3, "each definition must have only 1 or 2 keys: name, name+alias or name+steps"
            assert len(sd) == 1 or 'alias' in sd or 'steps' in sd, "steps definitions support only the following keys: name, alias, steps"
            assert 'alias' not in sd or sd['alias'] in supported_aliases, "alias must be one of %s" % supported_aliases
            assert 'steps' not in sd or (is_type(sd['steps'], list) and all(assert_is_step(s) for s in sd['steps']))

        def assert_is_step(s):
            # Validate a single step; returns True so it can be used inside all().
            assert is_type(s, dict), "each step must be a dict with an 'id' key and an optional 'weight' key"
            assert 'id' in s, "each step must have an 'id' key"
            assert len(s) == 1 or ('weight' in s and is_type(s['weight'], int)), "weight must be an integer"
            return True

        # Normalise every accepted shorthand (str, tuple, dict) to the dict form.
        plan = []
        for step_def in modeling_plan:
            assert_is_type(step_def, dict, tuple, str)
            if is_type(step_def, dict):
                assert_is_step_def(step_def)
                plan.append(step_def)
            elif is_type(step_def, str):
                plan.append(dict(name=step_def))
            else:
                assert 0 < len(step_def) < 3
                assert_is_type(step_def[0], str)
                name = step_def[0]
                if len(step_def) == 1:
                    plan.append(dict(name=name))
                else:
                    assert_is_type(step_def[1], str, list)
                    ids = step_def[1]
                    if is_type(ids, str):
                        assert_is_type(ids, *supported_aliases)
                        plan.append(dict(name=name, alias=ids))
                    else:
                        plan.append(dict(name=name, steps=[dict(id=i) for i in ids]))
        self.build_models['modeling_plan'] = plan

    assert_is_type(keep_cross_validation_predictions, bool)
    self.build_control["keep_cross_validation_predictions"] = keep_cross_validation_predictions

    assert_is_type(keep_cross_validation_models, bool)
    self.build_control["keep_cross_validation_models"] = keep_cross_validation_models

    assert_is_type(keep_cross_validation_fold_assignment, bool)
    # The flag is forced to False when cross-validation is disabled (nfolds == 0).
    self.build_control["keep_cross_validation_fold_assignment"] = self.nfolds != 0 and keep_cross_validation_fold_assignment

    self._job = None
    self._leader_id = None
    self._leaderboard = None
    self._verbosity = verbosity
    self._event_log = None
    self._training_info = None
    self._state_json = None

    # "AUTO" is stored as None.
    self.sort_metric = None if sort_metric == "AUTO" else sort_metric

    if export_checkpoints_dir is not None:
        assert_is_type(export_checkpoints_dir, str)
        self.build_control["export_checkpoints_dir"] = export_checkpoints_dir
def partial_plot(self, data, cols, destination_key=None, nbins=20, plot=True, figsize=(7,10), server=False):
    """
    Create partial dependence plot which gives a graphical depiction of the marginal effect of a variable on the
    response. The effect of a variable is measured in change in the mean response.

    :param H2OFrame data: An H2OFrame object used for scoring and constructing the plot.
    :param cols: Feature(s) for which partial dependence will be calculated.
    :param destination_key: A key reference to the created partial dependence tables in H2O.
    :param nbins: Number of bins used.
    :param plot: A boolean specifying whether to plot partial dependence table.
    :param figsize: Dimension/size of the returning plots, adjust to fit your output cells.
    :param server: Forwarded unchanged to ``_get_matplotlib_pyplot``
        (presumably selects a non-interactive backend -- TODO confirm).

    :returns: List of calculated mean response tables for each feature requested. Returns None when
        plotting was requested but pyplot could not be obtained.
    :raises ValueError: if ``data`` is not an H2OFrame.
    :raises H2OValueError: if one of ``cols`` is not a column of ``data``.
    """
    if not isinstance(data, h2o.H2OFrame):
        raise ValueError("data must be an instance of H2OFrame")
    assert_is_type(cols, [str])
    assert_is_type(destination_key, None, str)
    assert_is_type(nbins, int)
    assert_is_type(plot, bool)
    assert_is_type(figsize, (int,int))

    # Check cols specified exist in frame data
    for xi in cols:
        if xi not in data.names:
            raise H2OValueError("Column %s does not exist in the training frame" % xi)

    kwargs = {}
    kwargs['cols'] = cols
    kwargs['model_id'] = self.model_id
    kwargs['frame_id'] = data.frame_id
    kwargs['nbins'] = nbins
    kwargs['destination_key'] = destination_key

    # Run the computation as a job, then fetch the resulting tables.
    job = H2OJob(h2o.api("POST /3/PartialDependence/", data=kwargs), job_type="PartialDependencePlot").poll()
    response = h2o.api("GET /3/PartialDependence/%s" % job.dest_key)

    # Extract partial dependence data from json response
    pps = response['partial_dependence_data']

    # Plot partial dependence plots using matplotlib
    if plot:
        plt = _get_matplotlib_pyplot(server)
        if not plt:
            # pyplot unavailable: nothing to draw (existing behaviour returns None here).
            return

        fig, axs = plt.subplots(len(cols), squeeze=False, figsize=figsize)
        for i, pp in enumerate(pps):
            ax = axs[i, 0]
            # Check whether column was categorical or numeric
            col = cols[i]
            cat = data[col].isfactor()[0]
            if cat:
                labels = pp[0]
                x = range(len(labels))
                y = pp[1]
                ax.plot(x, y, 'o')
                ax.set_xticks(x)
                ax.set_xticklabels(labels)
                ax.margins(0.2)
            else:
                ax.plot(pp[0], pp[1])
                ax.set_xlim(min(pp[0]), max(pp[0]))

            ax.set_title('Partial Dependence Plot For {}'.format(col))
            ax.set_xlabel(pp.col_header[0])
            ax.set_ylabel(pp.col_header[1])
            ax.xaxis.grid()
            ax.yaxis.grid()
        # Only multi-panel figures need re-spacing. This used to test the length
        # of the last column *name* instead of the number of columns.
        if len(cols) > 1:
            fig.tight_layout(pad = 0.4,w_pad=0.5, h_pad=1.0)

    return pps
def show_status(self, detailed=False):
    """
    Display a summary table describing the current state of the cluster.

    The cached cluster info is refreshed from the server first when it is older
    than ``REFRESH_INTERVAL``.

    :param detailed: when True, a second table with per-node details is displayed as well.
    """
    is_stale = time.time() > self._retrieved_at + self.REFRESH_INTERVAL
    if is_stale:
        self._fill_from_h2ocluster(h2o.api("GET /3/Cloud"))

    nodes = self.nodes
    total_cpus = sum(n["num_cpus"] for n in nodes)
    total_allowed_cpus = sum(n["cpus_allowed"] for n in nodes)
    total_free_mem = sum(n["free_mem"] for n in nodes)
    n_unhealthy = sum(1 for n in nodes if not n["healthy"])

    if self.locked:
        status = "locked"
    else:
        status = "accepting new members"
    if n_unhealthy == 0:
        status += ", healthy"
    else:
        status += ", %d nodes are not healthy" % n_unhealthy

    age_marker = "!!!" if self.build_too_old else ""
    summary = [
        ["H2O cluster uptime:", get_human_readable_time(self.cloud_uptime_millis)],
        ["H2O cluster version:", self.version],
        ["H2O cluster version age:", "{} {}".format(self.build_age, age_marker)],
        ["H2O cluster name:", self.cloud_name],
        ["H2O cluster total nodes:", self.cloud_size],
        ["H2O cluster free memory:", get_human_readable_bytes(total_free_mem)],
        ["H2O cluster total cores:", str(total_cpus)],
        ["H2O cluster allowed cores:", str(total_allowed_cpus)],
        ["H2O cluster status:", status],
        ["H2O connection url:", h2o.connection().base_url],
        ["H2O connection proxy:", h2o.connection().proxy],
        ["H2O internal security:", self.internal_security_enabled],
        ["Python version:", "%d.%d.%d %s" % tuple(sys.version_info[:4])],
    ]
    H2ODisplay(summary)

    if not detailed:
        return

    node_fields = ["h2o", "healthy", "last_ping", "num_cpus", "sys_load", "mem_value_size", "free_mem",
                   "pojo_mem", "swap_mem", "free_disk", "max_disk", "pid", "num_keys", "tcps_active",
                   "open_fds", "rpcs_active"]
    header = ["Nodes info:"]
    for position in range(1, len(self.nodes) + 1):
        header.append("Node %d" % position)
    # One row per field: the field name followed by that field's value on every node.
    table = [[field] + [node[field] for node in self.nodes] for field in node_fields]
    H2ODisplay(table=table, header=header)
def cancel(self):
    """Ask the server to cancel this job and mark it locally as ``"CANCELLED"``."""
    endpoint = "POST /3/Jobs/%s/cancel" % self.job_key
    h2o.api(endpoint)
    # The local status is updated without re-querying the server.
    self.status = "CANCELLED"
def __init__(self,
             nfolds=5,
             balance_classes=False,
             class_sampling_factors=None,
             max_after_balance_size=5.0,
             max_runtime_secs=None,
             max_runtime_secs_per_model=None,
             max_models=None,
             stopping_metric="AUTO",
             stopping_tolerance=None,
             stopping_rounds=3,
             seed=None,
             project_name=None,
             exclude_algos=None,
             include_algos=None,
             exploitation_ratio=0,
             modeling_plan=None,
             monotone_constraints=None,
             algo_parameters=None,
             keep_cross_validation_predictions=False,
             keep_cross_validation_models=False,
             keep_cross_validation_fold_assignment=False,
             sort_metric="AUTO",
             export_checkpoints_dir=None,
             verbosity="warn"):
    """
    Create a new H2OAutoML instance.

    :param int nfolds: Number of folds for k-fold cross-validation. Defaults to ``5``. Use ``0`` to disable
        cross-validation; this will also disable Stacked Ensemble (thus decreasing the overall model performance).
    :param bool balance_classes: Balance training data class counts via over/under-sampling (for imbalanced data).
        Defaults to ``False``.
    :param class_sampling_factors: Desired over/under-sampling ratios per class (in lexicographic order).
        If not specified, sampling factors will be automatically computed to obtain class balance during training.
        Requires ``balance_classes``.
    :param float max_after_balance_size: Maximum relative size of the training data after balancing class counts
        (can be less than 1.0). Requires ``balance_classes``. Defaults to ``5.0``.
    :param int max_runtime_secs: This argument specifies the maximum time that the AutoML process will run for,
        prior to training the final Stacked Ensemble models. If neither ``max_runtime_secs`` nor ``max_models``
        are specified by the user, then ``max_runtime_secs`` defaults to 3600 seconds (1 hour).
    :param int max_runtime_secs_per_model: This argument controls the max time the AutoML run will dedicate to
        each individual model. Defaults to `0` (disabled).
    :param int max_models: Specify the maximum number of models to build in an AutoML run. Not limited by default.
        (Does not include the Stacked Ensemble models.)
    :param str stopping_metric: Specifies the metric to use for early stopping. Defaults to ``"AUTO"``.
        The available options are: ``"AUTO"`` (This defaults to ``"logloss"`` for classification, ``"deviance"``
        for regression), ``"deviance"``, ``"logloss"``, ``"mse"``, ``"rmse"``, ``"mae"``, ``"rmsle"``, ``"auc"``,
        ``aucpr``, ``"lift_top_group"``, ``"misclassification"``, ``"mean_per_class_error"``, ``"r2"``.
    :param float stopping_tolerance: This option specifies the relative tolerance for the metric-based stopping
        to stop the AutoML run if the improvement is less than this value. This value defaults to ``0.001``
        if the dataset is at least 1 million rows; otherwise it defaults to a value determined by the size of the
        dataset and the non-NA-rate. In that case, the value is computed as 1/sqrt(nrows * non-NA-rate).
    :param int stopping_rounds: This argument stops training new models in the AutoML run when the option selected
        for stopping_metric doesn't improve for the specified number of models, based on a simple moving average.
        To disable this feature, set it to ``0``. Defaults to ``3`` and must be a non-negative integer.
    :param int seed: Set a seed for reproducibility. AutoML can only guarantee reproducibility if ``max_models`` or
        early stopping is used because ``max_runtime_secs`` is resource limited, meaning that if the resources are
        not the same between runs, AutoML may be able to train more models on one run vs another.
        Defaults to ``None``.
    :param str project_name: Character string to identify an AutoML project. Defaults to ``None``, which means
        a project name will be auto-generated based on the training frame ID. More models can be trained on an
        existing AutoML project by specifying the same project name in multiple calls to the AutoML function
        (as long as the same training frame is used in subsequent runs).
    :param exclude_algos: List of character strings naming the algorithms to skip during the model-building phase.
        An example use is ``exclude_algos = ["GLM", "DeepLearning", "DRF"]``, and the full list of options is:
        ``"DRF"`` (Random Forest and Extremely-Randomized Trees), ``"GLM"``, ``"XGBoost"``, ``"GBM"``,
        ``"DeepLearning"`` and ``"StackedEnsemble"``. Defaults to ``None``, which means that all appropriate
        H2O algorithms will be used, if the search stopping criteria allow. Optional.
    :param include_algos: List of character strings naming the algorithms to restrict to during the model-building
        phase. This can't be used in combination with `exclude_algos` param. Defaults to ``None``, which means
        that all appropriate H2O algorithms will be used, if the search stopping criteria allow. Optional.
    :param exploitation_ratio: The budget ratio (between 0 and 1) dedicated to the exploitation (vs exploration)
        phase. By default, the exploitation phase is disabled (exploitation_ratio=0) as this is still
        experimental; to activate it, it is recommended to try a ratio around 0.1. Note that the current
        exploitation phase only tries to fine-tune the best XGBoost and the best GBM found during exploration.
    :param modeling_plan: List of modeling steps to be used by the AutoML engine (they may not all get executed,
        depending on other constraints). Defaults to None (Expert usage only).
    :param monotone_constraints: Dict representing monotonic constraints.
        Use +1 to enforce an increasing constraint and -1 to specify a decreasing constraint.
    :param algo_parameters: Dict of ``param_name=param_value`` to be passed to internal models. Defaults to none
        (Expert usage only). By default, params are set only to algorithms accepting them, and ignored by others.
        Only following parameters are currently allowed: ``"monotone_constraints"``.
        The dict passed in is not modified.
    :param keep_cross_validation_predictions: Whether to keep the predictions of the cross-validation predictions.
        This needs to be set to ``True`` if running the same AutoML object for repeated runs because CV predictions
        are required to build additional Stacked Ensemble models in AutoML. This option defaults to ``False``.
    :param keep_cross_validation_models: Whether to keep the cross-validated models. Keeping cross-validation
        models may consume significantly more memory in the H2O cluster. Defaults to ``False``.
    :param keep_cross_validation_fold_assignment: Whether to keep fold assignments in the models. Deleting them
        will save memory in the H2O cluster. This option defaults to ``False``.
    :param sort_metric: Metric to sort the leaderboard by. Defaults to ``"AUTO"`` (This defaults to ``auc`` for
        binomial classification, ``mean_per_class_error`` for multinomial classification, ``deviance`` for
        regression). For binomial classification choose between ``auc``, ``aucpr``, ``"logloss"``,
        ``"mean_per_class_error"``, ``"rmse"``, ``"mse"``. For regression choose between ``"deviance"``,
        ``"rmse"``, ``"mse"``, ``"mae"``, ``"rmsle"``. For multinomial classification choose between
        ``"mean_per_class_error"``, ``"logloss"``, ``"rmse"``, ``"mse"``.
    :param export_checkpoints_dir: Path to a directory where every model will be stored in binary form.
    :param verbosity: Verbosity of the backend messages printed during training.
        Available options are None (live log disabled), 'debug', 'info' or 'warn'. Defaults to 'warn'.
    """
    # Check if H2O jar contains AutoML. A failure is only reported, not raised.
    try:
        h2o.api("GET /3/Metadata/schemas/AutoMLV99")
    except h2o.exceptions.H2OResponseError as e:
        print(e)
        print("*******************************************************************\n"
              "*Please verify that your H2O jar has the proper AutoML extensions.*\n"
              "*******************************************************************\n"
              "\nVerbose Error Message:")

    self._job = None
    self._leader_id = None
    self._leaderboard = None
    self._verbosity = verbosity
    self._event_log = None
    self._training_info = None
    self._state_json = None
    self._build_resp = None  # contains all the actual parameters used on backend

    # Make bare minimum params containers
    self.build_control = dict(stopping_criteria=dict())
    self.build_models = dict()
    self.input_spec = dict()

    # build_control params #

    assert_is_type(project_name, None, str)
    check_id(project_name, "H2OAutoML")
    self._project_name = self.build_control["project_name"] = project_name

    assert_is_type(nfolds, int)
    assert nfolds >= 0, "nfolds set to " + str(nfolds) + "; nfolds cannot be negative. Use nfolds >=2 if you want cross-valiated metrics and Stacked Ensembles or use nfolds = 0 to disable."
    # Compare by value: `is not 1` relied on int caching and warns on Python 3.8+.
    assert nfolds != 1, "nfolds set to " + str(nfolds) + "; nfolds = 1 is an invalid value. Use nfolds >=2 if you want cross-valiated metrics and Stacked Ensembles or use nfolds = 0 to disable."
    self.nfolds = self.build_control["nfolds"] = nfolds

    assert_is_type(balance_classes, bool)
    self.balance_classes = self.build_control["balance_classes"] = balance_classes

    assert_is_type(class_sampling_factors, None, [numeric])
    self.class_sampling_factors = self.build_control["class_sampling_factors"] = class_sampling_factors

    assert_is_type(max_after_balance_size, None, numeric)
    self.max_after_balance_size = self.build_control["max_after_balance_size"] = max_after_balance_size

    assert_is_type(keep_cross_validation_models, bool)
    self.keep_cross_validation_models = self.build_control["keep_cross_validation_models"] = keep_cross_validation_models

    assert_is_type(keep_cross_validation_fold_assignment, bool)
    self.keep_cross_validation_fold_assignment = self.build_control["keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment

    assert_is_type(keep_cross_validation_predictions, bool)
    self.keep_cross_validation_predictions = self.build_control["keep_cross_validation_predictions"] = keep_cross_validation_predictions

    assert_is_type(export_checkpoints_dir, None, str)
    self.export_checkpoints_dir = self.build_control["export_checkpoints_dir"] = export_checkpoints_dir

    # stopping criteria params #

    stopping_criteria = self.build_control["stopping_criteria"]

    assert_is_type(max_runtime_secs, None, int)
    self.max_runtime_secs = stopping_criteria["max_runtime_secs"] = max_runtime_secs

    assert_is_type(max_runtime_secs_per_model, None, int)
    self.max_runtime_secs_per_model = stopping_criteria["max_runtime_secs_per_model"] = max_runtime_secs_per_model

    assert_is_type(max_models, None, int)
    self.max_models = stopping_criteria["max_models"] = max_models

    assert_is_type(stopping_metric, None, str)
    self.stopping_metric = stopping_criteria["stopping_metric"] = stopping_metric

    assert_is_type(stopping_tolerance, None, numeric)
    self.stopping_tolerance = stopping_criteria["stopping_tolerance"] = stopping_tolerance

    assert_is_type(stopping_rounds, None, int)
    self.stopping_rounds = stopping_criteria["stopping_rounds"] = stopping_rounds

    assert_is_type(seed, None, int)
    self.seed = stopping_criteria["seed"] = seed

    # build models params #

    assert_is_type(exclude_algos, None, [str])
    self.exclude_algos = self.build_models['exclude_algos'] = exclude_algos

    assert_is_type(include_algos, None, [str])
    if include_algos is not None:
        assert exclude_algos is None, "Use either include_algos or exclude_algos, not both."
    self.include_algos = self.build_models['include_algos'] = include_algos

    assert_is_type(exploitation_ratio, None, numeric)
    self.exploitation_ratio = self.build_models['exploitation_ratio'] = exploitation_ratio

    assert_is_type(modeling_plan, None, list)
    if modeling_plan is not None:
        supported_aliases = ['all', 'defaults', 'grids']

        def assert_is_step_def(sd):
            # Validate a dict-style step definition.
            assert 'name' in sd, "each definition must have a 'name' key"
            assert 0 < len(sd) < 3, "each definition must have only 1 or 2 keys: name, name+alias or name+steps"
            assert len(sd) == 1 or 'alias' in sd or 'steps' in sd, "steps definitions support only the following keys: name, alias, steps"
            assert 'alias' not in sd or sd['alias'] in supported_aliases, "alias must be one of %s" % supported_aliases
            assert 'steps' not in sd or (is_type(sd['steps'], list) and all(assert_is_step(s) for s in sd['steps']))

        def assert_is_step(s):
            # Validate a single step; returns True so it can be used inside all().
            assert is_type(s, dict), "each step must be a dict with an 'id' key and an optional 'weight' key"
            assert 'id' in s, "each step must have an 'id' key"
            assert len(s) == 1 or ('weight' in s and is_type(s['weight'], int)), "weight must be an integer"
            return True

        # Normalise every accepted shorthand (str, tuple, dict) to the dict form.
        plan = []
        for step_def in modeling_plan:
            assert_is_type(step_def, dict, tuple, str)
            if is_type(step_def, dict):
                assert_is_step_def(step_def)
                plan.append(step_def)
            elif is_type(step_def, str):
                plan.append(dict(name=step_def))
            else:
                assert 0 < len(step_def) < 3
                assert_is_type(step_def[0], str)
                name = step_def[0]
                if len(step_def) == 1:
                    plan.append(dict(name=name))
                else:
                    assert_is_type(step_def[1], str, list)
                    ids = step_def[1]
                    if is_type(ids, str):
                        assert_is_type(ids, *supported_aliases)
                        plan.append(dict(name=name, alias=ids))
                    else:
                        plan.append(dict(name=name, steps=[dict(id=i) for i in ids]))
        self.modeling_plan = self.build_models['modeling_plan'] = plan
    else:
        self.modeling_plan = None

    assert_is_type(algo_parameters, None, dict)
    if monotone_constraints is not None:
        # Work on a copy so the dict supplied by the caller is never mutated.
        algo_parameters = dict(algo_parameters or {})
        self.monotone_constraints = algo_parameters['monotone_constraints'] = monotone_constraints
    else:
        self.monotone_constraints = None

    if algo_parameters is not None:
        algo_parameters_json = []
        for param_key, param_value in algo_parameters.items():
            # Keys may be scoped as "<scope>__<name>"; unscoped keys use scope 'any'.
            scope, __, name = param_key.partition('__')
            if len(name) == 0:
                name, scope = scope, 'any'
            # we can't use stringify_dict here as this will be converted into a JSON string
            if isinstance(param_value, dict):
                value = [dict(key=k, value=v) for k, v in param_value.items()]
            else:
                value = param_value
            algo_parameters_json.append(dict(scope=scope, name=name, value=value))
        self.algo_parameters = self.build_models['algo_parameters'] = algo_parameters_json
    else:
        self.algo_parameters = None

    # input spec params #

    assert_is_type(sort_metric, None, str)
    self.sort_metric = self.input_spec['sort_metric'] = sort_metric
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
          model_id=None):
    """
    Train the H2O model.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
    :param ignored_columns: Column names or indices to exclude from the predictors. Cannot be given
        together with ``x``.
    :param str model_id: When given, overrides the ``model_id`` entry of the estimator's parameters.

    :raises H2OValueError: if a referenced column does not exist in the training frame, if ``y`` is given
        for an unsupervised model, or if both ``x`` and ``ignored_columns`` are specified.

    When ``self._future`` is truthy the build job is stored in ``self._job`` and the method returns without
    waiting; otherwise it polls the job to completion and resolves the trained model into this estimator.
    """
    assert_is_type(training_frame, H2OFrame)
    assert_is_type(validation_frame, None, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)

    algo = self.algo
    parms = self._parms.copy()
    if "__class__" in parms:  # FIXME: hack for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not (is_auto_encoder or algo in {"pca", "svd", "kmeans", "glrm", "word2vec"})
    ncols = training_frame.ncols
    names = training_frame.names

    def resolve_column(col, unknown_name_msg):
        # Turn a column index (negative indices allowed) or a column name into a validated column name.
        if is_type(col, int):
            if not (-ncols <= col < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % col)
            return names[col]
        if col not in names:
            raise H2OValueError(unknown_name_msg % col)
        return col

    if is_supervised:
        if y is None:
            y = "response"
        y = resolve_column(y, "Column %s does not exist in the training frame")
        self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
    elif y is not None:
        raise H2OValueError("y should not be provided for an unsupervised model")
    assert_is_type(y, str, None)

    ignored_columns_set = set()
    if ignored_columns is not None:
        if x is not None:
            raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
        for ic in ignored_columns:
            ignored_columns_set.add(resolve_column(ic, "Column %s not in the training frame"))

    if x is None:
        xset = set(names) - {y} - ignored_columns_set
    else:
        if is_type(x, int, str):
            x = [x]
        xset = set()
        for xi in x:
            xset.add(resolve_column(xi, "Column %s not in the training frame"))
    # From here on x is a (possibly empty) list of column names and y is a column name or None, so no
    # further index-to-name translation is needed. The previous `x[0]` probe raised IndexError on an
    # empty predictor list.
    x = list(xset)

    parms["offset_column"] = offset_column
    parms["fold_column"] = fold_column
    parms["weights_column"] = weights_column
    parms["max_runtime_secs"] = max_runtime_secs
    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    parms["training_frame"] = training_frame
    if validation_frame is not None:
        parms["validation_frame"] = validation_frame
    if y is not None:
        parms["response_column"] = y

    # Everything that is neither a predictor nor one of the special columns is reported as ignored.
    # NOTE(review): offset/fold/weights columns passed as indices are not translated to names, so an index
    # never matches a name in this set difference -- confirm whether indices are really supported here.
    special_columns = [y, offset_column, fold_column, weights_column]
    ignored = list(set(names) - set(x + special_columns))
    parms["ignored_columns"] = None if ignored == [] else [quoted(col) for col in ignored]
    parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else
                             [quoted(col) for col in parms["interactions"]])
    parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
    rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

    model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms),
                   job_type=(self.algo + " Model Build"))

    if self._future:
        # Asynchronous mode: remember the job and let the caller wait for it later.
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll()
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def train_segments(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
                   weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
                   segments=None, segment_models_id=None, parallelism=1, verbose=False):
    """
    Build one H2O model per segment (subpopulation) of the training dataset.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param float max_runtime_secs: Maximum allowed runtime in seconds for each model training. Use 0 to
        disable. Please note that regardless of how this parameter is set, a model will be built for each
        input segment. This parameter only affects individual model training.
    :param ignored_columns: Forwarded unchanged to ``self._make_parms``.
    :param segments: A list of columns to segment-by. H2O will group the training (and validation) dataset
        by the segment-by columns and train a separate model for each segment (group of rows).
        As an alternative to providing a list of columns, users can also supply an explicit enumeration of
        segments to build the models for. This enumeration needs to be represented as H2OFrame.
    :param segment_models_id: Identifier for the returned collection of Segment Models. If not specified
        it will be automatically generated.
    :param parallelism: Level of parallelism of the bulk segment models building, it is the maximum number
        of models each H2O node will be building in parallel.
    :param bool verbose: Enable to print additional information during model building. Defaults to False.

    :returns: An ``H2OSegmentModels`` built from the destination key of the finished job.
    :raises H2OValueError: if ``segments`` is not specified.

    :examples:

    >>> response = "survived"
    >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
    >>> titanic[response] = titanic[response].asfactor()
    >>> predictors = ["survived","name","sex","age","sibsp","parch","ticket","fare","cabin"]
    >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
    >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
    >>> titanic_gbm = H2OGradientBoostingEstimator(seed=1234)
    >>> titanic_models = titanic_gbm.train_segments(segments=["pclass"],
    ...                                             x=predictors,
    ...                                             y=response,
    ...                                             training_frame=train,
    ...                                             validation_frame=valid)
    >>> titanic_models.as_frame()
    """
    # Argument validation; the order is kept so the same error surfaces first.
    assert_is_type(segments, None, H2OFrame, [str])
    assert_is_type(verbose, bool)
    assert_is_type(segment_models_id, None, str)
    assert_is_type(parallelism, int)

    if segments is None:
        raise H2OValueError("Parameter segments was not specified. Please provide either a list of columns to "
                            "segment-by or an explicit list of segments to build models for.")

    # Start from the regular training parameters; model_id is deliberately left unset.
    training_args = dict(
        x=x,
        y=y,
        training_frame=training_frame,
        offset_column=offset_column,
        fold_column=fold_column,
        weights_column=weights_column,
        validation_frame=validation_frame,
        max_runtime_secs=max_runtime_secs,
        ignored_columns=ignored_columns,
        model_id=None,
        verbose=verbose,
    )
    request = self._make_parms(**training_args)

    # Segments are either an explicit enumeration (a frame) or a list of segment-by columns.
    if not isinstance(segments, H2OFrame):
        request["segment_columns"] = segments
    else:
        request["segments"] = H2OEstimator._keyify(segments)
    if segment_models_id:
        request["segment_models_id"] = segment_models_id
    request["parallelism"] = parallelism

    api_version = self._get_rest_version(request)
    endpoint = "POST /%d/SegmentModelsBuilders/%s" % (api_version, self.algo)
    build_job = H2OJob(h2o.api(endpoint, data=request), job_type=(self.algo + " Segment Models Build"))
    build_job.poll()
    return H2OSegmentModels(build_job.dest_key)
def __init__(self, nfolds=5, max_runtime_secs=3600, max_models=None, stopping_metric="AUTO",
             stopping_tolerance=None, stopping_rounds=3, seed=None, project_name=None):
    """
    Create a new AutoML configuration.

    :param int nfolds: Number of cross-validation folds. Must be 0 (cross-validation disabled) or >= 2;
        negative values and 1 are rejected.
    :param int max_runtime_secs: Maximum runtime in seconds, stored in the stopping criteria.
    :param int max_models: Optional maximum number of models; only sent when not None.
    :param str stopping_metric: Metric used for early stopping.
    :param float stopping_tolerance: Optional early-stopping tolerance; only sent when not None.
    :param int stopping_rounds: Number of early-stopping rounds.
    :param int seed: Optional seed; only sent when not None.
    :param str project_name: Optional project name. When None it is set later, in ``train()``.

    :raises AssertionError: if ``nfolds`` is negative or equal to 1.
    """
    # Check if H2O jar contains AutoML. A failure is only reported, not raised (best effort).
    try:
        h2o.api("GET /3/Metadata/schemas/AutoMLV99")
    except h2o.exceptions.H2OResponseError as e:
        # Print the header first so the error text actually follows "Verbose Error Message:".
        print("*******************************************************************\n"
              "*Please verify that your H2O jar has the proper AutoML extensions.*\n"
              "*******************************************************************\n"
              "\nVerbose Error Message:")
        print(e)

    # Make bare minimum build_control (if max_runtime_secs is an invalid value, it will be caught below)
    self.build_control = {
        'stopping_criteria': {
            'max_runtime_secs': max_runtime_secs,
        }
    }
    stopping_criteria = self.build_control["stopping_criteria"]

    # nfolds must be a non-negative integer and not equal to 1.
    # Values are compared with == / != rather than `is`: identity of int and str literals is an
    # implementation detail. The type checks run unconditionally since the defaults pass them anyway.
    assert_is_type(nfolds, int)
    assert nfolds >= 0, "nfolds set to " + str(nfolds) + "; nfolds cannot be negative. Use nfolds >=2 if you want cross-valiated metrics and Stacked Ensembles or use nfolds = 0 to disable."
    assert nfolds != 1, "nfolds set to " + str(nfolds) + "; nfolds = 1 is an invalid value. Use nfolds >=2 if you want cross-valiated metrics and Stacked Ensembles or use nfolds = 0 to disable."
    self.build_control["nfolds"] = nfolds
    self.nfolds = nfolds

    # If max_runtime_secs is not provided, then it is set to default (3600 secs)
    assert_is_type(max_runtime_secs, int)
    self.max_runtime_secs = max_runtime_secs

    # Add other parameters to build_control if available
    if max_models is not None:
        assert_is_type(max_models, int)
        stopping_criteria["max_models"] = max_models
    self.max_models = max_models

    assert_is_type(stopping_metric, str)
    stopping_criteria["stopping_metric"] = stopping_metric
    self.stopping_metric = stopping_metric

    if stopping_tolerance is not None:
        assert_is_type(stopping_tolerance, float)
        stopping_criteria["stopping_tolerance"] = stopping_tolerance
    self.stopping_tolerance = stopping_tolerance
    # Misspelled historical attribute name, kept so existing callers do not break.
    self.stopping_tolerence = stopping_tolerance

    assert_is_type(stopping_rounds, int)
    stopping_criteria["stopping_rounds"] = stopping_rounds
    self.stopping_rounds = stopping_rounds

    if seed is not None:
        assert_is_type(seed, int)
        stopping_criteria["seed"] = seed
    self.seed = seed

    # Set project name if provided. If None, then we set in .train() to "automl_" + training_frame.frame_id
    if project_name is not None:
        assert_is_type(project_name, str)
        self.build_control["project_name"] = project_name
    self.project_name = project_name

    self._job = None
    self._automl_key = None
    self._leader_id = None
    self._leaderboard = None
def train(self, x=None, y=None, training_frame=None, fold_column=None,
          weights_column=None, validation_frame=None, leaderboard_frame=None, blending_frame=None):
    """
    Begins an AutoML task, a background task that automatically builds a number of models
    with various algorithms and tracks their performance in a leaderboard. At any point
    in the process you may use H2O's performance or prediction functions on the resulting
    models.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column. May be omitted when the
        response column has already been set on this object.
    :param fold_column: The name or index of the column in training_frame that holds per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds per-row weights.
    :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold_column or weights_column).
    :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets
        nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used
        for early stopping of individual models and early stopping of the grid searches. By default and
        when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame
        will be ignored.
    :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard. This is optional and
        if this is set to None (the default), then cross-validation metrics will be used to generate the
        leaderboard rankings instead.
    :param blending_frame: H2OFrame used to train the metalearning algorithm in Stacked Ensembles (instead
        of relying on cross-validated predicted values). This is optional, but when provided, it is also
        recommended to disable cross validation by setting `nfolds=0` and to provide a leaderboard frame
        for scoring purposes.

    :returns: ``self.leader``, read after the job has been polled and results fetched.
    :raises H2OValueError: if no response column is available or a referenced column does not exist.
    :raises H2OResponseError: if the backend response does not contain a job.

    :examples:

    >>> # Set up an H2OAutoML object
    >>> aml = H2OAutoML(max_runtime_secs=30)
    >>> # Launch an AutoML run
    >>> aml.train(y=y, training_frame=train)
    """
    # Minimal required arguments are training_frame and y (response)
    self.training_frame = training_frame

    ncols = self.training_frame.ncols
    names = self.training_frame.names

    if y is None and self.response_column is None:
        raise H2OValueError(
            'The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.'
        )
    elif y is not None:
        assert_is_type(y, int, str)
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        self.response_column = y
    # Effective response: the one just passed in, or the one set earlier on this object.
    response = y if y is not None else self.response_column

    self.fold_column = fold_column
    self.weights_column = weights_column

    self.validation_frame = validation_frame
    self.leaderboard_frame = leaderboard_frame
    self.blending_frame = blending_frame

    if x is not None:
        assert_is_type(x, list)
        xset = set()
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)
        ignored_columns = set(names) - xset
        # The response, fold and weights columns must never be ignored, even if absent from x.
        # Using the effective response (not just `y`) also covers the case where y was omitted
        # because the response column had been set before.
        for col in (response, fold_column, weights_column):
            if col is not None:
                ignored_columns.discard(col)
        self.input_spec['ignored_columns'] = list(ignored_columns)

    def clean_params(params):
        # Recursively drop None-valued entries from dicts; leaf values go through _keyify.
        if isinstance(params, dict):
            return {k: clean_params(v) for k, v in params.items() if v is not None}
        return H2OEstimator._keyify(params)

    automl_build_params = clean_params(dict(
        build_control=self.build_control,
        build_models=self.build_models,
        input_spec=self.input_spec,
    ))

    resp = self._build_resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
    if 'job' not in resp:
        raise H2OResponseError("Backend failed to build the AutoML job: {}".format(resp))

    if not self.project_name:
        # Adopt the project name reported back in the response.
        self.project_name = resp['build_control']['project_name']
    self.__frozen = True

    self._job = H2OJob(resp['job'], "AutoML")
    poll_updates = ft.partial(self._poll_training_updates, verbosity=self._verbosity, state={})
    try:
        self._job.poll(poll_updates=poll_updates)
    finally:
        # Always emit one last update, even when polling raised.
        poll_updates(self._job, 1)

    self._fetch()
    return self.leader
def _get_params(self):
    """
    Query the backend for this AutoML project.

    Issues ``GET /99/AutoML/<project_name>`` using ``self.project_name`` and hands back the response
    exactly as returned by ``h2o.api``.
    """
    endpoint = "GET /99/AutoML/" + self.project_name
    return h2o.api(endpoint)