Example 1
def set_s3_credentials(secret_key_id, secret_access_key):
    """Creates a new Amazon S3 client internally with specified credentials.
    There are no validations done to the credentials. Incorrect credentials are thus revealed with first S3 import call.
    
    secretKeyId Amazon S3 Secret Key ID (provided by Amazon)
    secretAccessKey Amazon S3 Secret Access Key (provided by Amazon)
    """
    if secret_key_id is None:
        raise H2OValueError("Secret key ID must be specified")

    if secret_access_key is None:
        raise H2OValueError("Secret access key must be specified")

    if not secret_key_id:
        raise H2OValueError("Secret key ID must not be empty")

    if not secret_access_key:
        raise H2OValueError("Secret access key must not be empty")
    
    
    params = {"secret_key_id": secret_key_id,
              "secret_access_key": secret_access_key
              }
    
    h2o.api(endpoint="POST /3/PersistS3", data=params)
    print("Credentials successfully set.")
Example 2
File: job.py Project: h2oai/h2o-3
    def poll(self):
        """
        Wait until the job finishes.

        This method will continuously query the server about the status of the job until the job reaches
        completion. During this time a progress bar with the % completion status is displayed (in stdout).
        """
        try:
            hidden = not H2OJob.__PROGRESS_BAR__
            pb = ProgressBar(title=self._job_type + " progress", hidden=hidden)
            pb.execute(self._refresh_job_status)
        except StopIteration as e:
            if str(e) == "cancelled":
                h2o.api("POST /3/Jobs/%s/cancel" % self.job_key)
                self.status = "CANCELLED"
            # Potentially we may want to re-raise the exception here

        assert self.status in {"DONE", "CANCELLED", "FAILED"} or self._poll_count <= 0, \
            "Polling finished while the job has status %s" % self.status
        if self.warnings:
            for w in self.warnings:
                warnings.warn(w)

        # check if failed... and politely print relevant message
        if self.status == "CANCELLED":
            raise H2OJobCancelled("Job<%s> was cancelled by the user." % self.job_key)
        if self.status == "FAILED":
            if (isinstance(self.job, dict)) and ("stacktrace" in list(self.job)):
                raise EnvironmentError("Job with key {} failed with an exception: {}\nstacktrace: "
                                       "\n{}".format(self.job_key, self.exception, self.job["stacktrace"]))
            else:
                raise EnvironmentError("Job with key %s failed with an exception: %s" % (self.job_key, self.exception))

        return self
Example 3
    def poll(self):
        """
        Wait until the job finishes.

        This method will continuously query the server about the status of the job until the job reaches
        completion. During this time a progress bar with the % completion status is displayed (in stdout).
        """
        try:
            pb = ProgressBar(self._job_type + " progress")
            pb.execute(self._refresh_job_status)
        except StopIteration as e:
            if str(e) == "cancelled":
                self.status = "CANCELLED"
                h2o.api("POST /3/Jobs/%s/cancel" % self.job_key)
                print("Job {} was cancelled.".format(self.job_key))
            # Potentially we may want to re-raise the exception here

        if self.warnings:
            for w in self.warnings:
                warnings.warn(w)
        # TODO: this needs to be thought through more carefully
        # check if failed... and politely print relevant message
        if self.status == "CANCELLED":
            raise EnvironmentError("Job with key {} was cancelled by the user.".format(self.job_key))
        if self.status == "FAILED":
            if (isinstance(self.job, dict)) and ("stacktrace" in list(self.job)):
                raise EnvironmentError("Job with key {} failed with an exception: {}\nstacktrace: "
                                       "\n{}".format(self.job_key, self.exception, self.job["stacktrace"]))
            else:
                raise EnvironmentError("Job with key %s failed with an exception: %s" % (self.job_key, self.exception))

        return self
Example 4
 def signal_handler(self, signum, stackframe):
     """(internal)."""
     if self._polling:
         h2o.api("POST /3/Jobs/%s/cancel" % self.job_key)
         print("Job {} was cancelled.".format(self.job_key))
     else:
         signal.default_int_handler()
Example 5
    def get_grid(self, sort_by=None, decreasing=None):
        """
        Retrieve an H2OGridSearch instance. Optionally specify a metric by which to sort models and a sort order.

        Parameters
        ----------
        sort_by : str, optional
          A metric by which to sort the models in the grid space. Choices are "logloss", "residual_deviance", "mse",
          "auc", "r2", "accuracy", "precision", "recall", "f1", etc.
        decreasing : bool, optional
          Sort the models in decreasing order of metric if true, otherwise sort in increasing order (default).
        Returns
        -------
          A new H2OGridSearch instance optionally sorted on the specified metric.

        """
        if sort_by is None and decreasing is None: return self

        grid_json = h2o.api("GET /99/Grids/%s" % self._id, data={"sort_by": sort_by, "decreasing": decreasing})
        grid = H2OGridSearch(self.model, self.hyper_params, self._id)
        grid.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]  # reordered
        first_model_json = h2o.api("GET /99/Models/%s" % grid_json['model_ids'][0]['name'])['models'][0]
        model_class = H2OGridSearch._metrics_class(first_model_json)
        m = model_class()
        m._id = self._id
        m._grid_json = grid_json
        # m._metrics_class = metrics_class
        m._parms = grid._parms
        H2OEstimator.mixin(grid, model_class)
        grid.__dict__.update(m.__dict__.copy())
        return grid
Example 6
    def get_grid(self, sort_by=None, decreasing=None):
        """
        Retrieve an H2OGridSearch instance.

        Optionally specify a metric by which to sort models and a sort order.
        Note that if neither cross-validation nor a validation frame is used in the grid search, then the
        training metrics will display in the "get grid" output. If a validation frame is passed to the grid, and
        ``nfolds = 0``, then the validation metrics will display. However, if ``nfolds`` > 1, then cross-validation
        metrics will display even if a validation frame is provided.

        :param str sort_by: A metric by which to sort the models in the grid space. Choices are: ``"logloss"``,
            ``"residual_deviance"``, ``"mse"``, ``"auc"``, ``"r2"``, ``"accuracy"``, ``"precision"``, ``"recall"``,
            ``"f1"``, etc.
        :param bool decreasing: Sort the models in decreasing order of metric if true, otherwise sort in increasing
            order (default).

        :returns: A new H2OGridSearch instance optionally sorted on the specified metric.
        """
        if sort_by is None and decreasing is None: return self

        grid_json = h2o.api("GET /99/Grids/%s" % self._id, data={"sort_by": sort_by, "decreasing": decreasing})
        grid = H2OGridSearch(self.model, self.hyper_params, self._id)
        grid.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]  # reordered
        first_model_json = h2o.api("GET /99/Models/%s" % grid_json['model_ids'][0]['name'])['models'][0]
        model_class = H2OGridSearch._metrics_class(first_model_json)
        m = model_class()
        m._id = self._id
        m._grid_json = grid_json
        # m._metrics_class = metrics_class
        m._parms = grid._parms
        H2OEstimator.mixin(grid, model_class)
        grid.__dict__.update(m.__dict__.copy())
        return grid
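A brief usage sketch (assuming a grid object named grid has already been trained via H2OGridSearch.train(); whether "auc" is an available metric depends on the problem type):

sorted_grid = grid.get_grid(sort_by="auc", decreasing=True)  # models reordered by AUC, best first
print(sorted_grid)                                           # summary of the re-sorted grid
best_model = sorted_grid.models[0]                           # first model after sorting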
Example 7
  def _model_build(self, x, y, tframe, vframe, kwargs):
    kwargs['training_frame'] = tframe
    if vframe is not None: kwargs["validation_frame"] = vframe
    if is_int(y): y = tframe.names[y]
    if y is not None: kwargs['response_column'] = y
    if not isinstance(x, (list,tuple)): x=[x]
    if is_int(x[0]):
      x = [tframe.names[i] for i in x]
    offset = kwargs["offset_column"]
    folds  = kwargs["fold_column"]
    weights= kwargs["weights_column"]
    ignored_columns = list(set(tframe.names) - set(x + [y,offset,folds,weights]))
    kwargs["ignored_columns"] = None if ignored_columns==[] else [quoted(col) for col in ignored_columns]
    kwargs["interactions"] = None if ("interactions" not in kwargs or kwargs["interactions"] is None) else [quoted(col) for col in kwargs["interactions"]]
    kwargs = dict([(k, H2OEstimator._keyify_if_H2OFrame(kwargs[k])) for k in kwargs])  # gruesome one-liner
    rest_ver = kwargs.pop("_rest_version") if "_rest_version" in kwargs else 3
    algo = self._compute_algo()

    model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, algo), data=kwargs),
                   job_type=(algo + " Model Build"))

    if self._future:
      self._job = model
      return

    model.poll()
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
Example 8
    def _model_build(self, x, y, tframe, vframe, kwargs):
        kwargs['training_frame'] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        if is_type(y, int): y = tframe.names[y]
        if y is not None: kwargs['response_column'] = y
        if not is_type(x, list, tuple): x = [x]
        if is_type(x[0], int):
            x = [tframe.names[i] for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
        kwargs = dict([(k, kwargs[k].frame_id if isinstance(kwargs[k], H2OFrame) else kwargs[k]) for k in kwargs if
                       kwargs[k] is not None])  # gruesome one-liner
        algo = self.model._compute_algo()  # unique to grid search
        if self.grid_id is not None: kwargs["grid_id"] = self.grid_id
        rest_ver = kwargs.pop("_rest_version") if "_rest_version" in kwargs else None

        grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))

        if self._future:
            self._job = grid
            return

        grid.poll()

        grid_json = h2o.api("GET /99/Grids/%s" % (grid.dest_key))
        failure_messages_stacks = ""
        error_index = 0
        if len(grid_json["failure_details"]) > 0:
            print("Errors/Warnings building gridsearch model\n")
            # will raise an error if no grid model is returned; store error messages here

            for error_message in grid_json["failure_details"]:
                if isinstance(grid_json["failed_params"][error_index], dict):
                    for h_name in grid_json['hyper_names']:
                        print("Hyper-parameter: {0}, {1}".format(h_name,
                                                                 grid_json['failed_params'][error_index][h_name]))

                if len(grid_json["failure_stack_traces"]) > error_index:
                    print("failure_details: {0}\nfailure_stack_traces: "
                          "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))
                    failure_messages_stacks += error_message+'\n'
                error_index += 1

        self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

        # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
        # sometimes no model is returned due to bad parameter values provided by the user.
        if len(grid_json['model_ids']) > 0:
            first_model_json = h2o.api("GET /%d/Models/%s" %
                                       (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
            self._resolve_grid(grid.dest_key, grid_json, first_model_json)
        else:
            if len(failure_messages_stacks)>0:
                raise ValueError(failure_messages_stacks)
            else:
                raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
Example 9
    def is_running(self):
        """
        Determine if the H2O cluster is running or not.

        :returns: True if the cluster is up; False otherwise
        """
        try:
            if h2o.connection().local_server and not h2o.connection().local_server.is_running(): return False
            h2o.api("GET /")
            return True
        except (H2OConnectionError, H2OServerError):
            return False
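A short usage sketch (assuming a connection has already been established with h2o.init() or h2o.connect()):

cluster = h2o.cluster()  # handle to the cluster behind the current connection
if cluster.is_running():
    print("Cluster is up at", h2o.connection().base_url)
else:
    print("Cluster is not reachable")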
Example 10
def test_na_omits():
    hf = h2o.H2OFrame({'A': [1, 'NA', 2], 'B': [1, 2, 3], 'C': [4, 5, 6]})
    hf.summary()
    hf_col_summary = h2o.api("GET /3/Frames/%s/summary" % urllib.parse.quote(hf.frame_id))["frames"][0]["columns"]
    hf_col_summary = sum([e["missing_count"] for e in hf_col_summary])
    assert hf_col_summary == 1  # we have one NAN here

    # attempt to remove it by calling na_omit
    hf_naomit = hf.na_omit()
    hf_naomit.summary()
    hf_naomit_col_summary = \
        h2o.api("GET /3/Frames/%s/summary" % urllib.parse.quote(hf_naomit.frame_id))["frames"][0]["columns"]
    hf_naomit_col_summary = sum([e["missing_count"] for e in hf_naomit_col_summary])
    assert hf_naomit_col_summary == 0  # we have removed the NAN row
Example 11
 def _refresh_job_status(self):
     jobs = h2o.api("GET /3/Jobs/%s" % self.job_key)
     self.job = jobs["jobs"][0] if "jobs" in jobs else jobs["job"][0]
     self.status = self.job["status"]
     self.progress = min(self.job["progress"], 1)
     self.exception = self.job["exception"]
     self.warnings = self.job["warnings"] if "warnings" in self.job else None
Example 12
def test_api_timestamp():
    prostate_train = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))

    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    ntrees = 1
    learning_rate = 0.1
    depth = 5
    min_rows = 10

    # Build H2O GBM classification model:
    gbm_h2o = H2OGradientBoostingEstimator(ntrees=ntrees, learn_rate=learning_rate,
                                           max_depth=depth,
                                           min_rows=min_rows,
                                           distribution="bernoulli",
                                           model_id="test_timestamp")
    gbm_h2o.train(x=list(range(1,prostate_train.ncol)),y="CAPSULE", training_frame=prostate_train)

    model = h2o.get_model(model_id="test_timestamp")
    models = h2o.api("GET /3/Models")
    assert model._model_json['timestamp'] == models["models"][0]["timestamp"], "Timestamp should be the same."

    assert gbm_h2o.start_time is not None and gbm_h2o.start_time > 0
    assert gbm_h2o.end_time is not None and gbm_h2o.end_time > 0
    assert gbm_h2o.run_time is not None and gbm_h2o.run_time > 0

    assert gbm_h2o.end_time - gbm_h2o.start_time == gbm_h2o.run_time
Example 13
def _resolve_model(future_model, **kwargs):
    future_model.poll()
    rest_ver = kwargs["_rest_version"] if "_rest_version" in kwargs else 3
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, future_model.job.dest_key))["models"][0]

    model_type = model_json["output"]["model_category"]
    if model_type == "Binomial":
        model = H2OBinomialModel(future_model.job.dest_key, model_json)
    elif model_type == "Clustering":
        model = H2OClusteringModel(future_model.job.dest_key, model_json)
    elif model_type == "Regression":
        model = H2ORegressionModel(future_model.job.dest_key, model_json)
    elif model_type == "Multinomial":
        model = H2OMultinomialModel(future_model.job.dest_key, model_json)
    elif model_type == "Ordinal":
        model = H2OOrdinalModel(future_model.job.dest_key, model_json)
    elif model_type == "AutoEncoder":
        model = H2OAutoEncoderModel(future_model.job.dest_key, model_json)
    elif model_type == "DimReduction":
        model = H2ODimReductionModel(future_model.job.dest_key, model_json)
    elif model_type == "WordEmbedding":
        model = H2OWordEmbeddingModel(future_model.job.dest_key, model_json)
    else:
        raise NotImplementedError(model_type)
    return model
Example 14
def get_automl(project_name):
    """
    Retrieve information about an AutoML instance.

    :param str project_name:  A string indicating the project_name of the automl instance to retrieve.
    :returns: A dictionary containing the project_name, leader model, and leaderboard.
    """
    automl_json = h2o.api("GET /99/AutoML/%s" % project_name)
    project_name = automl_json["project_name"]
    leaderboard_list = [key["name"] for key in automl_json['leaderboard']['models']]

    if leaderboard_list is not None and len(leaderboard_list) > 0:
        leader_id = leaderboard_list[0]
    else:
        leader_id = None

    leader = h2o.get_model(leader_id)
    # Intentionally mask the progress bar here since showing multiple progress bars is confusing to users.
    # If any failure happens, revert back to user's original setting for progress and display the error message.
    is_progress = H2OJob.__PROGRESS_BAR__
    h2o.no_progress()
    try:
        # Parse leaderboard H2OTwoDimTable & return as an H2OFrame
        leaderboard = h2o.H2OFrame(
            automl_json["leaderboard_table"].cell_values,
            column_names=automl_json["leaderboard_table"].col_header)
    except Exception as ex:
        raise ex
    finally:
        if is_progress is True:
            h2o.show_progress()

    leaderboard = leaderboard[1:]
    automl_dict = {'project_name': project_name, "leader": leader, "leaderboard": leaderboard}
    return automl_dict
Example 15
 def confusion_matrix(self, data):
     """
     Returns a confusion matrix based on H2O's default prediction threshold for a dataset.
     """
     assert_is_type(data, H2OFrame)
     j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self._id, data.frame_id))
     return j["model_metrics"][0]["cm"]["table"]
Example 16
    def _fetch(self):
        res = h2o.api("GET /99/AutoML/" + self._automl_key)
        leaderboard_list = [key["name"] for key in res['leaderboard']['models']]

        if leaderboard_list is not None and len(leaderboard_list) > 0:
            self._leader_id = leaderboard_list[0]
        else:
            self._leader_id = None

        # Intentionally mask the progress bar here since showing multiple progress bars is confusing to users.
        # If any failure happens, revert back to user's original setting for progress and display the error message.
        is_progress = H2OJob.__PROGRESS_BAR__
        h2o.no_progress()
        try:
            # Parse leaderboard H2OTwoDimTable & return as an H2OFrame
            leaderboard = h2o.H2OFrame(
                res["leaderboard_table"].cell_values,
                column_names=res["leaderboard_table"].col_header)
        except Exception as ex:
            raise ex
        finally:
            if is_progress is True:
                h2o.show_progress()

        self._leaderboard = leaderboard[1:]
        return self._leader_id is not None
Example 17
    def download_mojo(self, path=".", get_genmodel_jar=False):
        """
        Download the model in MOJO format.

        :param path: the path where the MOJO file should be saved.
        :param get_genmodel_jar: if True, then also download h2o-genmodel.jar and store it in folder ``path``.
        :returns: name of the MOJO file written.
        """
        assert_is_type(path, str)
        assert_is_type(get_genmodel_jar, bool)
        if self.algo not in {"drf", "gbm", "deepwater", "glrm"}:
            raise H2OValueError("MOJOs are currently supported for Distributed Random Forest, "
                                "Gradient Boosting Machine, Deep Water and GLRM models only.")
        if get_genmodel_jar:
            h2o.api("GET /3/h2o-genmodel.jar", save_to=os.path.join(path, "h2o-genmodel.jar"))
        return h2o.api("GET /3/Models/%s/mojo" % self.model_id, save_to=path)
Example 18
def _model_build(x, y, tframe, vframe, algo, kwargs):
  kwargs['training_frame'] = tframe
  if vframe is not None: kwargs["validation_frame"] = vframe
  if y is not None:  kwargs['response_column'] = tframe[y].names[0]
  kwargs = dict([(k, (kwargs[k]._frame()).frame_id if isinstance(kwargs[k], H2OFrame) else kwargs[k]) for k in kwargs if kwargs[k] is not None])  # gruesome one-liner
  rest_ver = kwargs.pop("_rest_version") if "_rest_version" in kwargs else 3
  future_model = H2OModelFuture(H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, algo), data=kwargs), job_type=(algo+" Model Build")), x)
  return _resolve_model(future_model, _rest_version=rest_ver, **kwargs)
Example 19
 def fit(self, fr, **fit_params):
     res = []
     for step in self.steps:
         res.append(step[1].to_rest(step[0]))
     res = "[" + ",".join([quoted(r.replace('"', "'")) for r in res]) + "]"
     j = h2o.api("POST /99/Assembly", data={"steps": res, "frame": fr.frame_id})
     self.id = j["assembly"]["name"]
     return H2OFrame.get_frame(j["result"]["name"])
Example 20
 def join(self):
     """Wait until job's completion."""
     self._future = False
     self._job.poll()
     model_key = self._job.dest_key
     self._job = None
     model_json = h2o.api("GET /%d/Models/%s" % (self._rest_version, model_key))["models"][0]
     self._resolve_model(model_key, model_json)
Example 21
    def confusion_matrix(self, data):
        """
        Returns a confusion matrix based on H2O's default prediction threshold for a dataset.

        :param H2OFrame data: the frame with the prediction results for which the confusion matrix should be extracted.
        """
        assert_is_type(data, H2OFrame)
        j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self._id, data.frame_id))
        return j["model_metrics"][0]["cm"]["table"]
Example 22
    def rapids(expr):
        """
        Execute a Rapids expression.

        :param expr: The Rapids expression (ASCII string).

        :returns: The JSON response (as a Python dictionary) of the Rapids execution.
        """
        return h2o.api("POST /99/Rapids", data={"ast": expr, "session_id": h2o.connection().session_id})
Example 23
 def available():
     """Returns True if a deep water model can be built, or False otherwise."""
     builder_json = h2o.api("GET /3/ModelBuilders", data={"algo": "deepwater"})
     visibility = builder_json["model_builders"]["deepwater"]["visibility"]
     if visibility == "Experimental":
         print("Cannot build a Deep Water model - no backend found.")
         return False
     else:
         return True
Example 24
    def shutdown(self, prompt=False):
        """
        Shut down the server.

        This method checks whether the H2O cluster is still running, and if it is, shuts it down (via a REST API call).

        :param prompt: A logical value indicating whether to prompt the user before shutting down the H2O server.
        """
        if not self.is_running(): return
        assert_is_type(prompt, bool)
        if prompt:
            question = "Are you sure you want to shutdown the H2O instance running at %s (Y/N)? " \
                       % h2o.connection().base_url
            response = input(question)  # works in Py2 & Py3 because redefined in h2o.utils.compatibility module
        else:
            response = "Y"
        if response.lower() in {"y", "yes"}:
            h2o.api("POST /3/Shutdown")
            h2o.connection().close()
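A brief usage sketch (assuming an established connection; passing prompt=True asks for confirmation before the POST /3/Shutdown call is issued):

h2o.cluster().shutdown(prompt=True)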
Example 25
 def _refresh_job_status(self):
     if self._poll_count <= 0: raise StopIteration("")
     jobs = h2o.api("GET /3/Jobs/%s" % self.job_key)
     self.job = jobs["jobs"][0] if "jobs" in jobs else jobs["job"][0]
     self.status = self.job["status"]
     self.progress = min(self.job["progress"], 1)
     self.exception = self.job["exception"]
     self.warnings = self.job["warnings"] if "warnings" in self.job else None
     self._poll_count -= 1
     return self.progress
Example 26
 def fill(self, rows=10):
     assert self._id is not None
     if self._data is not None:
         if rows <= len(self):
             return
     res = h2o.api("GET /3/Frames/%s" % self._id, data={"row_count": rows})["frames"][0]
     self._l = rows
     self._nrows = res["rows"]
     self._ncols = res["total_column_count"]
     self._names = [c["label"] for c in res["columns"]]
     self._types = dict(zip(self._names, [c["type"] for c in res["columns"]]))
     self._fill_data(res)
Example 27
    def fit(self, fr):
        """
        Perform the munging operations specified in ``steps`` on the frame ``fr``.

        :param fr: the H2OFrame on which the munging operations are to be performed.
        :return: the H2OFrame after the munging operations are completed.
        """
        assert_is_type(fr, H2OFrame)
        steps = "[%s]" % ",".join(quoted(step[1].to_rest(step[0]).replace('"', "'")) for step in self.steps)
        j = h2o.api("POST /99/Assembly", data={"steps": steps, "frame": fr.frame_id})
        self.id = j["assembly"]["name"]
        return H2OFrame.get_frame(j["result"]["name"])
Example 28
    def deepfeatures(self, test_data, layer):
        """
        Return hidden layer details.

        :param test_data: Data on which to create the feature space.
        :param layer: Zero-based index of the hidden layer.
        """
        if test_data is None: raise ValueError("Must specify test data")
        j = H2OJob(h2o.api("POST /4/Predictions/models/%s/frames/%s" % (self._id, test_data.frame_id),
                           data={"deep_features_hidden_layer": layer}), "deepfeatures")
        j.poll()
        return h2o.get_frame(j.dest_key)
Example 29
    def predict_leaf_node_assignment(self, test_data):
        """
        Predict on a dataset and return the leaf node assignment (only for tree-based models).

        :param H2OFrame test_data: Data on which to make predictions.

        :returns: A new H2OFrame of predictions.
        """
        if not isinstance(test_data, h2o.H2OFrame): raise ValueError("test_data must be an instance of H2OFrame")
        j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id),
                    data={"leaf_node_assignment": True})
        return h2o.get_frame(j["predictions_frame"]["name"])
Example 30
    def predict(self, test_data):
        """
        Predict on a dataset.

        :param H2OFrame test_data: Data on which to make predictions.

        :returns: A new H2OFrame of predictions.
        """
        if not isinstance(test_data, h2o.H2OFrame): raise ValueError("test_data must be an instance of H2OFrame")
        j = H2OJob(h2o.api("POST /4/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id)),
                   self._model_json['algo'] + " prediction")
        j.poll()
        return h2o.get_frame(j.dest_key)
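A short usage sketch (assuming a trained estimator named model and a scoring H2OFrame named test; both names are placeholders):

preds = model.predict(test)  # polls the prediction job and returns a new H2OFrame
preds.head()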
Example 31
    def fit(self, fr):
        """
        Perform the munging operations specified in ``steps`` on the frame ``fr``.

        :param fr: the H2OFrame on which the munging operations are to be performed.
        :return: the H2OFrame after the munging operations are completed.

        :examples:

        >>> iris = h2o.load_dataset("iris")
        >>> assembly = H2OAssembly(steps=[("col_select",
        ...                        H2OColSelect(["Sepal.Length",
        ...                        "Petal.Length", "Species"])),
        ...                       ("cos_Sepal.Length",
        ...                        H2OColOp(op=H2OFrame.cos,
        ...                        col="Sepal.Length",
        ...                        inplace=True)),
        ...                       ("str_cnt_Species",
        ...                        H2OColOp(op=H2OFrame.countmatches,
        ...                        col="Species",
        ...                        inplace=False,
        ...                        pattern="s"))])
        >>> fit = assembly.fit(iris)
        >>> fit

        """
        assert_is_type(fr, H2OFrame)
        steps = "[%s]" % ",".join(
            quoted(step[1].to_rest(step[0]).replace('"', "'"))
            for step in self.steps)
        j = h2o.api("POST /99/Assembly",
                    data={
                        "steps": steps,
                        "frame": fr.frame_id
                    })
        self.id = j["assembly"]["name"]
        return H2OFrame.get_frame(j["result"]["name"])
Example 32
    def transform(self, frame, data_leakage_handling="None", noise=-1, seed=-1):
        """

        Apply the transformation to `te_columns` based on the encoding maps generated during the preceding training call.

        :param H2OFrame frame: the frame to which the target encoding transformations are applied.
        :param str data_leakage_handling: Supported options:

        1) "KFold" - encodings for a fold are generated based on out-of-fold data.
        2) "LeaveOneOut" - leave one out. The current row's response value is subtracted from the pre-calculated per-level frequencies.
        3) "None" - nothing is held out; the whole frame is used for training.

        :param float noise: the amount of random noise added to the target encoding.  This helps prevent overfitting. Defaults to 0.01 * range of y.
        :param int seed: a random seed used to generate draws from the uniform distribution for random noise. Defaults to -1.

        :example:
        >>> targetEncoder = TargetEncoder(encoded_columns=te_columns, target_column=responseColumnName, blended_avg=True, inflection_point=10, smoothing=20)
        >>> encodedTrain = targetEncoder.transform(frame=trainFrame, data_leakage_handling="None", seed=1234, is_train_or_valid=True)
        """
        output = h2o.api("GET /3/TargetEncoderTransform", data={'model': self.model_id, 'frame': frame.key,
                                                                'data_leakage_handling': data_leakage_handling,
                                                                'noise': noise,
                                                                'seed': seed})
        return h2o.get_frame(output["name"])
Example 33
    def transform(self, words, aggregate_method):
        """
        Transform words (or sequences of words) to vectors using a word2vec model.

        :param H2OFrame words: An H2OFrame made of a single column containing source words.
        :param str aggregate_method: Specifies how to aggregate sequences of words. If the method is `NONE`,
               then no aggregation is performed and each input word is mapped to a single word-vector.
               If the method is 'AVERAGE', then the input is treated as sequences of words delimited by NA.
               Each word of a sequence is internally mapped to a vector, and vectors belonging to
               the same sentence are averaged and returned in the result.

        :returns: an H2OFrame of vectors, one for each input word (or word sequence).

        :examples:

        >>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"), 
        ...                               col_names = ["category", "jobtitle"], 
        ...                               col_types = ["string", "string"], 
        ...                               header = 1)
        >>> STOP_WORDS = ["ax","i","you","edu","s","t","m","subject","can","lines","re","what",
        ...               "there","all","we","one","the","a","an","of","or","in","for","by","on",
        ...               "but","is","in","a","not","with","as","was","if","they","are","this","and","it","have",
        ...               "from","at","my","be","by","not","that","to","from","com","org","like","likes","so"]
        >>> words = job_titles.tokenize(" ")
        >>> words = words[(words.isna()) | (~ words.isin(STOP_WORDS)),:] 
        >>> w2v_model = H2OWord2vecEstimator(epochs = 10)
        >>> w2v_model.train(training_frame=words)
        >>> job_title_vecs = w2v_model.transform(words, aggregate_method = "AVERAGE")
        """
        j = h2o.api("GET /3/Word2VecTransform",
                    data={
                        'model': self.model_id,
                        'words_frame': words.frame_id,
                        'aggregate_method': aggregate_method
                    })
        return h2o.get_frame(j["vectors_frame"]["name"])
Example 34
def _resolve_model(future_model, **kwargs):
    future_model.poll()
    rest_ver = kwargs["_rest_version"] if "_rest_version" in kwargs else 3
    model_json = h2o.api("GET /%d/Models/%s" %
                         (rest_ver, future_model.job.dest_key))["models"][0]

    model_type = model_json["output"]["model_category"]
    if model_type == "Binomial":
        model = H2OBinomialModel(future_model.job.dest_key, model_json)
    elif model_type == "Clustering":
        model = H2OClusteringModel(future_model.job.dest_key, model_json)
    elif model_type == "Regression":
        model = H2ORegressionModel(future_model.job.dest_key, model_json)
    elif model_type == "Multinomial":
        model = H2OMultinomialModel(future_model.job.dest_key, model_json)
    elif model_type == "AutoEncoder":
        model = H2OAutoEncoderModel(future_model.job.dest_key, model_json)
    elif model_type == "DimReduction":
        model = H2ODimReductionModel(future_model.job.dest_key, model_json)
    elif model_type == "WordEmbedding":
        model = H2OWordEmbeddingModel(future_model.job.dest_key, model_json)
    else:
        raise NotImplementedError(model_type)
    return model
Example 35
    def __init__(self, model, tree_number, tree_class=None):
        params = {"model": model.model_id,
                  "tree_number": tree_number,
                  "tree_class": tree_class}
        response = h2o.api(endpoint="GET /3/Tree", data=params)

        self._left_children = response['left_children']
        self._right_children = response['right_children']
        self._node_ids = self.__extract_internal_ids(response['root_node_id'])
        self._descriptions = response['descriptions']
        self._model_id = model.model_id
        self._tree_number = response['tree_number']
        self._tree_class = response['tree_class']
        self._thresholds = self.__convert_threshold_nans(response['thresholds'])
        self._features = response['features']
        self._levels = self.__decode_categoricals(model, response['levels'])
        self._nas = response['nas']
        self._predictions = response['predictions']
        self._root_node = self.__assemble_tree(0)
        self._tree_decision_path = response['tree_decision_path']
        self._decision_paths = response['decision_paths']
        (left, right) = self.__per_node_cat_splits()
        self._left_cat_split = left
        self._right_cat_split = right
Example 36
    def model_performance(self, test_data=None, train=False, valid=False, xval=False):
        """
        Generate model metrics for this model on test_data.

        Parameters
        ----------
        test_data: H2OFrame, optional
          Data set against which model metrics shall be computed. All three of the train, valid, and xval arguments are
          ignored if test_data is not None.
        train: boolean, optional
          Report the training metrics for the model.
        valid: boolean, optional
          Report the validation metrics for the model.
        xval: boolean, optional
          Report the cross-validation metrics for the model. If train and valid are True, then it defaults to True.

        :returns: An object of class H2OModelMetrics.
        """
        if test_data is None:
            if not train and not valid and not xval: train = True  # default to train
            if train: return self._model_json["output"]["training_metrics"]
            if valid: return self._model_json["output"]["validation_metrics"]
            if xval: return self._model_json["output"]["cross_validation_metrics"]

        else:  # cases dealing with test_data not None
            if not isinstance(test_data, h2o.H2OFrame):
                raise ValueError("`test_data` must be of type H2OFrame.  Got: " + type(test_data))
            res = h2o.api("POST /3/ModelMetrics/models/%s/frames/%s" % (self.model_id, test_data.frame_id))

            # FIXME need to do the client-side filtering...  (PUBDEV-874)
            raw_metrics = None
            for mm in res["model_metrics"]:
                if mm["frame"] is not None and mm["frame"]["name"] == test_data.frame_id:
                    raw_metrics = mm
                    break
            return self._metrics_class(raw_metrics, algo=self._model_json["algo"])
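A brief usage sketch (assuming a trained model named model and a held-out H2OFrame named test; names are placeholders):

perf = model.model_performance(test_data=test)  # metrics computed on the server against `test`
print(perf)                                     # the available metrics depend on the model category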
Example 37
    def __init__(self,
                 nfolds=5,
                 balance_classes=False,
                 class_sampling_factors=None,
                 max_after_balance_size=5.0,
                 max_runtime_secs=3600,
                 max_models=None,
                 stopping_metric="AUTO",
                 stopping_tolerance=None,
                 stopping_rounds=3,
                 seed=None,
                 project_name=None,
                 exclude_algos=None,
                 keep_cross_validation_predictions=True,
                 keep_cross_validation_models=True,
                 sort_metric="AUTO"):

        # Check if H2O jar contains AutoML
        try:
            h2o.api("GET /3/Metadata/schemas/AutoMLV99")
        except h2o.exceptions.H2OResponseError as e:
            print(e)
            print("*******************************************************************\n" \
                  "*Please verify that your H2O jar has the proper AutoML extensions.*\n" \
                  "*******************************************************************\n" \
                  "\nVerbose Error Message:")

        
        # Make bare minimum build_control (if max_runtimes_secs is an invalid value, it will catch below)
        self.build_control = {
            'stopping_criteria': {
                'max_runtime_secs': max_runtime_secs,
            }
        }

        # Make bare minimum build_models
        self.build_models = {
            'exclude_algos': None
            #                [ "GLM", "DRF", "GBM", "DeepLearning", "StackedEnsemble"]
        }

        # nfolds must be a non-negative integer and not equal to 1:
        if nfolds != 5:
            assert_is_type(nfolds, int)
        assert nfolds >= 0, "nfolds set to " + str(nfolds) + "; nfolds cannot be negative. Use nfolds >=2 if you want cross-validated metrics and Stacked Ensembles or use nfolds = 0 to disable."
        assert nfolds != 1, "nfolds set to " + str(nfolds) + "; nfolds = 1 is an invalid value. Use nfolds >=2 if you want cross-validated metrics and Stacked Ensembles or use nfolds = 0 to disable."
        self.build_control["nfolds"] = nfolds
        self.nfolds = nfolds

        # Pass through to all algorithms
        if balance_classes is True:
            self.build_control["balance_classes"] = balance_classes
            self.balance_classes = balance_classes
        if class_sampling_factors is not None:
            self.build_control["class_sampling_factors"] = class_sampling_factors
            self.class_sampling_factors = class_sampling_factors
        if max_after_balance_size != 5.0:
            assert_is_type(max_after_balance_size,float)
            self.build_control["max_after_balance_size"] = max_after_balance_size
            self.max_after_balance_size = max_after_balance_size      

        # If max_runtime_secs is not provided, then it is set to default (3600 secs)
        if max_runtime_secs != 3600:
            assert_is_type(max_runtime_secs,int)
        self.max_runtime_secs = max_runtime_secs

        # Add other parameters to build_control if available
        if max_models is not None:
            assert_is_type(max_models,int)
            self.build_control["stopping_criteria"]["max_models"] = max_models
        self.max_models = max_models

        if stopping_metric != "AUTO":
            assert_is_type(stopping_metric,str)
        self.build_control["stopping_criteria"]["stopping_metric"] = stopping_metric
        self.stopping_metric = stopping_metric

        if stopping_tolerance is not None:
            assert_is_type(stopping_tolerance,float)
            self.build_control["stopping_criteria"]["stopping_tolerance"] = stopping_tolerance
        self.stopping_tolerance = stopping_tolerance

        if stopping_rounds != 3:
            assert_is_type(stopping_rounds,int)
        self.build_control["stopping_criteria"]["stopping_rounds"] = stopping_rounds
        self.stopping_rounds = stopping_rounds    

        if seed is not None:
            assert_is_type(seed,int)
            self.build_control["stopping_criteria"]["seed"] = seed
            self.seed = seed

        # Set project name if provided. If None, then we set in .train() to "automl_" + training_frame.frame_id
        if project_name is not None:
            assert_is_type(project_name,str)
            self.build_control["project_name"] = project_name
            self.project_name = project_name
        else:
            self.project_name = None

        if exclude_algos is not None:
            assert_is_type(exclude_algos,list)
            for elem in exclude_algos:
                assert_is_type(elem,str)
            self.build_models['exclude_algos'] = exclude_algos

        assert_is_type(keep_cross_validation_predictions, bool)
        self.build_control["keep_cross_validation_predictions"] = keep_cross_validation_predictions

        assert_is_type(keep_cross_validation_models, bool)
        self.build_control["keep_cross_validation_models"] = keep_cross_validation_models

        self._job = None
        self._leader_id = None
        self._leaderboard = None
        if sort_metric == "AUTO":
            self.sort_metric = None
        else:
            self.sort_metric = sort_metric
Example 38
    def train(self,
              x=None,
              y=None,
              training_frame=None,
              validation_frame=None,
              test_frame=None):
        """
        Begins the AutoML task, which is a background task that incrementally improves
        over time. At any point, the user may use the "predict"/"performance"
        functions to inspect the models built so far.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param test_frame: H2OFrame with test data to be scored on in the leaderboard.

        :returns: An H2OAutoML object.

        :examples:
        >>> # Set up an H2OAutoML object
        >>> build_control = {
        >>>              'stopping_criteria': {
        >>>              'stopping_rounds': 3,
        >>>              'stopping_tolerance': 0.001
        >>>            }
        >>>        }
        >>> aml = H2OAutoML(max_runtime_secs=30, build_control=build_control)
        >>> # Launch H2OAutoML
        >>> aml.train(y=y, training_frame=training_frame)
        """
        ncols = training_frame.ncols
        names = training_frame.names
        #Minimal required arguments are training_frame and y (response)
        if y is None:
            raise ValueError(
                'The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.'
            )
        else:
            assert_is_type(y, int, str)
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError(
                        "Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError(
                        "Column %s does not exist in the training frame" % y)
            input_spec = {
                'response_column': y,
            }

        if training_frame is None:
            raise ValueError('The training frame is not set!')
        else:
            assert_is_type(training_frame, H2OFrame)
            input_spec['training_frame'] = training_frame.frame_id

        if validation_frame is not None:
            assert_is_type(validation_frame, H2OFrame)
            input_spec['validation_frame'] = validation_frame.frame_id

        if test_frame is not None:
            assert_is_type(test_frame, H2OFrame)
            input_spec['test_frame'] = test_frame.frame_id

        if x is not None:
            assert_is_type(x, list)
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError(
                            "Column %d does not exist in the training frame" %
                            xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError(
                            "Column %s not in the training frame" % xi)
                    xset.add(xi)
            x = list(xset)
            ignored_columns = set(names) - {y} - set(x)
            input_spec['ignored_columns'] = list(ignored_columns)

        automl_build_params = dict(input_spec=input_spec)

        # NOTE: if the user hasn't specified some block of parameters don't send them!
        # This lets the back end use the defaults.
        automl_build_params['build_control'] = self.build_control

        resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
        if 'job' not in resp:
            print("Exception from the back end: ")
            print(resp)
            return

        self._job = H2OJob(resp['job'], "AutoML")
        self._automl_key = self._job.dest_key
        self._job.poll()
        self._fetch()
        if self.project_name is None:
            self.project_name = "automl_" + training_frame.frame_id
Example 39
    def _train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
              model_id=None, verbose=False, extend_parms_fn=None):
        has_default_training_frame = hasattr(self, 'training_frame') and self.training_frame is not None
        training_frame = H2OFrame._validate(training_frame, 'training_frame',
                                            required=self._requires_training_frame() and not has_default_training_frame)
        validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        assert_is_type(verbose, bool)
        assert_is_type(extend_parms_fn, None, FunctionType)

        override_default_training_frame = training_frame is not None
        if not override_default_training_frame:
            self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
            training_frame = self.training_frame if has_default_training_frame else None

        algo = self.algo
        if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
            raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
        parms = self._parms.copy()
        if algo=="pca" and "k" not in parms.keys():
            parms["k"] = 1
        if "__class__" in parms:  # FIXME: hackt for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest", "generic"})

        names = training_frame.names if training_frame is not None else []
        ncols = training_frame.ncols if training_frame is not None else 0
        types = training_frame.types if training_frame is not None else {}

        if is_supervised:
            if y is None: y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            self._estimator_type = "classifier" if types[y] == "enum" else "regressor"
        else:
            # If `y` is provided for an unsupervised model we'll simply ignore
            # it. This way an unsupervised model can be used as a step in
            # sklearn's pipeline.
            y = None

        if override_default_training_frame:
            assert_is_type(y, str, None)
            ignored_columns_set = set()
            if ignored_columns is None and "ignored_columns" in parms:
                ignored_columns = parms['ignored_columns']
            if ignored_columns is not None:
                if x is not None:
                    raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
                for ic in ignored_columns:
                    if is_type(ic, int):
                        if not (-ncols <= ic < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % ic)
                        ignored_columns_set.add(names[ic])
                    else:
                        if ic not in names:
                            raise H2OValueError("Column %s not in the training frame" % ic)
                        ignored_columns_set.add(ic)
            if x is None:
                xset = set(names) - {y} - ignored_columns_set
            else:
                xset = set()
                if is_type(x, int, str): x = [x]
                for xi in x:
                    if is_type(xi, int):
                        if not (-ncols <= xi < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % xi)
                        xset.add(names[xi])
                    else:
                        if xi not in names:
                            raise H2OValueError("Column %s not in the training frame" % xi)
                        xset.add(xi)
            x = list(xset)
            self._check_and_save_parm(parms, "offset_column", offset_column)
            self._check_and_save_parm(parms, "weights_column", weights_column)
            self._check_and_save_parm(parms, "fold_column", fold_column)

        if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs

        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        # Step 2
        is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
        is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec", "isolationforest"}
        if is_auto_encoder and y is not None:
            raise ValueError("y should not be specified for autoencoder.")
        if not is_unsupervised and y is None and self.algo not in ["generic"]:
            raise ValueError("Missing response")

        # Step 3
        if override_default_training_frame:
            parms["training_frame"] = training_frame
            offset = parms["offset_column"]
            folds = parms["fold_column"]
            weights = parms["weights_column"]

        if validation_frame is not None:
            parms["validation_frame"] = validation_frame

        if is_type(y, int):
            y = names[y]
        if y is not None:
            parms["response_column"] = y
        if not isinstance(x, (list, tuple)):
            x = [x]
        if is_type(x[0], int):
            x = [names[i] for i in x]
        if override_default_training_frame:
            ignored_columns = list(set(names) - set(x + [y, offset, folds, weights] + self._additional_used_columns(parms)))
            parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                                 else [quoted(col) for col in parms["interactions"]])
        parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None
                                      else [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])
    
        # internal hook allowing subclasses to extend train parms 
        if extend_parms_fn is not None:
            extend_parms_fn(parms)
            
        parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
        if ("stopping_metric" in parms.keys()) and ("r2" in parms["stopping_metric"]):
            raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
        rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

        model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
        model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

        if self._future:
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll(poll_updates=self._print_model_scoring_history if verbose else None)
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
Example 40
 def network_test(self):
     """Test network connectivity."""
     res = h2o.api("GET /3/NetworkTest")
     res["table"].show()
Example 41
 def _list_extensions(self, endpoint):
     res = h2o.api("GET /3/" + endpoint)["capabilities"]
     return [x["name"] for x in res]
Example 42
    def train(self, x=None, y=None, training_frame=None, fold_column=None,
              weights_column=None, validation_frame=None, leaderboard_frame=None, blending_frame=None):
        """
        Begins an AutoML task, a background task that automatically builds a number of models
        with various algorithms and tracks their performance in a leaderboard. At any point 
        in the process you may use H2O's performance or prediction functions on the resulting 
        models.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param fold_column: The name or index of the column in training_frame that holds per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds per-row weights.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold_column or weights_column).
        :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets 
            nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used 
            for early stopping of individual models and early stopping of the grid searches.  By default and 
            when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.
        :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard.  This is optional and
            if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard 
            rankings instead.
        :param blending_frame: H2OFrame used to train the metalearning algorithm in Stacked Ensembles (instead of relying on cross-validated predicted values).
            This is optional, but when provided, it is also recommended to disable cross validation 
            by setting `nfolds=0` and to provide a leaderboard frame for scoring purposes.

        :returns: An H2OAutoML object.

        :examples:
        >>> # Set up an H2OAutoML object
        >>> aml = H2OAutoML(max_runtime_secs=30)
        >>> # Launch an AutoML run
        >>> aml.train(y=y, training_frame=train)
        """
        training_frame = H2OFrame._validate(training_frame, 'training_frame', required=True)
        ncols = training_frame.ncols
        names = training_frame.names

        # Minimal required arguments are training_frame and y (response)
        if y is None:
            raise H2OValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')
        else:
            assert_is_type(y,int,str)
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            input_spec = {
                'response_column': y,
            }

        input_spec['training_frame'] = training_frame.frame_id

        if fold_column is not None:
            assert_is_type(fold_column,int,str)
            input_spec['fold_column'] = fold_column

        if weights_column is not None:
            assert_is_type(weights_column,int,str)
            input_spec['weights_column'] = weights_column

        if validation_frame is not None:
            validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
            input_spec['validation_frame'] = validation_frame.frame_id

        if leaderboard_frame is not None:
            leaderboard_frame = H2OFrame._validate(leaderboard_frame, 'leaderboard_frame')
            input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

        if blending_frame is not None:
            blending_frame = H2OFrame._validate(blending_frame, 'blending_frame')
            input_spec['blending_frame'] = blending_frame.frame_id

        if self.sort_metric is not None:
            assert_is_type(self.sort_metric, str)
            sort_metric = self.sort_metric.lower()
            # Changed the API to use "deviance" to be consistent with stopping_metric values
            # TO DO: let's change the backend to use "deviance" since we use the term "deviance"
            # After that we can take this `if` statement out
            if sort_metric == "deviance":
                sort_metric = "mean_residual_deviance"
            input_spec['sort_metric'] = sort_metric

        if x is not None:
            if is_type(x, int, str): x = [x]
            assert_is_type(x, list)
            xset = set()
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
            x = list(xset)
            ignored_columns = set(names) - {y} - set(x)
            if fold_column is not None and fold_column in ignored_columns:
                ignored_columns.remove(fold_column)
            if weights_column is not None and weights_column in ignored_columns:
                ignored_columns.remove(weights_column)
            if ignored_columns is not None:
                input_spec['ignored_columns'] = list(ignored_columns)

        automl_build_params = dict(input_spec=input_spec)

        # NOTE: if the user hasn't specified some block of parameters don't send them!
        # This lets the back end use the defaults.
        automl_build_params['build_control'] = self.build_control
        automl_build_params['build_models'] = self.build_models

        resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
        if 'job' not in resp:
            print("Exception from the back end: ")
            print(resp)
            return

        if not self.project_name:
            self.build_control['project_name'] = self.project_name = resp['build_control']['project_name']

        self._job = H2OJob(resp['job'], "AutoML")
        poll_updates = ft.partial(self._poll_training_updates, verbosity=self._verbosity, state={})
        try:
            self._job.poll(poll_updates=poll_updates)
        finally:
            poll_updates(self._job, 1)

        self._fetch()
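The train() method above wires the optional frames into input_spec before launching the AutoML job; a minimal end-to-end sketch, assuming a running cluster and using a public sample dataset (any binary-classification frame works the same way):

import h2o
from h2o.automl import H2OAutoML

h2o.init()
df = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
df["survived"] = df["survived"].asfactor()
train, test = df.split_frame(ratios=[0.8], seed=1234)

aml = H2OAutoML(max_models=10, seed=1234)
# leaderboard_frame is optional; if omitted, the leaderboard is ranked on cross-validation metrics.
aml.train(y="survived", training_frame=train, leaderboard_frame=test)
print(aml.leaderboard.head())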
Esempio n. 43
0
def test_zipped_rf_model():
    """
    Test the correctness of the "zipped" model format.

    This test will create a random dataset, split into training/testing part, train a DRF model on it,
    download the model's data, score the model remotely and fetch the predictions, score the model locally by
    running the genmodel jar, and finally compare the prediction results.
    """
    genmodel_jar = os.path.abspath("../../../h2o-genmodel/build/libs/h2o-genmodel-all.jar")
    assert os.path.exists(genmodel_jar), "Cannot find " + genmodel_jar

    target_dir = ""
    if sys.platform == "win32":
        target_dir = tempfile.mkdtemp()
    else:
        target_dir = os.path.expanduser("~/Downloads/")

    report = []
    for estimator in [H2ORandomForestEstimator, H2OGradientBoostingEstimator]:
        print(colorama.Fore.LIGHTYELLOW_EX + "\n#================================================")
        print("#  Estimator: " + estimator.__name__)
        print("#================================================\n" + colorama.Fore.RESET)
        estimator_name = "GBM" if estimator == H2OGradientBoostingEstimator else "DRF"
        for problem in ["binomial", "multinomial", "regression"]:
            print("========================")
            print("%s problem" % problem.capitalize())
            print("========================")
            df = random_dataset(problem, verbose=False)
            print("Created dataset with %d rows x %d columns" % (df.nrow, df.ncol))
            test = df[:NTESTROWS, :]
            train = df[NTESTROWS:, :]
            test2 = test.rbind(test)

            time0 = time.time()
            print("\n\nTraining Random Forest model...")
            model = estimator(ntrees=NTREES, max_depth=DEPTH)
            model.train(training_frame=train)
            print(model.summary())
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nSaving the model...")
            time0 = time.time()
            model_file = h2o.api("GET /3/Models/%s/data" % model.model_id, save_to=target_dir)
            print("    => %s  (%d bytes)" % (model_file, os.stat(model_file).st_size))
            assert os.path.exists(model_file)
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nDownloading POJO...")
            time0 = time.time()
            pojo_file = h2o.download_pojo(model, target_dir, get_jar=False)
            pojo_size = os.stat(pojo_file).st_size
            pojo_name = os.path.splitext(os.path.basename(pojo_file))[0]
            print("    => %s  (%d bytes)" % (pojo_file, pojo_size))
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nDownloading the test datasets for local use: ", end="")
            time0 = time.time()
            test_file = os.path.join(target_dir, "test_%s.csv" % test.frame_id)
            test2_file = os.path.join(target_dir, "test2_%s.csv" % test2.frame_id)
            print(test_file)
            h2o.download_csv(test, test_file)
            h2o.download_csv(test2, test2_file)
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nScoring the model remotely and downloading to file ", end="")
            times = [time.time()]
            h2o_pred_file = os.path.join(target_dir, "predR_%s.csv" % test.frame_id)
            h2o_pred_file2 = os.path.join(target_dir, "predR_%s.csv" % test2.frame_id)
            print(h2o_pred_file)
            for testframe, outfile in [(test, h2o_pred_file), (test2, h2o_pred_file2)]:
                predictions = model.predict(testframe)
                h2o.download_csv(predictions, outfile)
                times.append(time.time())
            print("Time taken = %.3fs   (1st run: %.3f, 2nd run: %.3f)" %
                  (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
            report.append((estimator_name, problem, "Server", times[1] - times[0], times[2] - times[1]))

            print("\nScoring the model locally and saving to file ", end="")
            times = [time.time()]
            local_pred_file = os.path.join(target_dir, "predL_%s.csv" % test.frame_id)
            local_pred_file2 = os.path.join(target_dir, "predL_%s.csv" % test2.frame_id)
            print(local_pred_file)
            for inpfile, outfile in [(test_file, local_pred_file), (test2_file, local_pred_file2)]:
                load_csv(inpfile)
                ret = subprocess.call(["java", "-cp", genmodel_jar,
                                       "-ea", "-Xmx12g", "-XX:ReservedCodeCacheSize=256m",
                                       "hex.genmodel.tools.PredictCsv",
                                       "--input", inpfile, "--output", outfile, "--model", model_file, "--decimal"])
                assert ret == 0, "GenModel finished with return code %d" % ret
                times.append(time.time())
            print("Time taken = %.3fs   (1st run: %.3f, 2nd run: %.3f)" %
                  (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
            report.append((estimator_name, problem, "Zipped", times[1] - times[0], times[2] - times[1]))

            pojo_pred_file = None
            if pojo_size <= 1000 << 20:  # 1000 Mb
                time0 = time.time()
                print("\nCompiling Java Pojo")
                javac_cmd = ["javac", "-cp", genmodel_jar, "-J-Xmx12g", pojo_file]
                subprocess.check_call(javac_cmd)
                print("Time taken = %.3fs" % (time.time() - time0))

                pojo_pred_file = os.path.join(target_dir, "predP_%s.csv" % test.frame_id)
                pojo_pred_file2 = os.path.join(target_dir, "predP_%s.csv" % test2.frame_id)
                print("Scoring POJO and saving to file %s" % pojo_pred_file)
                times = [time.time()]
                cp_sep = ";" if sys.platform == "win32" else ":"
                for inpfile, outfile in [(test_file, pojo_pred_file), (test2_file, pojo_pred_file2)]:
                    load_csv(inpfile)
                    java_cmd = ["java", "-cp", cp_sep.join([genmodel_jar, target_dir]),
                                "-ea", "-Xmx12g", "-XX:ReservedCodeCacheSize=256m", "-XX:MaxPermSize=256m",
                                "hex.genmodel.tools.PredictCsv",
                                "--pojo", pojo_name, "--input", inpfile, "--output", outfile, "--decimal"]
                    ret = subprocess.call(java_cmd)
                    assert ret == 0, "GenModel finished with return code %d" % ret
                    times.append(time.time())
                print("Time taken = %.3fs   (1st run: %.3f, 2nd run: %.3f)" %
                      (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
                report.append((estimator_name, problem, "POJO", times[1] - times[0], times[2] - times[1]))


            print("\nChecking whether the predictions coincide...")
            time0 = time.time()
            local_pred = load_csv(local_pred_file)
            server_pred = load_csv(h2o_pred_file)
            pojo_pred = load_csv(pojo_pred_file) if pojo_pred_file else local_pred
            assert len(local_pred) == len(server_pred) == len(pojo_pred) == test.nrow, \
                "Number of rows in prediction files do not match: %d vs %d vs %d vs %d" % \
                (len(local_pred), len(server_pred), len(pojo_pred), test.nrow)
            for i in range(test.nrow):
                lpred = local_pred[i]
                rpred = server_pred[i]
                ppred = pojo_pred[i]
                assert type(lpred) == type(rpred) == type(ppred), \
                    "Types of predictions do not match: %r / %r / %r" % (lpred, rpred, ppred)
                if isinstance(lpred, float):
                    same = abs(lpred - rpred) + abs(lpred - ppred) < 1e-8
                else:
                    same = lpred == rpred == ppred
                assert same, \
                    "Predictions are different for row %d: local=%r, pojo=%r, bomo=%r" % (i + 1, lpred, ppred, rpred)
            print("Time taken = %.3fs" % (time.time() - time0))
            print(colorama.Fore.LIGHTGREEN_EX + "\nPredictions match!\n" + colorama.Fore.RESET)

    print(colorama.Fore.LIGHTYELLOW_EX + "\n\n#================================================")
    print("#  Timing report")
    print("#================================================\n" + colorama.Fore.RESET)
    print(tabulate.tabulate(report,
          headers=["Model", "Problem type", "Scorer", "10000 rows", "20000 rows"],
          floatfmt=".3f"), end="\n\n\n")
Esempio n. 44
0
    def __init__(self,
                 nfolds=5,
                 balance_classes=False,
                 class_sampling_factors=None,
                 max_after_balance_size=5.0,
                 max_runtime_secs=None,
                 max_runtime_secs_per_model=None,
                 max_models=None,
                 stopping_metric="AUTO",
                 stopping_tolerance=None,
                 stopping_rounds=3,
                 seed=None,
                 project_name=None,
                 exclude_algos=None,
                 include_algos=None,
                 exploitation_ratio=-1,
                 modeling_plan=None,
                 preprocessing=None,
                 monotone_constraints=None,
                 keep_cross_validation_predictions=False,
                 keep_cross_validation_models=False,
                 keep_cross_validation_fold_assignment=False,
                 sort_metric="AUTO",
                 export_checkpoints_dir=None,
                 verbosity="warn",
                 **kwargs):
        """
        Create a new H2OAutoML instance.
        
        :param int nfolds: Number of folds for k-fold cross-validation.
            Use ``0`` to disable cross-validation; this will also disable Stacked Ensemble (thus decreasing the overall model performance).
            Defaults to ``5``.

        :param bool balance_classes: Specify whether to oversample the minority classes to balance the class distribution. This option can increase
            the data frame size. This option is only applicable for classification. If the oversampled size of the dataset exceeds the maximum size
            calculated using the ``max_after_balance_size`` parameter, then the majority classes will be undersampled to satisfy the size limit.
            Defaults to ``False``.
        :param class_sampling_factors: Desired over/under-sampling ratios per class (in lexicographic order).
            If not specified, sampling factors will be automatically computed to obtain class balance during training. Requires ``balance_classes`` set to ``True``.
        :param float max_after_balance_size: Maximum relative size of the training data after balancing class counts (can be less than 1.0).
            Requires ``balance_classes``.
            Defaults to ``5.0``.
        :param int max_runtime_secs: Specify the maximum time that the AutoML process will run for.
            If neither ``max_runtime_secs`` nor ``max_models`` are specified by the user, then ``max_runtime_secs`` dynamically
            defaults to 3600 seconds (1 hour). Otherwise, defaults to ``0`` (no limit).
        :param int max_runtime_secs_per_model: Controls the max time the AutoML run will dedicate to each individual model.
            Defaults to ``0`` (disabled: no time limit).
        :param int max_models: Specify the maximum number of models to build in an AutoML run, excluding the Stacked Ensemble models.
            Defaults to ``None`` (disabled: no limitation).
        :param str stopping_metric: Specifies the metric to use for early stopping. 
            The available options are:
            ``"AUTO"`` (This defaults to ``"logloss"`` for classification, ``"deviance"`` for regression),
            ``"deviance"``, ``"logloss"``, ``"mse"``, ``"rmse"``, ``"mae"``, ``"rmsle"``, ``"auc"``, ``aucpr``, ``"lift_top_group"``,
            ``"misclassification"``, ``"mean_per_class_error"``, ``"r2"``.
            Defaults to ``"AUTO"``.
        :param float stopping_tolerance: Specify the relative tolerance for the metric-based stopping criterion to stop a grid search and
            the training of individual models within the AutoML run.
            Defaults to ``0.001`` if the dataset is at least 1 million rows;
            otherwise it defaults to a value determined by the size of the dataset and the non-NA-rate, in which case the value is computed as 1/sqrt(nrows * non-NA-rate).
        :param int stopping_rounds: Stop training new models in the AutoML run when the option selected for
            ``stopping_metric`` doesn't improve for the specified number of models, based on a simple moving average.
            To disable this feature, set it to ``0``.
            Defaults to ``3`` and must be a non-negative integer.
        :param int seed: Set a seed for reproducibility. 
            AutoML can only guarantee reproducibility if ``max_models`` or early stopping is used because ``max_runtime_secs`` is resource limited, 
            meaning that if the resources are not the same between runs, AutoML may be able to train more models on one run vs another.
            In addition, H2O Deep Learning models are not reproducible by default for performance reasons, so ``exclude_algos`` must contain ``DeepLearning``.
            Defaults to ``None``.
        :param str project_name: Character string to identify an AutoML project.
            Defaults to ``None``, which means a project name will be auto-generated based on the training frame ID.
            More models can be trained on an existing AutoML project by specifying the same project name in multiple calls to the AutoML function
            (as long as the same training frame, or a sample, is used in subsequent runs).
        :param exclude_algos: List the algorithms to skip during the model-building phase. 
            The full list of options is:
            
                - ``"DRF"`` (Random Forest and Extremely-Randomized Trees)
                - ``"GLM"``
                - ``"XGBoost"``
                - ``"GBM"``
                - ``"DeepLearning"``
                - ``"StackedEnsemble"``
                
            Defaults to ``None``, which means that all appropriate H2O algorithms will be used, if the search stopping criteria allow. Optional.
            Usage example::
            
                exclude_algos = ["GLM", "DeepLearning", "DRF"]
                
        :param include_algos: List the algorithms to restrict to during the model-building phase.
            This can't be used in combination with ``exclude_algos`` param.
            Defaults to ``None``, which means that all appropriate H2O algorithms will be used, if the search stopping criteria allow. Optional.
            Usage example::

                include_algos = ["GLM", "DeepLearning", "DRF"]
                
        :param exploitation_ratio: The budget ratio (between 0 and 1) dedicated to the exploitation (vs exploration) phase.
            By default, the exploitation phase is ``0`` (disabled) as this is still experimental;
            to activate it, it is recommended to try a ratio around 0.1.
            Note that the current exploitation phase only tries to fine-tune the best XGBoost and the best GBM found during exploration.
        :param modeling_plan: List of modeling steps to be used by the AutoML engine (they may not all get executed, depending on other constraints).
            Defaults to ``None`` (Expert usage only).
        :param preprocessing: List of preprocessing steps to run. Only ``["target_encoding"]`` is currently supported. Experimental.
        :param monotone_constraints: A mapping that represents monotonic constraints.
            Use ``+1`` to enforce an increasing constraint and ``-1`` to specify a decreasing constraint.
        :param keep_cross_validation_predictions: Whether to keep the cross-validation predictions.
            This needs to be set to ``True`` if running the same AutoML object for repeated runs because CV predictions are required to build 
            additional Stacked Ensemble models in AutoML. 
            Defaults to ``False``.
        :param keep_cross_validation_models: Whether to keep the cross-validated models.
            Keeping cross-validation models may consume significantly more memory in the H2O cluster.
            Defaults to ``False``.
        :param keep_cross_validation_fold_assignment: Whether to keep fold assignments in the models.
            Deleting them will save memory in the H2O cluster. 
            Defaults to ``False``.
        :param sort_metric: Metric to sort the leaderboard by at the end of an AutoML run. 
            For binomial classification choose between ``"auc"``, ``"aucpr"``, ``"logloss"``, ``"mean_per_class_error"``, ``"rmse"``, ``"mse"``.
            For multinomial classification choose between ``"mean_per_class_error"``, ``"logloss"``, ``"rmse"``, ``"mse"``.
            For regression choose between ``"deviance"``, ``"rmse"``, ``"mse"``, ``"mae"``, ``"rmsle"``.
            Defaults to ``"AUTO"`` (This translates to ``"auc"`` for binomial classification, ``"mean_per_class_error"`` for multinomial classification, ``"deviance"`` for regression).
        :param export_checkpoints_dir: Path to a directory where every model will be stored in binary form.
        :param verbosity: Verbosity of the backend messages printed during training.
            Available options are ``None`` (live log disabled), ``"debug"``, ``"info"``, ``"warn"`` or ``"error"``.
            Defaults to ``"warn"``.
        """

        # early validate kwargs, extracting hidden parameters:
        algo_parameters = {}
        for k in kwargs:
            if k == 'algo_parameters':
                algo_parameters = kwargs[k] or {}
            else:
                raise TypeError(
                    "H2OAutoML got an unexpected keyword argument '%s'" % k)

        # Check if H2O jar contains AutoML
        try:
            h2o.api("GET /3/Metadata/schemas/AutoMLV99")
        except h2o.exceptions.H2OResponseError as e:
            print(e)
            print("*******************************************************************\n" \
                  "*Please verify that your H2O jar has the proper AutoML extensions.*\n" \
                  "*******************************************************************\n" \
                  "\nVerbose Error Message:")

        self._job = None
        self._leader_id = None
        self._leaderboard = None
        self._verbosity = verbosity
        self._event_log = None
        self._training_info = None
        self._state_json = None
        self._build_resp = None  # contains all the actual parameters used on backend

        self.__frozen = False
        self.__input = dict(
        )  # contains all the input params as entered by the user

        # Make bare minimum params containers
        self.build_control = dict()
        self.build_models = dict()
        self.input_spec = dict()

        self.project_name = project_name
        self.nfolds = nfolds
        self.balance_classes = balance_classes
        self.class_sampling_factors = class_sampling_factors
        self.max_after_balance_size = max_after_balance_size
        self.keep_cross_validation_models = keep_cross_validation_models
        self.keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment
        self.keep_cross_validation_predictions = keep_cross_validation_predictions
        self.export_checkpoints_dir = export_checkpoints_dir

        self.max_runtime_secs = max_runtime_secs
        self.max_runtime_secs_per_model = max_runtime_secs_per_model
        self.max_models = max_models
        self.stopping_metric = stopping_metric
        self.stopping_tolerance = stopping_tolerance
        self.stopping_rounds = stopping_rounds
        self.seed = seed
        self.exclude_algos = exclude_algos
        self.include_algos = include_algos
        self.exploitation_ratio = exploitation_ratio
        self.modeling_plan = modeling_plan
        self.preprocessing = preprocessing
        if monotone_constraints is not None:
            algo_parameters['monotone_constraints'] = monotone_constraints
        self._algo_parameters = algo_parameters

        self.sort_metric = sort_metric
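A brief usage sketch combining several of the documented parameters (values are illustrative only):

from h2o.automl import H2OAutoML

aml = H2OAutoML(max_models=20,
                seed=1,
                nfolds=5,
                exclude_algos=["DeepLearning"],
                sort_metric="logloss",
                keep_cross_validation_predictions=True,
                verbosity="info")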
Esempio n. 45
0
    def transform(self,
                  frame,
                  blending=None,
                  inflection_point=None,
                  smoothing=None,
                  noise=None,
                  as_training=False,
                  **kwargs):
        """
        Apply transformation to `te_columns` based on the encoding maps generated during `train()` method call.

        :param H2OFrame frame: the frame on which to apply the target encoding transformations.
        :param boolean blending: If provided, this overrides the `blending` parameter on the model.
        :param float inflection_point: If provided, this overrides the `inflection_point` parameter on the model.
        :param float smoothing: If provided, this overrides the `smoothing` parameter on the model.
        :param float noise: If provided, this overrides the amount of random noise added to the target encoding defined on the model; this helps prevent overfitting.
        :param boolean as_training: Must be set to True when encoding the training frame. Defaults to False.

        :example:
        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> predictors = ["home.dest", "cabin", "embarked"]
        >>> response = "survived"
        >>> titanic[response] = titanic[response].asfactor()
        >>> fold_col = "kfold_column"
        >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
        >>> titanic_te = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out",
        ...                                        inflection_point=35,
        ...                                        smoothing=25,
        ...                                        blending=True,
        ...                                        seed=1234)
        >>> titanic_te.train(x=predictors,
        ...                  y=response,
        ...                  training_frame=titanic)
        >>> transformed = titanic_te.transform(frame=titanic)
        """
        for k in kwargs:
            if k in ['seed', 'data_leakage_handling']:
                warnings.warn(
                    "`%s` is deprecated in `transform` method and will be ignored. "
                    "Instead, please ensure that it was set before training on the H2OTargetEncoderEstimator model."
                    % k, H2ODeprecationWarning)
            else:
                raise TypeError(
                    "transform() got an unexpected keyword argument '%s'" % k)

        if 'data_leakage_handling' in kwargs:
            dlh = kwargs['data_leakage_handling']
            assert_is_type(dlh, None, Enum("leave_one_out", "k_fold", "none"))
            if dlh is not None and dlh.lower() != "none":
                warnings.warn(
                    "Deprecated `data_leakage_handling=%s` is replaced by `as_training=True`. "
                    "Please update your code." % dlh, H2ODeprecationWarning)
                as_training = True

        params = dict(
            model=self.model_id,
            frame=frame.key,
            blending=blending if blending is not None else self.blending,  # always need to provide blending here as we can't represent an unset value
            inflection_point=inflection_point,
            smoothing=smoothing,
            noise=noise,
            as_training=as_training,
        )

        output = h2o.api("GET /3/TargetEncoderTransform", data=params)
        return h2o.get_frame(output["name"])
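Continuing the docstring example above (reusing titanic and titanic_te from it), the same estimator can encode the training frame itself and any new frame at scoring time; a short sketch:

# Encoding the frame used for training: as_training=True applies the configured leakage handling.
train_encoded = titanic_te.transform(frame=titanic, as_training=True)
# Encoding data at scoring time: plain target encoding, here with the random noise turned off.
scored_encoded = titanic_te.transform(frame=titanic, noise=0.0)
print(scored_encoded.head())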
Esempio n. 46
0
    def __init__(self,
                 max_runtime_secs=3600,
                 max_models=None,
                 stopping_metric="AUTO",
                 stopping_tolerance=0.001,
                 stopping_rounds=3,
                 seed=None,
                 project_name=None):

        #Check if H2O jar contains AutoML
        try:
            h2o.api("GET /3/Metadata/schemas/AutoMLV99")
        except h2o.exceptions.H2OResponseError as e:
            print(e)
            print("*******************************************************************\n" \
                  "*Please verify that your H2O jar has the proper AutoML extensions.*\n" \
                  "*******************************************************************\n" \
                  "\nVerbose Error Message:")

        #If max_runtime_secs is not provided, it keeps its default value (3600 secs)
        if max_runtime_secs != 3600:
            assert_is_type(max_runtime_secs, int)
        self.max_runtime_secs = max_runtime_secs

        #Make bare minimum build_control
        self.build_control = {
            'stopping_criteria': {
                'max_runtime_secs': self.max_runtime_secs,
            }
        }

        #Add other parameters to build_control if available
        if max_models is not None:
            assert_is_type(max_models,int)
            self.build_control["stopping_criteria"]["max_models"] = max_models
            self.max_models = max_models

        if stopping_metric != "AUTO":
            assert_is_type(stopping_metric, str)
        self.build_control["stopping_criteria"]["stopping_metric"] = stopping_metric
        self.stopping_metric = stopping_metric

        if stopping_tolerance != 0.001:
            assert_is_type(stopping_tolerance, float)
        self.build_control["stopping_criteria"]["stopping_tolerance"] = stopping_tolerance
        self.stopping_tolerance = stopping_tolerance

        if stopping_rounds != 3:
            assert_is_type(stopping_rounds, int)
        self.build_control["stopping_criteria"]["stopping_rounds"] = stopping_rounds
        self.stopping_rounds = stopping_rounds

        if seed is not None:
            assert_is_type(seed,int)
            self.build_control["stopping_criteria"]["seed"] = seed
            self.seed = seed

        #Set project name if provided. If None, then we set in .train() to "automl_" + training_frame.frame_id
        if project_name is not None:
            assert_is_type(project_name,str)
            self.build_control["project_name"] = project_name
            self.project_name = project_name
        else:
            self.project_name = None

        self._job = None
        self._automl_key = None
        self._leader_id = None
        self._leaderboard = None
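The constructor above only assembles a build_control payload that train() later posts to the AutoMLBuilder endpoint; as an illustration, a hypothetical call such as H2OAutoML(max_runtime_secs=600, max_models=10, seed=42, project_name="demo") would produce roughly:

build_control = {
    'stopping_criteria': {
        'max_runtime_secs': 600,
        'max_models': 10,
        'stopping_metric': 'AUTO',
        'stopping_tolerance': 0.001,
        'stopping_rounds': 3,
        'seed': 42,
    },
    'project_name': 'demo',
}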
Esempio n. 47
0
 def _get_params(self):
     res = h2o.api("GET /99/AutoML/" + self._automl_key)
     return res
Esempio n. 48
0
    def train(self, x=None, y=None, training_frame=None, fold_column=None,
              weights_column=None, validation_frame=None, leaderboard_frame=None):
        """
        Begins an AutoML task, a background task that automatically builds a number of models
        with various algorithms and tracks their performance in a leaderboard. At any point 
        in the process you may use H2O's performance or prediction functions on the resulting 
        models.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param fold_column: The name or index of the column in training_frame that holds per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds per-row weights.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param leaderboard_frame: H2OFrame with test data to be scored on in the leaderboard.

        :returns: An H2OAutoML object.

        :examples:
        >>> # Set up an H2OAutoML object
        >>> aml = H2OAutoML(max_runtime_secs=30)
        >>> # Launch H2OAutoML
        >>> aml.train(y=y, training_frame=training_frame)
        """
        ncols = training_frame.ncols
        names = training_frame.names

        #Minimal required arguments are training_frame and y (response)
        if y is None:
            raise ValueError('The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.')
        else:
            assert_is_type(y,int,str)
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            input_spec = {
                'response_column': y,
            }

        if training_frame is None:
            raise ValueError('The training frame is not set!')
        else:
            assert_is_type(training_frame, H2OFrame)
            input_spec['training_frame'] = training_frame.frame_id

        if fold_column is not None:
            assert_is_type(fold_column,int,str)
            input_spec['fold_column'] = fold_column

        if weights_column is not None:
            assert_is_type(weights_column,int,str)
            input_spec['weights_column'] = weights_column

        if validation_frame is not None:
            assert_is_type(validation_frame, H2OFrame)
            input_spec['validation_frame'] = validation_frame.frame_id

        if leaderboard_frame is not None:
            assert_is_type(leaderboard_frame, H2OFrame)
            input_spec['leaderboard_frame'] = leaderboard_frame.frame_id

        if x is not None:
            if is_type(x, int, str): x = [x]
            assert_is_type(x, list)
            xset = set()
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
            x = list(xset)
            ignored_columns = set(names) - {y} - set(x)
            if fold_column is not None and fold_column in ignored_columns:
                ignored_columns.remove(fold_column)
            if weights_column is not None and weights_column in ignored_columns:
                ignored_columns.remove(weights_column)
            if ignored_columns is not None:
                input_spec['ignored_columns'] = list(ignored_columns)

        automl_build_params = dict(input_spec = input_spec)

        # NOTE: if the user hasn't specified some block of parameters don't send them!
        # This lets the back end use the defaults.
        automl_build_params['build_control'] = self.build_control

        resp = h2o.api('POST /99/AutoMLBuilder', json=automl_build_params)
        if 'job' not in resp:
            print("Exception from the back end: ")
            print(resp)
            return

        self._job = H2OJob(resp['job'], "AutoML")
        self._automl_key = self._job.dest_key
        self._job.poll()
        self._fetch()
Esempio n. 49
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
              model_id=None, verbose=False):
        """
        Train the H2O model.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
        :param bool verbose: Print scoring history to stdout. Defaults to False.
        """

        assert_is_type(training_frame, None, H2OFrame)
        assert_is_type(validation_frame, None, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        assert_is_type(verbose, bool)

        if self._requires_training_frame() and training_frame is None:
            raise H2OValueError("Training frame required for %s algorithm, but none was given.", self.algo)

        training_frame_exists = training_frame is None
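        # NOTE: despite its name, training_frame_exists is True when *no* training frame was provided.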
        if training_frame_exists:
            self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

        algo = self.algo
        if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
            raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
        parms = self._parms.copy()
        if "__class__" in parms:  # FIXME: hackt for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"})
        if not training_frame_exists:
            names = training_frame.names
            ncols = training_frame.ncols

        if is_supervised:
            if y is None: y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
        else:
            # If `y` is provided for an unsupervised model we'll simply ignore
            # it. This way an unsupervised model can be used as a step in
            # sklearn's pipeline.
            y = None

        if not training_frame_exists:
            assert_is_type(y, str, None)
            ignored_columns_set = set()
            if ignored_columns is not None:
                if x is not None:
                    raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
                for ic in ignored_columns:
                    if is_type(ic, int):
                        if not (-ncols <= ic < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % ic)
                        ignored_columns_set.add(names[ic])
                    else:
                        if ic not in names:
                            raise H2OValueError("Column %s not in the training frame" % ic)
                        ignored_columns_set.add(ic)
            if x is None:
                xset = set(names) - {y} - ignored_columns_set
            else:
                xset = set()
                if is_type(x, int, str): x = [x]
                for xi in x:
                    if is_type(xi, int):
                        if not (-ncols <= xi < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % xi)
                        xset.add(names[xi])
                    else:
                        if xi not in names:
                            raise H2OValueError("Column %s not in the training frame" % xi)
                        xset.add(xi)
            x = list(xset)

            parms["offset_column"] = offset_column
            parms["fold_column"] = fold_column
            parms["weights_column"] = weights_column

        if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs

        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        # Step 2
        is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
        is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"}
        if is_auto_encoder and y is not None: raise ValueError("y should not be specified for autoencoder.")
        if not is_unsupervised and y is None: raise ValueError("Missing response")

        # Step 3
        if not training_frame_exists:
            parms["training_frame"] = training_frame
            offset = parms["offset_column"]
            folds = parms["fold_column"]
            weights = parms["weights_column"]

        if validation_frame is not None: parms["validation_frame"] = validation_frame
        if is_type(y, int): y = training_frame.names[y]
        if y is not None: parms["response_column"] = y
        if not isinstance(x, (list, tuple)): x = [x]
        if is_type(x[0], int):
            x = [training_frame.names[i] for i in x]
        if not training_frame_exists:
            ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
            parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else
                                 [quoted(col) for col in parms["interactions"]])
        parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None else
                                      [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

        parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
        if ("stopping_metric" in parms.keys()) and ("r2" in parms["stopping_metric"]):
            raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
        rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

        model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
        model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

        if self._future:
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll(verbose_model_scoring_history=verbose)
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
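The docstring above describes the generic training entry point shared by all H2O estimators; a minimal usage sketch with a GBM, assuming a running cluster and a public H2O sample dataset:

import h2o
from h2o.estimators import H2OGradientBoostingEstimator

h2o.init()
prostate = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

gbm = H2OGradientBoostingEstimator(ntrees=50, max_depth=5, seed=42)
# x restricts the predictors; if omitted, every column except y and the special columns is used.
gbm.train(x=["AGE", "RACE", "PSA", "GLEASON"], y="CAPSULE",
          training_frame=prostate, verbose=True)
print(gbm.model_performance())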
Esempio n. 50
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        kwargs['training_frame'] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        if is_type(y, int): y = tframe.names[y]
        if y is not None: kwargs['response_column'] = y
        if not is_type(x, list, tuple): x = [x]
        if is_type(x[0], int):
            x = [tframe.names[i] for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
        kwargs = dict([(k, kwargs[k].frame_id if isinstance(kwargs[k], H2OFrame) else kwargs[k]) for k in kwargs if
                       kwargs[k] is not None])  # gruesome one-liner
        algo = self.model._compute_algo()  # unique to grid search
        if self.grid_id is not None: kwargs["grid_id"] = self.grid_id
        rest_ver = kwargs.pop("_rest_version") if "_rest_version" in kwargs else None

        grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))

        if self._future:
            self._job = grid
            return

        grid.poll()

        grid_json = h2o.api("GET /99/Grids/%s" % (grid.dest_key))
        failure_messages_stacks = ""
        error_index = 0
        if len(grid_json["failure_details"]) > 0:
            print("Errors/Warnings building gridsearch model\n")
            # will raise error if no grid model is returned; store error messages here

            for error_message in grid_json["failure_details"]:
                if isinstance(grid_json["failed_params"][error_index], dict):
                    for h_name in grid_json['hyper_names']:
                        print("Hyper-parameter: {0}, {1}".format(h_name,
                                                                 grid_json['failed_params'][error_index][h_name]))

                if len(grid_json["failure_stack_traces"]) > error_index:
                    print("failure_details: {0}\nfailure_stack_traces: "
                          "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))
                    failure_messages_stacks += error_message+'\n'
                error_index += 1

        self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]
        for model in self.models:
            model._estimator_type = self.model._estimator_type

        # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
        # sometimes no model is returned due to bad parameter values provided by the user.
        if len(grid_json['model_ids']) > 0:
            first_model_json = h2o.api("GET /%d/Models/%s" %
                                       (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
            self._resolve_grid(grid.dest_key, grid_json, first_model_json)
        else:
            if len(failure_messages_stacks)>0:
                raise ValueError(failure_messages_stacks)
            else:
                raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
Esempio n. 51
0
    def __init__(self,
                 nfolds=5,
                 balance_classes=False,
                 class_sampling_factors=None,
                 max_after_balance_size=5.0,
                 max_runtime_secs=3600,
                 max_runtime_secs_per_model=None,
                 max_models=None,
                 stopping_metric="AUTO",
                 stopping_tolerance=None,
                 stopping_rounds=3,
                 seed=None,
                 project_name=None,
                 exclude_algos=None,
                 include_algos=None,
                 modeling_plan=None,
                 keep_cross_validation_predictions=False,
                 keep_cross_validation_models=False,
                 keep_cross_validation_fold_assignment=False,
                 sort_metric="AUTO",
                 export_checkpoints_dir=None,
                 verbosity="warn"):
        """
        Create a new H2OAutoML instance.
        
        :param int nfolds: Number of folds for k-fold cross-validation. Defaults to ``5``. Use ``0`` to disable cross-validation; this will also 
          disable Stacked Ensemble (thus decreasing the overall model performance).
        :param bool balance_classes: Balance training data class counts via over/under-sampling (for imbalanced data).  Defaults to ``False``.
        :param class_sampling_factors: Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling
          factors will be automatically computed to obtain class balance during training. Requires ``balance_classes``.
        :param float max_after_balance_size: Maximum relative size of the training data after balancing class counts (can be less than 1.0).
          Requires ``balance_classes``. Defaults to ``5.0``.
        :param int max_runtime_secs: This argument controls how long the AutoML run will execute. Defaults to ``3600`` seconds (1 hour).
        :param int max_runtime_secs_per_model: This argument controls the max time the AutoML run will dedicate to each individual model. Defaults to `0` (disabled).
        :param int max_models: Specify the maximum number of models to build in an AutoML run. (Does not include the Stacked Ensemble models.)
        :param str stopping_metric: Specifies the metric to use for early stopping. Defaults to ``"AUTO"``.
          The available options are:
          ``"AUTO"`` (This defaults to ``"logloss"`` for classification, ``"deviance"`` for regression),
          ``"deviance"``, ``"logloss"``, ``"mse"``, ``"rmse"``, ``"mae"``, ``"rmsle"``, ``"auc"``, ``"lift_top_group"``,
          ``"misclassification"``, ``"mean_per_class_error"``, ``"r2"``.
        :param float stopping_tolerance: This option specifies the relative tolerance for the metric-based stopping
          to stop the AutoML run if the improvement is less than this value. This value defaults to ``0.001``
          if the dataset is at least 1 million rows; otherwise it defaults to a value determined by the size of the dataset
          and the non-NA-rate.  In that case, the value is computed as 1/sqrt(nrows * non-NA-rate).
        :param int stopping_rounds: This argument stops training new models in the AutoML run when the option selected for
          stopping_metric doesn't improve for the specified number of models, based on a simple moving average.
          To disable this feature, set it to ``0``. Defaults to ``3`` and must be a non-negative integer.
        :param int seed: Set a seed for reproducibility. AutoML can only guarantee reproducibility if ``max_models`` or
          early stopping is used because ``max_runtime_secs`` is resource limited, meaning that if the resources are
          not the same between runs, AutoML may be able to train more models on one run vs another.  Defaults to ``None``.
        :param str project_name: Character string to identify an AutoML project. Defaults to ``None``, which means
          a project name will be auto-generated based on the training frame ID.  More models can be trained on an
          existing AutoML project by specifying the same project name in multiple calls to the AutoML function
          (as long as the same training frame is used in subsequent runs).
        :param exclude_algos: List of character strings naming the algorithms to skip during the model-building phase. 
          An example use is ``exclude_algos = ["GLM", "DeepLearning", "DRF"]``, and the full list of options is: ``"DRF"`` 
          (Random Forest and Extremely-Randomized Trees), ``"GLM"``, ``"XGBoost"``, ``"GBM"``, ``"DeepLearning"`` and ``"StackedEnsemble"``. 
          Defaults to ``None``, which means that all appropriate H2O algorithms will be used, if the search stopping criteria allow. Optional.
        :param include_algos: List of character strings naming the algorithms to restrict to during the model-building phase.
          This can't be used in combination with `exclude_algos` param.
          Defaults to ``None``, which means that all appropriate H2O algorithms will be used, if the search stopping criteria allow. Optional.
        :param keep_cross_validation_predictions: Whether to keep the cross-validation predictions.
          This needs to be set to ``True`` if running the same AutoML object for repeated runs because CV predictions are required to build 
          additional Stacked Ensemble models in AutoML. This option defaults to ``False``.
        :param keep_cross_validation_models: Whether to keep the cross-validated models. Keeping cross-validation models may consume 
          significantly more memory in the H2O cluster. Defaults to ``False``.
        :param keep_cross_validation_fold_assignment: Whether to keep fold assignments in the models. Deleting them will save memory 
          in the H2O cluster. This option defaults to ``False``.
        :param sort_metric: Metric to sort the leaderboard by. Defaults to ``"AUTO"`` (This defaults to ``"auc"`` for binomial classification, 
          ``"mean_per_class_error"`` for multinomial classification, ``"deviance"`` for regression). For binomial classification choose between 
          ``"auc"``, ``"logloss"``, ``"mean_per_class_error"``, ``"rmse"``, ``"mse"``.  For regression choose between ``"deviance"``, ``"rmse"``, 
          ``"mse"``, ``"mae"``, ``"rmsle"``. For multinomial classification choose between ``"mean_per_class_error"``, ``"logloss"``, ``"rmse"``, ``"mse"``.
        :param export_checkpoints_dir: Path to a directory where every model will be stored in binary form.
        :param verbosity: Verbosity of the backend messages printed during training.
            Available options are ``None`` (live log disabled), ``"debug"``, ``"info"`` or ``"warn"``. Defaults to ``"warn"``.
        """
        # Check if H2O jar contains AutoML
        try:
            h2o.api("GET /3/Metadata/schemas/AutoMLV99")
        except h2o.exceptions.H2OResponseError as e:
            print(e)
            print("*******************************************************************\n" \
                  "*Please verify that your H2O jar has the proper AutoML extensions.*\n" \
                  "*******************************************************************\n" \
                  "\nVerbose Error Message:")

        
        # Make bare minimum build_control (if max_runtimes_secs is an invalid value, it will catch below)
        self.build_control = {
            'stopping_criteria': {
                'max_runtime_secs': max_runtime_secs,
            }
        }

        # Make bare minimum build_models
        self.build_models = {
            'exclude_algos': None
        }

        # nfolds must be a non-negative integer and not equal to 1:
        if nfolds != 5:
            assert_is_type(nfolds, int)
        assert nfolds >= 0, "nfolds set to " + str(nfolds) + "; nfolds cannot be negative. Use nfolds >= 2 if you want cross-validated metrics and Stacked Ensembles or use nfolds = 0 to disable."
        assert nfolds != 1, "nfolds set to " + str(nfolds) + "; nfolds = 1 is an invalid value. Use nfolds >= 2 if you want cross-validated metrics and Stacked Ensembles or use nfolds = 0 to disable."
        self.build_control["nfolds"] = nfolds
        self.nfolds = nfolds

        # Pass through to all algorithms
        if balance_classes is True:
            self.build_control["balance_classes"] = balance_classes
            self.balance_classes = balance_classes
        if class_sampling_factors is not None:
            self.build_control["class_sampling_factors"] = class_sampling_factors
            self.class_sampling_factors = class_sampling_factors
        if max_after_balance_size != 5.0:
            assert_is_type(max_after_balance_size, float)
            self.build_control["max_after_balance_size"] = max_after_balance_size
            self.max_after_balance_size = max_after_balance_size

        # If max_runtime_secs is not provided, then it is set to default (3600 secs)
        if max_runtime_secs != 3600:
            assert_is_type(max_runtime_secs, int)
        self.max_runtime_secs = max_runtime_secs

        assert_is_type(max_runtime_secs_per_model, None, int)
        self.max_runtime_secs_per_model = max_runtime_secs_per_model
        if self.max_runtime_secs_per_model is not None:
            self.build_control["stopping_criteria"]["max_runtime_secs_per_model"] = self.max_runtime_secs_per_model

        # Add other parameters to build_control if available
        if max_models is not None:
            assert_is_type(max_models, int)
            self.build_control["stopping_criteria"]["max_models"] = max_models
        self.max_models = max_models

        if stopping_metric != "AUTO":
            assert_is_type(stopping_metric, str)
        self.build_control["stopping_criteria"]["stopping_metric"] = stopping_metric
        self.stopping_metric = stopping_metric

        if stopping_tolerance is not None:
            assert_is_type(stopping_tolerance, float)
            self.build_control["stopping_criteria"]["stopping_tolerance"] = stopping_tolerance
        self.stopping_tolerance = stopping_tolerance

        if stopping_rounds != 3:
            assert_is_type(stopping_rounds, int)
        self.build_control["stopping_criteria"]["stopping_rounds"] = stopping_rounds
        self.stopping_rounds = stopping_rounds

        if seed is not None:
            assert_is_type(seed, int)
            self.build_control["stopping_criteria"]["seed"] = seed
            self.seed = seed

        # Set project name if provided. If None, then we set in .train() to "automl_" + training_frame.frame_id
        if project_name is not None:
            assert_is_type(project_name, str)
            check_id(project_name, "H2OAutoML")
            self.build_control["project_name"] = project_name
            self.project_name = project_name
        else:
            self.project_name = None

        if exclude_algos is not None:
            assert_is_type(exclude_algos, list)
            for elem in exclude_algos:
                assert_is_type(elem, str)
            self.build_models['exclude_algos'] = exclude_algos

        if include_algos is not None:
            assert exclude_algos is None, "Use either include_algos or exclude_algos, not both."
            assert_is_type(include_algos, list)
            for elem in include_algos:
                assert_is_type(elem, str)
            self.build_models['include_algos'] = include_algos

        if modeling_plan is not None:
            assert_is_type(modeling_plan, list)
            supported_aliases = ['all', 'defaults', 'grids']

            def assert_is_step_def(sd):
                assert 'name' in sd, "each definition must have a 'name' key"
                assert 0 < len(sd) < 3, "each definition must have only 1 or 2 keys: name, name+alias or name+steps"
                assert len(sd) == 1 or 'alias' in sd or 'steps' in sd, "steps definitions support only the following keys: name, alias, steps"
                assert 'alias' not in sd or sd['alias'] in supported_aliases, "alias must be one of %s" % supported_aliases
                assert 'steps' not in sd or (is_type(sd['steps'], list) and all(assert_is_step(s) for s in sd['steps']))

            def assert_is_step(s):
                assert is_type(s, dict), "each step must be a dict with an 'id' key and an optional 'weight' key"
                assert 'id' in s, "each step must have an 'id' key"
                assert len(s) == 1 or ('weight' in s and is_type(s['weight'], int)), "weight must be an integer"
                return True

            plan = []
            for step_def in modeling_plan:
                assert_is_type(step_def, dict, tuple, str)
                if is_type(step_def, dict):
                    assert_is_step_def(step_def)
                    plan.append(step_def)
                elif is_type(step_def, str):
                    plan.append(dict(name=step_def))
                else:
                    assert 0 < len(step_def) < 3
                    assert_is_type(step_def[0], str)
                    name = step_def[0]
                    if len(step_def) == 1:
                        plan.append(dict(name=name))
                    else:
                        assert_is_type(step_def[1], str, list)
                        ids = step_def[1]
                        if is_type(ids, str):
                            assert_is_type(ids, *supported_aliases)
                            plan.append(dict(name=name, alias=ids))
                        else:
                            plan.append(dict(name=name, steps=[dict(id=i) for i in ids]))
            self.build_models['modeling_plan'] = plan


        assert_is_type(keep_cross_validation_predictions, bool)
        self.build_control["keep_cross_validation_predictions"] = keep_cross_validation_predictions

        assert_is_type(keep_cross_validation_models, bool)
        self.build_control["keep_cross_validation_models"] = keep_cross_validation_models

        assert_is_type(keep_cross_validation_fold_assignment, bool)
        self.build_control["keep_cross_validation_fold_assignment"] = self.nfolds != 0 and keep_cross_validation_fold_assignment

        self._job = None
        self._leader_id = None
        self._leaderboard = None
        self._verbosity = verbosity
        self._event_log = None
        self._training_info = None
        self._state_json = None
        if sort_metric == "AUTO":
            self.sort_metric = None
        else:
            self.sort_metric = sort_metric

        if export_checkpoints_dir is not None:
            assert_is_type(export_checkpoints_dir, str)
            self.build_control["export_checkpoints_dir"] = export_checkpoints_dir
Esempio n. 52
0
    def partial_plot(self, data, cols, destination_key=None, nbins=20, plot=True, figsize=(7,10), server=False):
        """
        Create a partial dependence plot, which gives a graphical depiction of the marginal effect of a variable on the
        response. The effect of a variable is measured as the change in the mean response.

        :param H2OFrame data: An H2OFrame object used for scoring and constructing the plot.
        :param cols: Feature(s) for which partial dependence will be calculated.
        :param destination_key: A key reference to the created partial dependence tables in H2O.
        :param nbins: Number of bins used.
        :param plot: A boolean specifying whether to plot the partial dependence tables.
        :param figsize: Dimension/size of the returned plots; adjust to fit your output cells.
        :param server: Whether to run matplotlib in non-interactive ("server") mode, e.g. when no display is available.
        :return: A plot and a list of calculated mean response tables, one for each feature requested.
        """

        if not isinstance(data, h2o.H2OFrame): raise ValueError("data must be an instance of H2OFrame")
        assert_is_type(cols, [str])
        assert_is_type(destination_key, None, str)
        assert_is_type(nbins, int)
        assert_is_type(plot, bool)
        assert_is_type(figsize, (int,int))

        ## Check that the specified cols exist in the data frame
        for xi in cols:
            if xi not in data.names:
                raise H2OValueError("Column %s does not exist in the training frame" % xi)

        kwargs = {}
        kwargs['cols'] = cols
        kwargs['model_id'] = self.model_id
        kwargs['frame_id'] = data.frame_id
        kwargs['nbins'] = nbins
        kwargs['destination_key'] = destination_key

        json = H2OJob(h2o.api("POST /3/PartialDependence/", data=kwargs),  job_type="PartialDependencePlot").poll()
        json = h2o.api("GET /3/PartialDependence/%s" % json.dest_key)

        # Extract partial dependence data from json response
        # pps = json
        pps = json['partial_dependence_data']

        ## Plot partial dependence plots using matplotlib
        if plot:
            plt = _get_matplotlib_pyplot(server)
            if not plt: return

            fig, axs = plt.subplots(len(cols), squeeze=False, figsize=figsize)
            for i, pp in enumerate(pps):
                ## Check whether column was categorical or numeric
                col=cols[i]
                cat=data[col].isfactor()[0]
                if cat:
                    labels = pp[0]
                    x = range(len(labels))
                    y = pp[1]
                    axs[i,0].plot(x, y, 'o')
                    axs[i,0].set_xticks(x)
                    axs[i,0].set_xticklabels(labels)
                    axs[i,0].margins(0.2)
                else:
                    axs[i,0].plot(pp[0], pp[1])
                    axs[i,0].set_xlim(min(pp[0]), max(pp[0]))

                axs[i,0].set_title('Partial Dependence Plot For {}'.format(col))
                axs[i,0].set_xlabel(pp.col_header[0])
                axs[i,0].set_ylabel(pp.col_header[1])
                axs[i,0].xaxis.grid()
                axs[i,0].yaxis.grid()
            if len(cols) > 1:
                fig.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

        return pps
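
A short usage sketch of the method above; `model` is assumed to be a trained H2O model, `data` the H2OFrame it was trained on, and the column names are illustrative placeholders:

# Sketch only: `model`, `data`, and the column names are placeholders, not from the source.
pdp_tables = model.partial_plot(data=data, cols=["AGE", "PSA"], nbins=20, plot=True)
# `pdp_tables` is a list of mean-response tables, one per requested column.
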
Esempio n. 53
0
    def show_status(self, detailed=False):
        """
        Print current cluster status information.

        :param detailed: if True, then also print detailed information about each node.
        """
        if self._retrieved_at + self.REFRESH_INTERVAL < time.time():
            # Info is stale, need to refresh
            new_info = h2o.api("GET /3/Cloud")
            self._fill_from_h2ocluster(new_info)
        ncpus = sum(node["num_cpus"] for node in self.nodes)
        allowed_cpus = sum(node["cpus_allowed"] for node in self.nodes)
        free_mem = sum(node["free_mem"] for node in self.nodes)
        unhealthy_nodes = sum(not node["healthy"] for node in self.nodes)
        status = "locked" if self.locked else "accepting new members"
        if unhealthy_nodes == 0:
            status += ", healthy"
        else:
            status += ", %d nodes are not healthy" % unhealthy_nodes
        H2ODisplay([
            [
                "H2O cluster uptime:",
                get_human_readable_time(self.cloud_uptime_millis)
            ],
            ["H2O cluster version:", self.version],
            [
                "H2O cluster version age:",
                "{} {}".format(self.build_age,
                               ("!!!" if self.build_too_old else ""))
            ],
            ["H2O cluster name:", self.cloud_name],
            ["H2O cluster total nodes:", self.cloud_size],
            ["H2O cluster free memory:",
             get_human_readable_bytes(free_mem)],
            ["H2O cluster total cores:",
             str(ncpus)],
            ["H2O cluster allowed cores:",
             str(allowed_cpus)],
            ["H2O cluster status:", status],
            ["H2O connection url:",
             h2o.connection().base_url],
            ["H2O connection proxy:",
             h2o.connection().proxy],
            ["H2O internal security:", self.internal_security_enabled],
            ["Python version:",
             "%d.%d.%d %s" % tuple(sys.version_info[:4])],
        ])

        if detailed:
            keys = [
                "h2o", "healthy", "last_ping", "num_cpus", "sys_load",
                "mem_value_size", "free_mem", "pojo_mem", "swap_mem",
                "free_disk", "max_disk", "pid", "num_keys", "tcps_active",
                "open_fds", "rpcs_active"
            ]
            header = ["Nodes info:"] + [
                "Node %d" % (i + 1) for i in range(len(self.nodes))
            ]
            table = [[k] for k in keys]
            for node in self.nodes:
                for i, k in enumerate(keys):
                    table[i].append(node[k])
            H2ODisplay(table=table, header=header)
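
For reference, this method is typically reached through the cluster handle of an active connection; a minimal sketch:

import h2o

h2o.init()                                 # connect to (or start) a local cluster
h2o.cluster().show_status()                # summary table only
h2o.cluster().show_status(detailed=True)   # adds the per-node breakdown
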
Esempio n. 54
0
 def cancel(self):
     h2o.api("POST /3/Jobs/%s/cancel" % self.job_key)
     self.status = "CANCELLED"
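
The same cancellation can be issued directly against the Jobs endpoint; the job key below is a placeholder, not a real key:

import h2o

job_key = "example_job_key"  # placeholder; use the key of a running job
h2o.api("POST /3/Jobs/%s/cancel" % job_key)
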
Esempio n. 55
0
    def __init__(self,
                 nfolds=5,
                 balance_classes=False,
                 class_sampling_factors=None,
                 max_after_balance_size=5.0,
                 max_runtime_secs=None,
                 max_runtime_secs_per_model=None,
                 max_models=None,
                 stopping_metric="AUTO",
                 stopping_tolerance=None,
                 stopping_rounds=3,
                 seed=None,
                 project_name=None,
                 exclude_algos=None,
                 include_algos=None,
                 exploitation_ratio=0,
                 modeling_plan=None,
                 monotone_constraints=None,
                 algo_parameters=None,
                 keep_cross_validation_predictions=False,
                 keep_cross_validation_models=False,
                 keep_cross_validation_fold_assignment=False,
                 sort_metric="AUTO",
                 export_checkpoints_dir=None,
                 verbosity="warn"):
        """
        Create a new H2OAutoML instance.
        
        :param int nfolds: Number of folds for k-fold cross-validation. Defaults to ``5``. Use ``0`` to disable cross-validation; this will also 
          disable Stacked Ensemble (thus decreasing the overall model performance).
        :param bool balance_classes: Balance training data class counts via over/under-sampling (for imbalanced data).  Defaults to ``False``.
        :param class_sampling_factors: Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling
          factors will be automatically computed to obtain class balance during training. Requires ``balance_classes``.
        :param float max_after_balance_size: Maximum relative size of the training data after balancing class counts (can be less than 1.0).
          Requires ``balance_classes``. Defaults to ``5.0``.
        :param int max_runtime_secs: This argument specifies the maximum time that the AutoML process will run for, prior to training the final Stacked Ensemble models. If neither ``max_runtime_secs`` nor ``max_models`` are specified by the user, then ``max_runtime_secs`` defaults to 3600 seconds (1 hour).
        :param int max_runtime_secs_per_model: This argument controls the max time the AutoML run will dedicate to each individual model. Defaults to `0` (disabled).
        :param int max_models: Specify the maximum number of models to build in an AutoML run. Not limited by default. (Does not include the Stacked Ensemble models.)
        :param str stopping_metric: Specifies the metric to use for early stopping. Defaults to ``"AUTO"``.
          The available options are:
          ``"AUTO"`` (This defaults to ``"logloss"`` for classification, ``"deviance"`` for regression),
          ``"deviance"``, ``"logloss"``, ``"mse"``, ``"rmse"``, ``"mae"``, ``"rmsle"``, ``"auc"``, ``aucpr``, ``"lift_top_group"``,
          ``"misclassification"``, ``"mean_per_class_error"``, ``"r2"``.
        :param float stopping_tolerance: This option specifies the relative tolerance for the metric-based stopping
          to stop the AutoML run if the improvement is less than this value. This value defaults to ``0.001``
          if the dataset is at least 1 million rows; otherwise it defaults to a value determined by the size of the dataset
          and the non-NA-rate.  In that case, the value is computed as 1/sqrt(nrows * non-NA-rate).
        :param int stopping_rounds: This argument stops training new models in the AutoML run when the option selected for
          stopping_metric doesn't improve for the specified number of models, based on a simple moving average.
          To disable this feature, set it to ``0``. Defaults to ``3`` and must be a non-negative integer.
        :param int seed: Set a seed for reproducibility. AutoML can only guarantee reproducibility if ``max_models`` or
          early stopping is used because ``max_runtime_secs`` is resource limited, meaning that if the resources are
          not the same between runs, AutoML may be able to train more models on one run vs another.  Defaults to ``None``.
        :param str project_name: Character string to identify an AutoML project. Defaults to ``None``, which means
          a project name will be auto-generated based on the training frame ID.  More models can be trained on an
          existing AutoML project by specifying the same project name in multiple calls to the AutoML function
          (as long as the same training frame is used in subsequent runs).
        :param exclude_algos: List of character strings naming the algorithms to skip during the model-building phase. 
          An example use is ``exclude_algos = ["GLM", "DeepLearning", "DRF"]``, and the full list of options is: ``"DRF"`` 
          (Random Forest and Extremely-Randomized Trees), ``"GLM"``, ``"XGBoost"``, ``"GBM"``, ``"DeepLearning"`` and ``"StackedEnsemble"``. 
          Defaults to ``None``, which means that all appropriate H2O algorithms will be used, if the search stopping criteria allow. Optional.
        :param include_algos: List of character strings naming the algorithms to restrict to during the model-building phase.
          This can't be used in combination with the `exclude_algos` parameter.
          Defaults to ``None``, which means that all appropriate H2O algorithms will be used, if the search stopping criteria allow. Optional.
        :param exploitation_ratio: The budget ratio (between 0 and 1) dedicated to the exploitation (vs exploration) phase. By default, the exploitation phase is disabled (exploitation_ratio=0) as this is still experimental; to activate it, it is recommended to try a ratio around 0.1. Note that the current exploitation phase only tries to fine-tune the best XGBoost and the best GBM found during exploration.
        :param modeling_plan: List of modeling steps to be used by the AutoML engine (they may not all get executed, depending on other constraints).
          Defaults to None (Expert usage only).
        :param monotone_constraints: Dict representing monotonic constraints.
          Use +1 to enforce an increasing constraint and -1 to specify a decreasing constraint.
        :param algo_parameters: Dict of ``param_name=param_value`` to be passed to internal models. Defaults to ``None`` (Expert usage only).
          By default, params are set only for algorithms that accept them, and ignored by the others.
          Only the following parameters are currently allowed: ``"monotone_constraints"``.
        :param keep_cross_validation_predictions: Whether to keep the cross-validation predictions.
          This needs to be set to ``True`` if running the same AutoML object for repeated runs because CV predictions are required to build 
          additional Stacked Ensemble models in AutoML. This option defaults to ``False``.
        :param keep_cross_validation_models: Whether to keep the cross-validated models. Keeping cross-validation models may consume 
          significantly more memory in the H2O cluster. Defaults to ``False``.
        :param keep_cross_validation_fold_assignment: Whether to keep fold assignments in the models. Deleting them will save memory 
          in the H2O cluster. This option defaults to ``False``.
        :param sort_metric: Metric to sort the leaderboard by. Defaults to ``"AUTO"`` (This defaults to ``auc`` for binomial classification, 
          ``mean_per_class_error`` for multinomial classification, ``deviance`` for regression). For binomial classification choose between 
          ``auc``, ``aucpr``, ``"logloss"``, ``"mean_per_class_error"``, ``"rmse"``, ``"mse"``.  For regression choose between ``"deviance"``, ``"rmse"``, 
          ``"mse"``, ``"mae"``, ``"rmlse"``. For multinomial classification choose between ``"mean_per_class_error"``, ``"logloss"``, ``"rmse"``, ``"mse"``.
        :param export_checkpoints_dir: Path to a directory where every model will be stored in binary form.
        :param verbosity: Verbosity of the backend messages printed during training.
            Available options are None (live log disabled), 'debug', 'info' or 'warn'. Defaults to 'warn'.
        """
        # Check if H2O jar contains AutoML
        try:
            h2o.api("GET /3/Metadata/schemas/AutoMLV99")
        except h2o.exceptions.H2OResponseError as e:
            print(e)
            print("*******************************************************************\n" \
                  "*Please verify that your H2O jar has the proper AutoML extensions.*\n" \
                  "*******************************************************************\n" \
                  "\nVerbose Error Message:")

        self._job = None
        self._leader_id = None
        self._leaderboard = None
        self._verbosity = verbosity
        self._event_log = None
        self._training_info = None
        self._state_json = None
        self._build_resp = None  # contains all the actual parameters used on backend

        # Make bare minimum params containers
        self.build_control = dict(stopping_criteria=dict())
        self.build_models = dict()
        self.input_spec = dict()

        # build_control params #

        assert_is_type(project_name, None, str)
        check_id(project_name, "H2OAutoML")
        self._project_name = self.build_control["project_name"] = project_name

        assert_is_type(nfolds, int)
        assert nfolds >= 0, "nfolds set to " + str(nfolds) + "; nfolds cannot be negative. Use nfolds >= 2 if you want cross-validated metrics and Stacked Ensembles or use nfolds = 0 to disable."
        assert nfolds != 1, "nfolds set to " + str(nfolds) + "; nfolds = 1 is an invalid value. Use nfolds >= 2 if you want cross-validated metrics and Stacked Ensembles or use nfolds = 0 to disable."
        self.nfolds = self.build_control["nfolds"] = nfolds

        assert_is_type(balance_classes, bool)
        self.balance_classes = self.build_control[
            "balance_classes"] = balance_classes

        assert_is_type(class_sampling_factors, None, [numeric])
        self.class_sampling_factors = self.build_control[
            "class_sampling_factors"] = class_sampling_factors

        assert_is_type(max_after_balance_size, None, numeric)
        self.max_after_balance_size = self.build_control[
            "max_after_balance_size"] = max_after_balance_size

        assert_is_type(keep_cross_validation_models, bool)
        self.keep_cross_validation_models = self.build_control[
            "keep_cross_validation_models"] = keep_cross_validation_models

        assert_is_type(keep_cross_validation_fold_assignment, bool)
        self.keep_cross_validation_fold_assignment = self.build_control[
            "keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment

        assert_is_type(keep_cross_validation_predictions, bool)
        self.keep_cross_validation_predictions = self.build_control[
            "keep_cross_validation_predictions"] = keep_cross_validation_predictions

        assert_is_type(export_checkpoints_dir, None, str)
        self.export_checkpoints_dir = self.build_control[
            "export_checkpoints_dir"] = export_checkpoints_dir

        # stopping criteria params #

        assert_is_type(max_runtime_secs, None, int)
        self.max_runtime_secs = self.build_control['stopping_criteria'][
            'max_runtime_secs'] = max_runtime_secs

        assert_is_type(max_runtime_secs_per_model, None, int)
        self.max_runtime_secs_per_model = self.build_control[
            "stopping_criteria"][
                "max_runtime_secs_per_model"] = max_runtime_secs_per_model

        assert_is_type(max_models, None, int)
        self.max_models = self.build_control["stopping_criteria"][
            "max_models"] = max_models

        assert_is_type(stopping_metric, None, str)
        self.stopping_metric = self.build_control["stopping_criteria"][
            "stopping_metric"] = stopping_metric

        assert_is_type(stopping_tolerance, None, numeric)
        self.stopping_tolerance = self.build_control["stopping_criteria"][
            "stopping_tolerance"] = stopping_tolerance

        assert_is_type(stopping_rounds, None, int)
        self.stopping_rounds = self.build_control["stopping_criteria"][
            "stopping_rounds"] = stopping_rounds

        assert_is_type(seed, None, int)
        self.seed = self.build_control["stopping_criteria"]["seed"] = seed

        # build models params #

        assert_is_type(exclude_algos, None, [str])
        self.exclude_algos = self.build_models['exclude_algos'] = exclude_algos

        assert_is_type(include_algos, None, [str])
        if include_algos is not None:
            assert exclude_algos is None, "Use either include_algos or exclude_algos, not both."
        self.include_algos = self.build_models['include_algos'] = include_algos

        assert_is_type(exploitation_ratio, None, numeric)
        self.exploitation_ratio = self.build_models[
            'exploitation_ratio'] = exploitation_ratio

        assert_is_type(modeling_plan, None, list)
        if modeling_plan is not None:
            supported_aliases = ['all', 'defaults', 'grids']

            def assert_is_step_def(sd):
                assert 'name' in sd, "each definition must have a 'name' key"
                assert 0 < len(
                    sd
                ) < 3, "each definition must have only 1 or 2 keys: name, name+alias or name+steps"
                assert len(
                    sd
                ) == 1 or 'alias' in sd or 'steps' in sd, "steps definitions support only the following keys: name, alias, steps"
                assert 'alias' not in sd or sd[
                    'alias'] in supported_aliases, "alias must be one of %s" % supported_aliases
                assert 'steps' not in sd or (is_type(sd['steps'], list)
                                             and all(
                                                 assert_is_step(s)
                                                 for s in sd['steps']))

            def assert_is_step(s):
                assert is_type(
                    s, dict
                ), "each step must be a dict with an 'id' key and an optional 'weight' key"
                assert 'id' in s, "each step must have an 'id' key"
                assert len(s) == 1 or ('weight' in s and is_type(
                    s['weight'], int)), "weight must be an integer"
                return True

            plan = []
            for step_def in modeling_plan:
                assert_is_type(step_def, dict, tuple, str)
                if is_type(step_def, dict):
                    assert_is_step_def(step_def)
                    plan.append(step_def)
                elif is_type(step_def, str):
                    plan.append(dict(name=step_def))
                else:
                    assert 0 < len(step_def) < 3
                    assert_is_type(step_def[0], str)
                    name = step_def[0]
                    if len(step_def) == 1:
                        plan.append(dict(name=name))
                    else:
                        assert_is_type(step_def[1], str, list)
                        ids = step_def[1]
                        if is_type(ids, str):
                            assert_is_type(ids, *supported_aliases)
                            plan.append(dict(name=name, alias=ids))
                        else:
                            plan.append(
                                dict(name=name,
                                     steps=[dict(id=i) for i in ids]))
            self.modeling_plan = self.build_models['modeling_plan'] = plan
        else:
            self.modeling_plan = None

        assert_is_type(algo_parameters, None, dict)
        if monotone_constraints is not None:
            if algo_parameters is None:
                algo_parameters = {}
            self.monotone_constraints = algo_parameters[
                'monotone_constraints'] = monotone_constraints
        else:
            self.monotone_constraints = None

        assert_is_type(algo_parameters, None, dict)
        if algo_parameters is not None:
            algo_parameters_json = []
            for k, v in algo_parameters.items():
                scope, __, name = k.partition('__')
                if len(name) == 0:
                    name, scope = scope, 'any'
                value = [
                    dict(key=k, value=v) for k, v in v.items()
                ] if isinstance(
                    v, dict
                ) else v  # we can't use stringify_dict here as this will be converted into a JSON string
                algo_parameters_json.append(
                    dict(scope=scope, name=name, value=value))

            self.algo_parameters = self.build_models[
                'algo_parameters'] = algo_parameters_json
        else:
            self.algo_parameters = None

        # input spec params #

        assert_is_type(sort_metric, None, str)
        self.sort_metric = self.input_spec['sort_metric'] = sort_metric
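
A hedged sketch of the richer signature above, combining `include_algos` with monotone constraints (the column names are illustrative only, not from the source):

from h2o.automl import H2OAutoML

# "AGE" increasing, "DEBT" decreasing -- placeholder column names for illustration.
aml = H2OAutoML(max_runtime_secs=600,
                include_algos=["GBM", "XGBoost", "StackedEnsemble"],
                monotone_constraints={"AGE": 1, "DEBT": -1},
                seed=42)
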
Esempio n. 56
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
              model_id=None):
        """
        Train the H2O model.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
        :param ignored_columns: Names or indices of the columns in training_frame to exclude from training
            (may only be used when x is not specified).
        :param model_id: An optional identifier for the resulting model; if not specified, one is generated automatically.
        """
        assert_is_type(training_frame, H2OFrame)
        assert_is_type(validation_frame, None, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        algo = self.algo
        parms = self._parms.copy()
        if "__class__" in parms:  # FIXME: hackt for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not(is_auto_encoder or algo in {"pca", "svd", "kmeans", "glrm", "word2vec"})
        ncols = training_frame.ncols
        names = training_frame.names
        if is_supervised:
            if y is None: y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
        elif y is not None:
            raise H2OValueError("y should not be provided for an unsupervised model")
        assert_is_type(y, str, None)
        ignored_columns_set = set()
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s not in the training frame" % ic)
                    ignored_columns_set.add(ic)
        if x is None:
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)

        parms["offset_column"] = offset_column
        parms["fold_column"] = fold_column
        parms["weights_column"] = weights_column
        parms["max_runtime_secs"] = max_runtime_secs
        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        # Step 2
        is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
        is_unsupervised = is_auto_encoder or self.algo in {"pca", "svd", "kmeans", "glrm", "word2vec"}
        if is_auto_encoder and y is not None: raise ValueError("y should not be specified for autoencoder.")
        if not is_unsupervised and y is None: raise ValueError("Missing response")

        # Step 3
        parms["training_frame"] = training_frame
        if validation_frame is not None: parms["validation_frame"] = validation_frame
        if is_type(y, int): y = training_frame.names[y]
        if y is not None: parms["response_column"] = y
        if not isinstance(x, (list, tuple)): x = [x]
        if is_type(x[0], int):
            x = [training_frame.names[i] for i in x]
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]
        ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else
                                 [quoted(col) for col in parms["interactions"]])
        parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
        rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

        model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms),
                       job_type=(self.algo + " Model Build"))

        if self._future:
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll()
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
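
A minimal end-to-end sketch of this `train` method with a concrete estimator; the dataset URL and column names refer to H2O's public sample data but should be treated as assumptions here:

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()
# Public sample dataset; URL and column names are assumptions used for illustration.
prostate = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate.csv")
prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
gbm = H2OGradientBoostingEstimator(ntrees=50, seed=1234)
gbm.train(x=["AGE", "PSA", "GLEASON"], y="CAPSULE", training_frame=prostate)
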
Esempio n. 57
0
    def train_segments(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
                       weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
                       segments=None, segment_models_id=None, parallelism=1, verbose=False):
        """
        Trains H2O model for each segment (subpopulation) of the training dataset.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param float max_runtime_secs: Maximum allowed runtime in seconds for each model training. Use 0 to disable.
            Please note that regardless of how this parameter is set, a model will be built for each input segment.
            This parameter only affects individual model training.
        :param segments: A list of columns to segment-by. H2O will group the training (and validation) dataset
            by the segment-by columns and train a separate model for each segment (group of rows).
            As an alternative to providing a list of columns, users can also supply an explicit enumeration of
            segments to build the models for. This enumeration needs to be represented as H2OFrame.
        :param segment_models_id: Identifier for the returned collection of Segment Models. If not specified,
            it will be automatically generated.
        :param parallelism: Level of parallelism of the bulk segment-model building; this is the maximum number
            of models each H2O node will build in parallel.
        :param bool verbose: Enable to print additional information during model building. Defaults to False.

        :examples:

        >>> response = "survived"
        >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
        >>> titanic[response] = titanic[response].asfactor()
        >>> predictors = ["name","sex","age","sibsp","parch","ticket","fare","cabin"]
        >>> train, valid = titanic.split_frame(ratios=[.8], seed=1234)
        >>> from h2o.estimators.gbm import H2OGradientBoostingEstimator
        >>> titanic_gbm = H2OGradientBoostingEstimator(seed=1234)
        >>> titanic_models = titanic_gbm.train_segments(segments=["pclass"],
        ...                                             x=predictors,
        ...                                             y=response,
        ...                                             training_frame=train,
        ...                                             validation_frame=valid)
        >>> titanic_models.as_frame()
        """
        assert_is_type(segments, None, H2OFrame, [str])
        assert_is_type(verbose, bool)
        assert_is_type(segment_models_id, None, str)
        assert_is_type(parallelism, int)

        if segments is None:
            raise H2OValueError("Parameter segments was not specified. Please provide either a list of columns to "
                                "segment-by or an explicit list of segments to build models for.")

        parms = self._make_parms(x=x, y=y, training_frame=training_frame, offset_column=offset_column,
                                 fold_column=fold_column, weights_column=weights_column,
                                 validation_frame=validation_frame, max_runtime_secs=max_runtime_secs,
                                 ignored_columns=ignored_columns, model_id=None, verbose=verbose)

        if isinstance(segments, H2OFrame):
            parms["segments"] = H2OEstimator._keyify(segments)
        else:
            parms["segment_columns"] = segments
        if segment_models_id:
            parms["segment_models_id"] = segment_models_id
        parms["parallelism"] = parallelism

        rest_ver = self._get_rest_version(parms)
        train_segments_response = h2o.api("POST /%d/SegmentModelsBuilders/%s" % (rest_ver, self.algo), data=parms)
        job = H2OJob(train_segments_response, job_type=(self.algo + " Segment Models Build"))
        job.poll()
        return H2OSegmentModels(job.dest_key)
Esempio n. 58
0
    def __init__(self,
                 nfolds=5,
                 max_runtime_secs=3600,
                 max_models=None,
                 stopping_metric="AUTO",
                 stopping_tolerance=None,
                 stopping_rounds=3,
                 seed=None,
                 project_name=None):

        # Check if H2O jar contains AutoML
        try:
            h2o.api("GET /3/Metadata/schemas/AutoMLV99")
        except h2o.exceptions.H2OResponseError as e:
            print(e)
            print("*******************************************************************\n" \
                  "*Please verify that your H2O jar has the proper AutoML extensions.*\n" \
                  "*******************************************************************\n" \
                  "\nVerbose Error Message:")

        
        # Make bare minimum build_control (if max_runtime_secs is an invalid value, it will be caught below)
        self.build_control = {
            'stopping_criteria': {
                'max_runtime_secs': max_runtime_secs,
            }
        }

        # nfolds must be a non-negative integer and not equal to 1:
        if nfolds != 5:
            assert_is_type(nfolds, int)
        assert nfolds >= 0, "nfolds set to " + str(nfolds) + "; nfolds cannot be negative. Use nfolds >= 2 if you want cross-validated metrics and Stacked Ensembles or use nfolds = 0 to disable."
        assert nfolds != 1, "nfolds set to " + str(nfolds) + "; nfolds = 1 is an invalid value. Use nfolds >= 2 if you want cross-validated metrics and Stacked Ensembles or use nfolds = 0 to disable."
        self.build_control["nfolds"] = nfolds
        self.nfolds = nfolds

        # If max_runtime_secs is not provided, then it is set to default (3600 secs)
        if max_runtime_secs != 3600:
            assert_is_type(max_runtime_secs, int)
        self.max_runtime_secs = max_runtime_secs

        # Add other parameters to build_control if available
        if max_models is not None:
            assert_is_type(max_models,int)
            self.build_control["stopping_criteria"]["max_models"] = max_models
        self.max_models = max_models

        if stopping_metric != "AUTO":
            assert_is_type(stopping_metric, str)
        self.build_control["stopping_criteria"]["stopping_metric"] = stopping_metric
        self.stopping_metric = stopping_metric

        if stopping_tolerance is not None:
            assert_is_type(stopping_tolerance, float)
            self.build_control["stopping_criteria"]["stopping_tolerance"] = stopping_tolerance
        self.stopping_tolerance = stopping_tolerance

        if stopping_rounds != 3:
            assert_is_type(stopping_rounds, int)
        self.build_control["stopping_criteria"]["stopping_rounds"] = stopping_rounds
        self.stopping_rounds = stopping_rounds

        if seed is not None:
            assert_is_type(seed,int)
            self.build_control["stopping_criteria"]["seed"] = seed
            self.seed = seed

        # Set project name if provided. If None, then we set in .train() to "automl_" + training_frame.frame_id
        if project_name is not None:
            assert_is_type(project_name,str)
            self.build_control["project_name"] = project_name
            self.project_name = project_name
        else:
            self.project_name = None
    

        self._job = None
        self._automl_key = None
        self._leader_id = None
        self._leaderboard = None
Esempio n. 59
0
    def train(self,
              x=None,
              y=None,
              training_frame=None,
              fold_column=None,
              weights_column=None,
              validation_frame=None,
              leaderboard_frame=None,
              blending_frame=None):
        """
        Begins an AutoML task, a background task that automatically builds a number of models
        with various algorithms and tracks their performance in a leaderboard. At any point 
        in the process you may use H2O's performance or prediction functions on the resulting 
        models.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param fold_column: The name or index of the column in training_frame that holds per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds per-row weights.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold_column or weights_column).
        :param validation_frame: H2OFrame with validation data. This argument is ignored unless the user sets 
            nfolds = 0. If cross-validation is turned off, then a validation frame can be specified and used 
            for early stopping of individual models and early stopping of the grid searches.  By default and 
            when nfolds > 1, cross-validation metrics will be used for early stopping and thus validation_frame will be ignored.
        :param leaderboard_frame: H2OFrame with test data for scoring the leaderboard.  This is optional and
            if this is set to None (the default), then cross-validation metrics will be used to generate the leaderboard 
            rankings instead.
        :param blending_frame: H2OFrame used to train the metalearning algorithm in Stacked Ensembles (instead of relying on cross-validated predicted values).
            This is optional, but when provided, it is also recommended to disable cross-validation
            by setting `nfolds=0` and to provide a leaderboard frame for scoring purposes.

        :returns: An H2OAutoML object.

        :examples:
        
        >>> # Set up an H2OAutoML object
        >>> aml = H2OAutoML(max_runtime_secs=30)
        >>> # Launch an AutoML run
        >>> aml.train(y=y, training_frame=train)
        """
        # Minimal required arguments are training_frame and y (response)
        self.training_frame = training_frame

        ncols = self.training_frame.ncols
        names = self.training_frame.names

        if y is None and self.response_column is None:
            raise H2OValueError(
                'The response column (y) is not set; please set it to the name of the column that you are trying to predict in your data.'
            )
        elif y is not None:
            assert_is_type(y, int, str)
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError(
                        "Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError(
                        "Column %s does not exist in the training frame" % y)
            self.response_column = y

        self.fold_column = fold_column
        self.weights_column = weights_column

        self.validation_frame = validation_frame
        self.leaderboard_frame = leaderboard_frame
        self.blending_frame = blending_frame

        if x is not None:
            assert_is_type(x, list)
            xset = set()
            if is_type(x, int, str): x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError(
                            "Column %d does not exist in the training frame" %
                            xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError(
                            "Column %s not in the training frame" % xi)
                    xset.add(xi)
            ignored_columns = set(names) - xset
            for col in [y, fold_column, weights_column]:
                if col is not None and col in ignored_columns:
                    ignored_columns.remove(col)
            if ignored_columns is not None:
                self.input_spec['ignored_columns'] = list(ignored_columns)

        def clean_params(params):
            return ({
                k: clean_params(v)
                for k, v in params.items() if v is not None
            } if isinstance(params, dict) else H2OEstimator._keyify(params))

        automl_build_params = clean_params(
            dict(
                build_control=self.build_control,
                build_models=self.build_models,
                input_spec=self.input_spec,
            ))

        resp = self._build_resp = h2o.api('POST /99/AutoMLBuilder',
                                          json=automl_build_params)
        if 'job' not in resp:
            raise H2OResponseError(
                "Backend failed to build the AutoML job: {}".format(resp))

        if not self.project_name:
            self.project_name = resp['build_control']['project_name']
        self.__frozen = True

        self._job = H2OJob(resp['job'], "AutoML")
        poll_updates = ft.partial(self._poll_training_updates,
                                  verbosity=self._verbosity,
                                  state={})
        try:
            self._job.poll(poll_updates=poll_updates)
        finally:
            poll_updates(self._job, 1)

        self._fetch()
        return self.leader
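
After `train` returns, results are usually inspected through the leader model and the leaderboard; a short sketch continuing the docstring example above:

best = aml.leader        # best model according to the sort metric
lb = aml.leaderboard     # H2OFrame ranking all models built in the run
lb.head(rows=10)
# preds = best.predict(test)  # `test` is a placeholder H2OFrame
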
Esempio n. 60
0
 def _get_params(self):
     res = h2o.api("GET /99/AutoML/" + self.project_name)
     return res
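
The same information can be fetched with the generic REST helper outside the class; the project name below is a placeholder:

import h2o

state = h2o.api("GET /99/AutoML/" + "automl_example_project")  # placeholder project name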