Beispiel #1
0
 def _keyify_if_h2oframe(item):
     """
     Swap H2OFrame objects for their frame ids.

     A single H2OFrame yields its ``frame_id``. A list whose members are all
     either ``None`` or H2OFrame yields a new list with every member passed
     through ``quoted`` (frames contribute their ``frame_id``). Any other
     value is handed back unchanged.
     """
     if isinstance(item, H2OFrame):
         return item.frame_id
     only_frames_or_none = isinstance(item, list) and all(
         elem is None or isinstance(elem, H2OFrame) for elem in item)
     if not only_frames_or_none:
         return item
     keys = []
     for elem in item:
         keys.append(quoted(elem) if elem is None else quoted(elem.frame_id))
     return keys
Beispiel #2
0
 def _keyify_if_h2oframe(item):
     """
     Convert frame objects into the keys used to reference them.

     * H2OFrame -> its ``frame_id``
     * list containing only H2OFrame / ``None`` members -> list of ``quoted`` values
     * everything else -> returned as is
     """
     if isinstance(item, H2OFrame):
         return item.frame_id
     if isinstance(item, list):
         members_ok = all(member is None or isinstance(member, H2OFrame) for member in item)
         if members_ok:
             return [quoted(member if member is None else member.frame_id) for member in item]
     return item
Beispiel #3
0
def parse_raw(setup, id=None, first_line_is_header=(-1, 0, 1)):
    """Build an H2OFrame from a parse-setup dictionary.

    Intended to be used together with lazy_import and parse_setup so that the
    setup can be altered before the actual parse happens.

    Parameters
    ----------
      setup : dict
        Result of h2o.parse_setup. It is updated in place when ``id`` or
        ``first_line_is_header`` is supplied.

      id : str, optional
        An id for the frame. It is quoted, and any "%" or "&" is replaced by ".".

      first_line_is_header : int, optional
        -1, 0 or 1; stored in the setup as ``check_header``. The default tuple
        means "not specified".

    Returns
    -------
      H2OFrame

    Raises
    ------
      ValueError
        If ``first_line_is_header`` is given but is not -1, 0 or 1.
    """
    if id:
        frame_key = quoted(id)
        for forbidden in ("%", "&"):
            frame_key = frame_key.replace(forbidden, ".")
        setup["destination_frame"] = frame_key

    header_flags = (-1, 0, 1)
    header_given = first_line_is_header != header_flags
    if header_given and first_line_is_header not in header_flags:
        raise ValueError("first_line_is_header should be -1, 0, or 1")
    if header_given:
        setup["check_header"] = first_line_is_header

    frame = H2OFrame()
    frame._parse_raw(setup)
    return frame
Beispiel #4
0
 def _keyify(item):
     """
     Replace Keyed objects by their keys.

     A Keyed instance yields its ``key``. A list holding at least one Keyed
     member yields a new list in which every member is keyified recursively
     and then quoted. Anything else is returned unchanged.
     """
     if isinstance(item, Keyed):
         return item.key
     if isinstance(item, list):
         has_keyed_member = any(isinstance(member, Keyed) for member in item)
         if has_keyed_member:
             return [quoted(H2OEstimator._keyify(member)) for member in item]
     return item
Beispiel #5
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """
        Submit a grid-search build to the backend and collect the models it produced.

        :param x: Predictor column(s): a single name/index, or a list/tuple of names and/or indices.
        :param y: Response column name or index; ``None`` if there is no response.
        :param tframe: Training frame; its ``names`` are used to resolve indices and to derive ignored columns.
        :param vframe: Validation frame, or ``None``.
        :param kwargs: Builder parameters. Must contain the keys ``offset_column``, ``fold_column`` and
            ``weights_column``. The dict is updated in place before a filtered copy is posted.
        :returns: ``None``. If ``self._future`` is set, the job is stored in ``self._job`` and the method
            returns without polling; otherwise ``self.models`` is populated.
        :raises ValueError: if the finished grid contains no model.
        """
        kwargs['training_frame'] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        if is_type(y, int): y = tframe.names[y]
        if y is not None: kwargs['response_column'] = y
        # Normalise x to a real list: a tuple cannot be concatenated with the list built below.
        x = list(x) if is_type(x, list, tuple) else [x]
        # Resolve every index individually so empty and mixed name/index inputs are handled.
        x = [tframe.names[i] if is_type(i, int) else i for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
        # Drop unset parameters and send frames by their id.
        kwargs = {k: (v.frame_id if isinstance(v, H2OFrame) else v)
                  for k, v in kwargs.items() if v is not None}
        algo = self.model._compute_algo()  # unique to grid search
        if self.grid_id is not None: kwargs["grid_id"] = self.grid_id
        rest_ver = kwargs.pop("_rest_version", None)

        grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))

        if self._future:
            self._job = grid
            return

        grid.poll()

        grid_json = h2o.api("GET /99/Grids/%s" % (grid.dest_key))
        # Messages kept here are raised later if no grid model is returned.
        failure_messages = []
        if len(grid_json["failure_details"]) > 0:
            print("Errors/Warnings building gridsearch model\n")

            for error_index, error_message in enumerate(grid_json["failure_details"]):
                failed_params = grid_json["failed_params"][error_index]
                if isinstance(failed_params, dict):
                    for h_name in grid_json['hyper_names']:
                        print("Hyper-parameter: {0}, {1}".format(h_name, failed_params[h_name]))

                # A message is only reported/recorded when a matching stack trace exists.
                if len(grid_json["failure_stack_traces"]) > error_index:
                    print("failure_details: {0}\nfailure_stack_traces: "
                          "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))
                    failure_messages.append(error_message)

        self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

        # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
        # sometimes no model is returned due to bad parameter values provided by the user.
        if len(grid_json['model_ids']) > 0:
            first_model_json = h2o.api("GET /%d/Models/%s" %
                                       (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
            self._resolve_grid(grid.dest_key, grid_json, first_model_json)
        elif failure_messages:
            raise ValueError("".join(message + '\n' for message in failure_messages))
        else:
            raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
Beispiel #6
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """
        Submit a grid-search build to the backend and collect the models it produced.

        :param x: Predictor column(s): a single name/index, or a list/tuple of names and/or indices.
        :param y: Response column name or index; ``None`` if there is no response.
        :param tframe: Training frame; its ``names`` are used to resolve indices and to derive ignored columns.
        :param vframe: Validation frame, or ``None``.
        :param kwargs: Builder parameters. Must contain the keys ``offset_column``, ``fold_column`` and
            ``weights_column``. The dict is updated in place before a filtered copy is posted.
        :returns: ``None``. If ``self._future`` is set, the job is stored in ``self._job`` and the method
            returns without polling; otherwise ``self.models`` is populated.
        :raises ValueError: if the finished grid contains no model.
        """
        kwargs['training_frame'] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        if is_type(y, int): y = tframe.names[y]
        if y is not None: kwargs['response_column'] = y
        # Normalise x to a real list: a tuple cannot be concatenated with the list built below.
        x = list(x) if is_type(x, list, tuple) else [x]
        # Resolve every index individually so empty and mixed name/index inputs are handled.
        x = [tframe.names[i] if is_type(i, int) else i for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
        # Drop unset parameters and send frames by their id.
        kwargs = {k: (v.frame_id if isinstance(v, H2OFrame) else v)
                  for k, v in kwargs.items() if v is not None}
        algo = self.model._compute_algo()  # unique to grid search
        if self.grid_id is not None: kwargs["grid_id"] = self.grid_id
        rest_ver = kwargs.pop("_rest_version", None)

        grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))

        if self._future:
            self._job = grid
            return

        grid.poll()

        grid_json = h2o.api("GET /99/Grids/%s" % (grid.dest_key))
        # Messages kept here are raised later if no grid model is returned.
        failure_messages = []
        if len(grid_json["failure_details"]) > 0:
            print("Errors/Warnings building gridsearch model\n")

            for error_index, error_message in enumerate(grid_json["failure_details"]):
                failed_params = grid_json["failed_params"][error_index]
                if isinstance(failed_params, dict):
                    for h_name in grid_json['hyper_names']:
                        print("Hyper-parameter: {0}, {1}".format(h_name, failed_params[h_name]))

                # A message is only reported/recorded when a matching stack trace exists.
                if len(grid_json["failure_stack_traces"]) > error_index:
                    print("failure_details: {0}\nfailure_stack_traces: "
                          "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))
                    failure_messages.append(error_message)

        self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

        # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
        # sometimes no model is returned due to bad parameter values provided by the user.
        if len(grid_json['model_ids']) > 0:
            first_model_json = h2o.api("GET /%d/Models/%s" %
                                       (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
            self._resolve_grid(grid.dest_key, grid_json, first_model_json)
        elif failure_messages:
            raise ValueError("".join(message + '\n' for message in failure_messages))
        else:
            raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
Beispiel #7
0
 def fit(self, fr, **fit_params):
     """
     Send the assembly's steps to the backend and apply them to ``fr``.

     Each step is a pair whose second element provides ``to_rest`` and whose
     first element is passed to it. Stores the returned assembly name in
     ``self.id`` and returns the resulting frame. ``fit_params`` is accepted
     but not used.
     """
     rest_calls = [step[1].to_rest(step[0]) for step in self.steps]
     quoted_calls = [quoted(call.replace('"', "'")) for call in rest_calls]
     payload = {"steps": "[" + ",".join(quoted_calls) + "]", "frame": fr.frame_id}
     response = h2o.api("POST /99/Assembly", data=payload)
     self.id = response["assembly"]["name"]
     result_key = response["result"]["name"]
     return H2OFrame.get_frame(result_key)
Beispiel #8
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """
        Submit a model build to the backend and, unless running as a future, load the finished model.

        :param x: Predictor column(s): a single name/index, or a list/tuple of names and/or indices.
        :param y: Response column name or index; ``None`` if there is no response.
        :param tframe: Training frame; its ``names`` are used to resolve indices and to derive ignored columns.
        :param vframe: Validation frame, or ``None``.
        :param kwargs: Builder parameters. Must contain the keys ``offset_column``, ``fold_column`` and
            ``weights_column``. The dict is updated in place before a converted copy is posted.
        :returns: ``None``. If ``self._future`` is set, the job is stored in ``self._job`` and the method
            returns without polling.
        """
        kwargs["training_frame"] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        if is_type(y, int): y = tframe.names[y]
        if y is not None: kwargs["response_column"] = y
        # Normalise x to a real list: a tuple cannot be concatenated with the list built below.
        x = list(x) if isinstance(x, (list, tuple)) else [x]
        # Resolve every index individually so empty and mixed name/index inputs are handled.
        x = [tframe.names[i] if is_type(i, int) else i for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        ignored_columns = list(
            set(tframe.names) - set(x + [y, offset, folds, weights]))
        kwargs["ignored_columns"] = None if ignored_columns == [] else [
            quoted(col) for col in ignored_columns
        ]
        interactions = kwargs.get("interactions")
        kwargs["interactions"] = None if interactions is None else [
            quoted(col) for col in interactions
        ]
        # Frames (and lists of frames) are sent by key rather than as objects.
        kwargs = {
            k: H2OEstimator._keyify_if_h2oframe(v)
            for k, v in kwargs.items()
        }
        rest_ver = kwargs.pop("_rest_version", 3)

        model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" %
                               (rest_ver, self.algo),
                               data=kwargs),
                       job_type=(self.algo + " Model Build"))

        if self._future:
            self._job = model
            return

        model.poll()
        model_json = h2o.api("GET /%d/Models/%s" %
                             (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
Beispiel #9
0
 def fit(self, fr, **fit_params):
     """
     Apply the assembly's steps to ``fr`` on the backend.

     The REST form of every step is collected first, then each one has its
     double quotes replaced by single quotes and is quoted. The returned
     assembly name is stored in ``self.id`` and the resulting frame is
     returned. ``fit_params`` is accepted but not used.
     """
     rest_forms = [step[1].to_rest(step[0]) for step in self.steps]
     encoded = ",".join(quoted(form.replace('"', "'")) for form in rest_forms)
     request = {"steps": "[" + encoded + "]", "frame": fr.frame_id}
     reply = h2o.api("POST /99/Assembly", data=request)
     self.id = reply["assembly"]["name"]
     return H2OFrame.get_frame(reply["result"]["name"])
Beispiel #10
0
 def fit(self, fr):
     """
     Run the assembly's steps against the frame ``fr`` on the backend.

     ``fr`` must be an H2OFrame. The returned assembly name is stored in
     ``self.id`` and the resulting frame is returned.
     """
     assert_is_type(fr, H2OFrame)
     encoded_steps = []
     for step in self.steps:
         rest_form = step[1].to_rest(step[0])
         encoded_steps.append(quoted(rest_form.replace('"', "'")))
     steps = "[%s]" % ",".join(encoded_steps)
     request = {"steps": steps, "frame": fr.frame_id}
     reply = h2o.api("POST /99/Assembly", data=request)
     self.id = reply["assembly"]["name"]
     return H2OFrame.get_frame(reply["result"]["name"])
Beispiel #11
0
    def fit(self, fr):
        """
        Apply the munging operations listed in ``self.steps`` to the frame ``fr``.

        The name of the assembly created by the backend is stored in ``self.id``.

        :param fr: H2OFrame the munging operations are performed on.
        :return: H2OFrame produced once the munging operations have completed.
        """
        assert_is_type(fr, H2OFrame)
        encoded_steps = []
        for step in self.steps:
            rest_form = step[1].to_rest(step[0])
            encoded_steps.append(quoted(rest_form.replace('"', "'")))
        steps = "[%s]" % ",".join(encoded_steps)
        request = {"steps": steps, "frame": fr.frame_id}
        reply = h2o.api("POST /99/Assembly", data=request)
        self.id = reply["assembly"]["name"]
        result_key = reply["result"]["name"]
        return H2OFrame.get_frame(result_key)
Beispiel #12
0
    def fit(self, fr):
        """
        Apply the munging operations listed in ``self.steps`` to the frame ``fr``.

        The name of the assembly created by the backend is stored in ``self.id``.

        :param fr: H2OFrame the munging operations are performed on.
        :return: H2OFrame produced once the munging operations have completed.
        """
        assert_is_type(fr, H2OFrame)
        step_strings = []
        for step in self.steps:
            as_rest = step[1].to_rest(step[0])
            step_strings.append(quoted(as_rest.replace('"', "'")))
        steps = "[%s]" % ",".join(step_strings)
        payload = {"steps": steps, "frame": fr.frame_id}
        response = h2o.api("POST /99/Assembly", data=payload)
        self.id = response["assembly"]["name"]
        return H2OFrame.get_frame(response["result"]["name"])
Beispiel #13
0
def interaction(data, factors, pairwise, max_factors, min_occurrence, destination_frame=None):
    """
    Create categorical interaction features in H2O.

    Builds a frame holding n-th order interaction features between the given
    categorical columns and waits for the backend job to finish.

    Parameters
    ----------
      data : H2OFrame
        The H2OFrame that holds the target categorical columns.

      factors : list
        Factor columns, given as column names or as indices into ``data.names``.

      pairwise : bool
        Whether to create pairwise interactions between factors (otherwise create one
        higher-order interaction). Only applicable if there are 3 or more factors.

      max_factors : int
        Max. number of factor levels in pair-wise interaction terms (if enforced, one extra
        catch-all factor will be made)

      min_occurrence : int
        Min. occurrence threshold for factor levels in pair-wise interaction terms

      destination_frame : str
        The destination key. When None, a temporary key is generated on the client.

    Returns
    -------
      H2OFrame
    """
    factor_names = []
    for factor in factors:
        factor_names.append(data.names[factor] if is_int(factor) else factor)

    if destination_frame is None:
        dest_key = py_tmp_key(append=h2oconn.session_id)
    else:
        dest_key = destination_frame

    request = {
        "dest": dest_key,
        "source_frame": data.frame_id,
        "factor_columns": [quoted(name) for name in factor_names],
        "pairwise": pairwise,
        "max_factors": max_factors,
        "min_occurrence": min_occurrence,
    }
    job = H2OJob(api("POST /3/Interaction", data=request), "Interactions")
    job.poll()
    return get_frame(request["dest"])
Beispiel #14
0
    def fit(self, fr):
        """
        Apply the munging operations listed in ``self.steps`` to the frame ``fr``.

        The name of the assembly created by the backend is stored in ``self.id``.

        :param fr: H2OFrame the munging operations are performed on.
        :return: H2OFrame produced once the munging operations have completed.

        :examples:

        >>> iris = h2o.load_dataset("iris")
        >>> assembly = H2OAssembly(steps=[("col_select",
        ...                        H2OColSelect(["Sepal.Length",
        ...                        "Petal.Length", "Species"])),
        ...                       ("cos_Sepal.Length",
        ...                        H2OColOp(op=H2OFrame.cos,
        ...                        col="Sepal.Length",
        ...                        inplace=True)),
        ...                       ("str_cnt_Species",
        ...                        H2OColOp(op=H2OFrame.countmatches,
        ...                        col="Species",
        ...                        inplace=False,
        ...                        pattern="s"))])
        >>> fit = assembly.fit(iris)
        >>> fit

        """
        assert_is_type(fr, H2OFrame)
        rest_steps = []
        for step in self.steps:
            rest_call = step[1].to_rest(step[0])
            rest_steps.append(quoted(rest_call.replace('"', "'")))
        steps = "[%s]" % ",".join(rest_steps)
        body = {"steps": steps, "frame": fr.frame_id}
        answer = h2o.api("POST /99/Assembly", data=body)
        self.id = answer["assembly"]["name"]
        frame_key = answer["result"]["name"]
        return H2OFrame.get_frame(frame_key)
Beispiel #15
0
 def extend_parms(parms):
     """
     Adjust the parameter dict in place using values from the enclosing scope.

     Adds ``blending_frame`` when one is set, and removes the quoted
     metalearner fold column from ``parms['ignored_columns']`` when such a
     column is configured on ``self``.
     """
     if blending_frame is not None:
         parms['blending_frame'] = blending_frame
     fold_column = self.metalearner_fold_column
     if fold_column is not None:
         quoted_fold_column = quoted(fold_column)
         parms['ignored_columns'].remove(quoted_fold_column)
Beispiel #16
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
              model_id=None):
        """
        Train the H2O model.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
        :param ignored_columns: Column names or indices to exclude from the predictors. Cannot be given
            together with x.
        :param str model_id: When given, overrides the model_id parameter of the estimator.
        :raises H2OValueError: if a referenced column is not in the training frame, if y is given for an
            unsupervised model, or if both x and ignored_columns are specified.
        """
        assert_is_type(training_frame, H2OFrame)
        assert_is_type(validation_frame, None, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        algo = self.algo
        parms = self._parms.copy()
        if "__class__" in parms:  # FIXME: hack for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not(is_auto_encoder or algo in {"pca", "svd", "kmeans", "glrm", "word2vec"})
        ncols = training_frame.ncols
        names = training_frame.names

        def resolve_columns(columns):
            """Validate names/indices against the training frame and return the matching set of names."""
            resolved = set()
            for col in columns:
                if is_type(col, int):
                    if not (-ncols <= col < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % col)
                    resolved.add(names[col])
                else:
                    if col not in names:
                        raise H2OValueError("Column %s not in the training frame" % col)
                    resolved.add(col)
            return resolved

        # After this section y is a column name for supervised models and None otherwise.
        if is_supervised:
            if y is None: y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
        elif y is not None:
            raise H2OValueError("y should not be provided for an unsupervised model")
        assert_is_type(y, str, None)

        ignored_columns_set = set()
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            ignored_columns_set = resolve_columns(ignored_columns)
        if x is None:
            xset = set(names) - {y} - ignored_columns_set
        else:
            if is_type(x, int, str): x = [x]
            xset = resolve_columns(x)
        # x is now a (possibly empty) list of column names only, so no further
        # index resolution is needed and x[0] must not be touched.
        x = list(xset)

        # NOTE(review): integer offset/fold/weights values are forwarded unchanged and are not
        # resolved to column names here -- confirm the backend accepts indices.
        parms["offset_column"] = offset_column
        parms["fold_column"] = fold_column
        parms["weights_column"] = weights_column
        parms["max_runtime_secs"] = max_runtime_secs
        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        parms["training_frame"] = training_frame
        if validation_frame is not None: parms["validation_frame"] = validation_frame
        if y is not None: parms["response_column"] = y
        # Every column that is neither a predictor nor a special column is ignored by the builder.
        used_columns = set(x + [y, offset_column, fold_column, weights_column])
        ignored_columns = list(set(names) - used_columns)
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        interactions = parms.get("interactions")
        parms["interactions"] = None if interactions is None else [quoted(col) for col in interactions]
        # Frames (and lists of frames) are sent by key rather than as objects.
        parms = {k: H2OEstimator._keyify_if_h2oframe(v) for k, v in parms.items()}
        rest_ver = parms.pop("_rest_version", 3)

        model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms),
                       job_type=(self.algo + " Model Build"))

        if self._future:
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll()
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
Beispiel #17
0
def parse_setup(raw_frames, destination_frame="", header=(-1, 0, 1), separator="", column_names=None,
                column_types=None, na_strings=None):
    """During parse setup, the H2O cluster will make several guesses about the attributes of
    the data. This method allows a user to perform corrective measures by updating the
    returning dictionary from this method. This dictionary is then fed into `parse_raw` to
    produce the H2OFrame instance.

    Parameters
    ----------
      raw_frames : str or list
        The id of an imported raw file frame, or a collection of such ids.

      destination_frame : str, optional
        The unique hex key assigned to the imported file. If empty, the value returned
        by the backend is left untouched. Any "%" or "&" is replaced by ".".

      header : int, optional
        -1 means the first line is data, 0 means guess, 1 means first line is header.
        The default tuple means "not specified".

      separator : str, optional
        The field separator character. Values on each line of the file are separated by this
        character. If separator = "", it is not sent and the backend's guess is used.

      column_names : list, optional
        A list of column names for the file.

      column_types : list or dict, optional
          A list of types or a dictionary of column names to types to specify whether columns
          should be forced to a certain type upon import parsing. If a list, the types for
          elements that are None will be guessed. The possible types a column may have are:
          "unknown" - this will force the column to be parsed as all NA
          "uuid"    - the values in the column must be true UUID or will be parsed as NA
          "string"  - force the column to be parsed as a string
          "numeric" - force the column to be parsed as numeric. H2O will handle the
          compression of the numeric data in the optimal manner.
          "enum"    - force the column to be parsed as a categorical column.
          "time"    - force the column to be parsed as a time column. H2O will attempt to
          parse the following list of date time formats
          date - "yyyy-MM-dd", "yyyy MM dd", "dd-MMM-yy", "dd MMM yy"
          time - "HH:mm:ss", "HH:mm:ss:SSS", "HH:mm:ss:SSSnnnnnn", "HH.mm.ss" "HH.mm.ss.SSS",
          "HH.mm.ss.SSSnnnnnn"
          Times can also contain "AM" or "PM".

      na_strings : list or dict, optional
        A list of strings, or a list of lists of strings (one list per column), or a
        dictionary of column names to strings which are to be interpreted as missing values.

    Returns
    -------
      A dictionary is returned containing all of the guesses made by the H2O back end.

    Raises
    ------
      ValueError
        If any argument has an unsupported value or shape (see the individual checks below).
    """

    # Accept a single frame id as well as a collection of ids
    if is_str(raw_frames): raw_frames = [raw_frames]

    # temporary dictionary just to pass the following information to the parser: header, separator
    kwargs = {}
    # set header
    if header != (-1, 0, 1):
        if header not in (-1, 0, 1): raise ValueError("header should be -1, 0, or 1")
        kwargs["check_header"] = header

    # set separator
    if separator:
        if not is_str(separator) or len(separator) != 1:
            raise ValueError("separator should be a single character string; got %r" % separator)
        kwargs["separator"] = ord(separator)

    # The H2O backend only accepts things that are quoted
    kwargs["source_frames"] = [quoted(frame_id) for frame_id in raw_frames]
    j = api("POST /3/ParseSetup", data=kwargs)
    if "warnings" in j and j["warnings"]:
        for w in j['warnings']:
            warnings.warn(w)
    # TODO: really should be url encoding...
    if destination_frame: j["destination_frame"] = destination_frame.replace("%", ".").replace("&", ".")
    if column_names is not None:
        if not isinstance(column_names, list): raise ValueError("col_names should be a list")
        if len(column_names) != len(j["column_types"]): raise ValueError(
            "length of col_names should be equal to the number of columns")
        j["column_names"] = column_names
    if column_types is not None:
        if isinstance(column_types, dict):
            # overwrite dictionary to ordered list of column types. if user didn't specify column type for all names,
            # use type provided by backend
            if j["column_names"] is None:  # no colnames discovered! (C1, C2, ...)
                j["column_names"] = gen_header(j["number_columns"])
            if not set(column_types.keys()).issubset(set(j["column_names"])): raise ValueError(
                "names specified in col_types is not a subset of the column names")
            column_types = [column_types[name] if name in column_types else j["column_types"][idx]
                            for idx, name in enumerate(j["column_names"])]
        elif isinstance(column_types, list):
            if len(column_types) != len(j["column_types"]): raise ValueError(
                "length of col_types should be equal to the number of columns")
            # falsy entries (e.g. None) fall back to the type guessed by the backend
            column_types = [given or guessed for given, guessed in zip(column_types, j["column_types"])]
        else:  # not dictionary or list
            raise ValueError("col_types should be a list of types or a dictionary of column names to types")
        j["column_types"] = column_types
    if na_strings is not None:
        if isinstance(na_strings, dict):
            # overwrite dictionary to ordered list of lists of na_strings
            if not j["column_names"]: raise ValueError("column names should be specified")
            if not set(na_strings.keys()).issubset(set(j["column_names"])): raise ValueError(
                "names specified in na_strings is not a subset of the column names")
            j["na_strings"] = [[] for _ in j["column_names"]]
            for name, na in na_strings.items():
                idx = j["column_names"].index(name)
                if is_str(na): na = [na]
                j["na_strings"][idx].extend(quoted(n) for n in na)
        elif is_list_of_lists(na_strings):
            if len(na_strings) != len(j["column_types"]): raise ValueError(
                "length of na_strings should be equal to the number of columns")
            j["na_strings"] = [[quoted(na) for na in col] if col is not None else [] for col in na_strings]
        elif isinstance(na_strings, list):
            # one independent copy per column, so the columns do not share a single list object
            quoted_nas = [quoted(na) for na in na_strings]
            j["na_strings"] = [list(quoted_nas) for _ in j["column_types"]]
        else:  # not a dictionary or list
            raise ValueError(
                "na_strings should be a list, a list of lists (one list per column), or a dictionary of column "
                "names to strings which are to be interpreted as missing values")

    # quote column names and column types also when not specified by user
    if j["column_names"]: j["column_names"] = list(map(quoted, j["column_names"]))
    j["column_types"] = list(map(quoted, j["column_types"]))
    return j
Beispiel #18
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
              model_id=None, verbose=False):
        """
        Train the H2O model.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
        :param bool verbose: Print scoring history to stdout. Defaults to False.
        """

        assert_is_type(training_frame, None, H2OFrame)
        assert_is_type(validation_frame, None, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        assert_is_type(verbose, bool)

        if self._requires_training_frame() and training_frame is None:
            raise H2OValueError("Training frame required for %s algorithm, but none was given.", self.algo)

        training_frame_exists = training_frame is None
        if training_frame_exists:
            self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

        algo = self.algo
        if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
            raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
        parms = self._parms.copy()
        if "__class__" in parms:  # FIXME: hackt for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"})
        if not training_frame_exists:
            names = training_frame.names
            ncols = training_frame.ncols

        if is_supervised:
            if y is None: y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
        else:
            # If `y` is provided for an unsupervised model we'll simply ignore
            # it. This way an unsupervised model can be used as a step in
            # sklearn's pipeline.
            y = None

        if not training_frame_exists:
            assert_is_type(y, str, None)
            ignored_columns_set = set()
            if ignored_columns is not None:
                if x is not None:
                    raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
                for ic in ignored_columns:
                    if is_type(ic, int):
                        if not (-ncols <= ic < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % ic)
                        ignored_columns_set.add(names[ic])
                    else:
                        if ic not in names:
                            raise H2OValueError("Column %s not in the training frame" % ic)
                        ignored_columns_set.add(ic)
            if x is None:
                xset = set(names) - {y} - ignored_columns_set
            else:
                xset = set()
                if is_type(x, int, str): x = [x]
                for xi in x:
                    if is_type(xi, int):
                        if not (-ncols <= xi < ncols):
                            raise H2OValueError("Column %d does not exist in the training frame" % xi)
                        xset.add(names[xi])
                    else:
                        if xi not in names:
                            raise H2OValueError("Column %s not in the training frame" % xi)
                        xset.add(xi)
            x = list(xset)

            parms["offset_column"] = offset_column
            parms["fold_column"] = fold_column
            parms["weights_column"] = weights_column
            parms["max_runtime_secs"] = max_runtime_secs

        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        # Step 2
        is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
        is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"}
        if is_auto_encoder and y is not None: raise ValueError("y should not be specified for autoencoder.")
        if not is_unsupervised and y is None: raise ValueError("Missing response")

        # Step 3
        if not training_frame_exists:
            parms["training_frame"] = training_frame
            offset = parms["offset_column"]
            folds = parms["fold_column"]
            weights = parms["weights_column"]

        if validation_frame is not None: parms["validation_frame"] = validation_frame
        if is_type(y, int): y = training_frame.names[y]
        if y is not None: parms["response_column"] = y
        if not isinstance(x, (list, tuple)): x = [x]
        if is_type(x[0], int):
            x = [training_frame.names[i] for i in x]
        if not training_frame_exists:
            ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
            parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else
                                 [quoted(col) for col in parms["interactions"]])
        parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None else
                                 [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])

        parms = {k: H2OEstimator._keyify_if_h2oframe(parms[k]) for k in parms}
        rest_ver = parms.pop("_rest_version") if "_rest_version" in parms else 3

        model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
        model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

        if self._future:
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll(verbose_model_scoring_history=verbose)
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
Beispiel #19
0
    def _train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
               weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
               model_id=None, verbose=False, extend_parms_fn=None):
        """
        Validate the training arguments, assemble the builder parameters and launch the model build.

        :param x: A column name/index, or a list/set of them, selecting the predictor columns. When omitted
            and a training frame is passed, every column except the response and the ignored ones is used.
        :param y: An index or a column name indicating the response column. Defaults to ``"response"`` for
            supervised algorithms; it is discarded for unsupervised ones.
        :param training_frame: The frame to train on. When omitted, ``self.training_frame`` is used if set.
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: Frame with validation data to be scored on while training.
        :param max_runtime_secs: Maximum allowed runtime in seconds; only sent when not None.
        :param ignored_columns: Column names or indices to leave out of the predictors. Cannot be combined
            with ``x``. Falls back to the ``ignored_columns`` entry of the stored parameters, if any.
        :param model_id: Overrides the stored ``model_id`` parameter when given.
        :param bool verbose: Print scoring history while polling. Only accepted for drf, gbm, deeplearning
            and xgboost.
        :param extend_parms_fn: Optional function called with the parameter dict just before it is
            finalized, so that subclasses can extend it.

        :raises H2OValueError: if ``verbose`` is used with an unsupported algorithm, if a referenced column is
            not in the training frame, if both ``x`` and ``ignored_columns`` are given, or if the stopping
            metric contains ``r2``.
        :raises ValueError: if the response is inconsistent with the kind of model being built.

        Returns None. When ``self._future`` is set, the job is stored in ``self._job`` and the method returns
        without waiting for the build to finish.
        """
        has_default_training_frame = hasattr(self, 'training_frame') and self.training_frame is not None
        training_frame = H2OFrame._validate(training_frame, 'training_frame',
                                            required=self._requires_training_frame() and not has_default_training_frame)
        validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        assert_is_type(verbose, bool)
        assert_is_type(extend_parms_fn, None, FunctionType)

        override_default_training_frame = training_frame is not None
        if not override_default_training_frame:
            self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
            training_frame = self.training_frame if has_default_training_frame else None

        algo = self.algo
        if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
            raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
        parms = self._parms.copy()
        if algo == "pca" and "k" not in parms:
            parms["k"] = 1
        parms.pop("__class__", None)  # FIXME: hack for PY3
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not (is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec",
                                                         "isolationforest", "generic"})

        # Empty defaults let the column checks below fail with a proper H2OValueError when no frame is available.
        names = training_frame.names if training_frame is not None else []
        ncols = training_frame.ncols if training_frame is not None else 0
        types = training_frame.types if training_frame is not None else {}

        def resolve_column(col):
            # Translate a column index or name into a column name of the training frame.
            if is_type(col, int):
                if not (-ncols <= col < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % col)
                return names[col]
            if col not in names:
                raise H2OValueError("Column %s not in the training frame" % col)
            return col

        if is_supervised:
            if y is None:
                y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            elif y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
            self._estimator_type = "classifier" if types[y] == "enum" else "regressor"
        else:
            # If `y` is provided for an unsupervised model we'll simply ignore
            # it. This way an unsupervised model can be used as a step in
            # sklearn's pipeline.
            y = None

        if override_default_training_frame:
            assert_is_type(y, str, None)
            ignored_columns_set = set()
            if ignored_columns is None and "ignored_columns" in parms:
                ignored_columns = parms['ignored_columns']
            if ignored_columns is not None:
                if x is not None:
                    raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
                ignored_columns_set = {resolve_column(ic) for ic in ignored_columns}
            if x is None:
                xset = set(names) - {y} - ignored_columns_set
            else:
                if is_type(x, int, str):
                    x = [x]
                xset = {resolve_column(xi) for xi in x}
            x = list(xset)
            self._check_and_save_parm(parms, "offset_column", offset_column)
            self._check_and_save_parm(parms, "weights_column", weights_column)
            self._check_and_save_parm(parms, "fold_column", fold_column)

        if max_runtime_secs is not None:
            parms["max_runtime_secs"] = max_runtime_secs

        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        # Step 2: consistency checks between the response and the kind of model (kept as a safety net).
        is_unsupervised = is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec",
                                                      "isolationforest"}
        if is_auto_encoder and y is not None:
            raise ValueError("y should not be specified for autoencoder.")
        if not is_unsupervised and y is None and algo not in ["generic"]:
            raise ValueError("Missing response")

        # Step 3: fill in the frame-related parameters.
        if override_default_training_frame:
            parms["training_frame"] = training_frame
            offset = parms["offset_column"]
            folds = parms["fold_column"]
            weights = parms["weights_column"]

        if validation_frame is not None:
            parms["validation_frame"] = validation_frame
        if y is not None:
            parms["response_column"] = y
        if not isinstance(x, (list, tuple)):
            x = [x]
        # `x` may legitimately be empty here; only peek at the first element when there is one.
        if x and is_type(x[0], int):
            x = [names[i] for i in x]
        if override_default_training_frame:
            # Everything that is not a predictor, the response or a special column gets ignored.
            # NOTE(review): integer offset/fold/weights values are not translated to names here, so such
            # a column would not be kept out of ignored_columns -- confirm that indices are supported.
            used_columns = set(x + [y, offset, folds, weights] + self._additional_used_columns(parms))
            ignored_columns = list(set(names) - used_columns)
            parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]

        interactions = parms.get("interactions")
        parms["interactions"] = None if interactions is None else [quoted(col) for col in interactions]
        interaction_pairs = parms.get("interaction_pairs")
        parms["interaction_pairs"] = (None if interaction_pairs is None
                                      else [tuple(map(quoted, ip)) for ip in interaction_pairs])

        # internal hook allowing subclasses to extend train parms
        if extend_parms_fn is not None:
            extend_parms_fn(parms)

        parms = {k: H2OEstimator._keyify_if_h2oframe(v) for k, v in parms.items()}
        # The entry may be present but None, in which case the containment test itself would fail.
        stopping_metric = parms.get("stopping_metric")
        if stopping_metric is not None and "r2" in stopping_metric:
            raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
        rest_ver = parms.pop("_rest_version", 3)

        model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, algo), data=parms)
        model = H2OJob(model_builder_json, job_type=(algo + " Model Build"))

        if self._future:
            # Asynchronous mode: remember the job and let the caller wait for it later.
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll(poll_updates=self._print_model_scoring_history if verbose else None)
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
Beispiel #20
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
              model_id=None, verbose=False):
        """
        Train the H2O model.

        :param x: A list of column names or indices indicating the predictor columns. When omitted and a
            training frame is given, every column except the response and the ignored ones is used.
        :param y: An index or a column name indicating the response column. Defaults to ``"response"`` for
            supervised algorithms; it is discarded for unsupervised ones.
        :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
        :param ignored_columns: Column names or indices to leave out of the predictors. Cannot be combined
            with ``x``.
        :param str model_id: Overrides the stored ``model_id`` parameter when given.
        :param bool verbose: Print scoring history to stdout. Defaults to False.

        :raises H2OValueError: if a required training frame is missing, if ``verbose`` is used with an
            unsupported algorithm, if a referenced column is not in the training frame, if both ``x`` and
            ``ignored_columns`` are given, or if the stopping metric contains ``r2``.
        :raises ValueError: if the response is inconsistent with the kind of model being built.

        Returns None. When ``self._future`` is set, the job is stored in ``self._job`` and the method returns
        without waiting for the build to finish.
        """
        assert_is_type(training_frame, None, H2OFrame)
        assert_is_type(validation_frame, None, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        assert_is_type(verbose, bool)

        if self._requires_training_frame() and training_frame is None:
            # The algorithm name has to be interpolated into the message rather than passed as a
            # second constructor argument, otherwise the text keeps a literal "%s".
            raise H2OValueError("Training frame required for %s algorithm, but none was given." % self.algo)

        has_training_frame = training_frame is not None
        if not has_training_frame:
            self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

        algo = self.algo
        if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
            raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
        parms = self._parms.copy()
        parms.pop("__class__", None)  # FIXME: hack for PY3
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_unsupervised = is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"}
        is_supervised = not is_unsupervised

        # Empty defaults make the column checks below raise H2OValueError (instead of failing on an
        # unbound name) when a supervised model is trained without a frame.
        names = training_frame.names if has_training_frame else []
        ncols = training_frame.ncols if has_training_frame else 0

        def resolve_column(col):
            # Translate a column index or name into a column name of the training frame.
            if is_type(col, int):
                if not (-ncols <= col < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % col)
                return names[col]
            if col not in names:
                raise H2OValueError("Column %s not in the training frame" % col)
            return col

        if is_supervised:
            if y is None:
                y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            elif y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
            # Reaching this line implies a frame is present: without one the checks above always raise.
            self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
        else:
            # If `y` is provided for an unsupervised model we'll simply ignore
            # it. This way an unsupervised model can be used as a step in
            # sklearn's pipeline.
            y = None

        if has_training_frame:
            assert_is_type(y, str, None)
            ignored_columns_set = set()
            if ignored_columns is not None:
                if x is not None:
                    raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
                ignored_columns_set = {resolve_column(ic) for ic in ignored_columns}
            if x is None:
                xset = set(names) - {y} - ignored_columns_set
            else:
                if is_type(x, int, str):
                    x = [x]
                xset = {resolve_column(xi) for xi in x}
            x = list(xset)

            parms["offset_column"] = offset_column
            parms["fold_column"] = fold_column
            parms["weights_column"] = weights_column

        if max_runtime_secs is not None:
            parms["max_runtime_secs"] = max_runtime_secs

        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        # Step 2: consistency checks between the response and the kind of model (kept as a safety net).
        if is_auto_encoder and y is not None:
            raise ValueError("y should not be specified for autoencoder.")
        if not is_unsupervised and y is None:
            raise ValueError("Missing response")

        # Step 3: fill in the frame-related parameters.
        if has_training_frame:
            parms["training_frame"] = training_frame
        if validation_frame is not None:
            parms["validation_frame"] = validation_frame
        if y is not None:
            parms["response_column"] = y
        if not isinstance(x, (list, tuple)):
            x = [x]
        # `x` may legitimately be empty here; only peek at the first element when there is one.
        if x and is_type(x[0], int):
            x = [training_frame.names[i] for i in x]
        if has_training_frame:
            # Everything that is not a predictor, the response or a special column gets ignored.
            # NOTE(review): integer offset/fold/weights values are not translated to names here, so such
            # a column would not be kept out of ignored_columns -- confirm that indices are supported.
            used_columns = set(x + [y, offset_column, fold_column, weights_column])
            ignored_columns = list(set(names) - used_columns)
            parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]

        interactions = parms.get("interactions")
        parms["interactions"] = None if interactions is None else [quoted(col) for col in interactions]
        interaction_pairs = parms.get("interaction_pairs")
        parms["interaction_pairs"] = (None if interaction_pairs is None
                                      else [tuple(map(quoted, ip)) for ip in interaction_pairs])

        parms = {k: H2OEstimator._keyify_if_h2oframe(v) for k, v in parms.items()}
        # The entry may be present but None, in which case the containment test itself would fail.
        stopping_metric = parms.get("stopping_metric")
        if stopping_metric is not None and "r2" in stopping_metric:
            raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
        rest_ver = parms.pop("_rest_version", 3)

        model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, algo), data=parms)
        model = H2OJob(model_builder_json, job_type=(algo + " Model Build"))

        if self._future:
            # Asynchronous mode: remember the job and let the caller wait for it later.
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll(verbose_model_scoring_history=verbose)
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
Beispiel #21
0
    def _train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
               weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
               model_id=None, verbose=False, extend_parms_fn=None):
        """
        Validate the training arguments, assemble the builder parameters and launch the model build.

        :param x: A column name/index, or a list/set of them, selecting the predictor columns. When omitted
            and a training frame is given, every column except the response and the ignored ones is used.
        :param y: An index or a column name indicating the response column. Defaults to ``"response"`` for
            supervised algorithms; it is discarded for unsupervised ones.
        :param H2OFrame training_frame: The frame to train on.
        :param offset_column: The name or index of the column in training_frame that holds the offsets.
        :param fold_column: The name or index of the column in training_frame that holds the per-row fold
            assignments.
        :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
        :param validation_frame: H2OFrame with validation data to be scored on while training.
        :param max_runtime_secs: Maximum allowed runtime in seconds; only sent when not None.
        :param ignored_columns: Column names or indices to leave out of the predictors. Cannot be combined
            with ``x``. Falls back to the ``ignored_columns`` entry of the stored parameters, if any.
        :param str model_id: Overrides the stored ``model_id`` parameter when given.
        :param bool verbose: Print scoring history while polling. Only accepted for drf, gbm, deeplearning
            and xgboost.
        :param extend_parms_fn: Optional function called with the parameter dict just before it is
            finalized, so that subclasses can extend it.

        :raises H2OValueError: if a required training frame is missing, if ``verbose`` is used with an
            unsupported algorithm, if a referenced column is not in the training frame, if both ``x`` and
            ``ignored_columns`` are given, or if the stopping metric contains ``r2``.
        :raises ValueError: if the response is inconsistent with the kind of model being built.

        Returns None. When ``self._future`` is set, the job is stored in ``self._job`` and the method returns
        without waiting for the build to finish.
        """
        assert_is_type(training_frame, None, H2OFrame)
        assert_is_type(validation_frame, None, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        assert_is_type(model_id, None, str)
        assert_is_type(verbose, bool)
        assert_is_type(extend_parms_fn, None, FunctionType)

        if self._requires_training_frame() and training_frame is None:
            raise H2OValueError("Training frame required for %s algorithm, but none was given." % self.algo)

        has_training_frame = training_frame is not None
        if not has_training_frame:
            self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

        algo = self.algo
        if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
            raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
        parms = self._parms.copy()
        parms.pop("__class__", None)  # FIXME: hack for PY3
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not (is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec",
                                                         "isolationforest", "generic"})

        # Empty defaults make the column checks below raise H2OValueError (instead of failing on an
        # unbound name) when a supervised model is trained without a frame.
        names = training_frame.names if has_training_frame else []
        ncols = training_frame.ncols if has_training_frame else 0

        def resolve_column(col):
            # Translate a column index or name into a column name of the training frame.
            if is_type(col, int):
                if not (-ncols <= col < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % col)
                return names[col]
            if col not in names:
                raise H2OValueError("Column %s not in the training frame" % col)
            return col

        if is_supervised:
            if y is None:
                y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            elif y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
            # Reaching this line implies a frame is present: without one the checks above always raise.
            self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
        else:
            # If `y` is provided for an unsupervised model we'll simply ignore
            # it. This way an unsupervised model can be used as a step in
            # sklearn's pipeline.
            y = None

        if has_training_frame:
            assert_is_type(y, str, None)
            ignored_columns_set = set()
            if ignored_columns is None and "ignored_columns" in parms:
                ignored_columns = parms['ignored_columns']
            if ignored_columns is not None:
                if x is not None:
                    raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
                ignored_columns_set = {resolve_column(ic) for ic in ignored_columns}
            if x is None:
                xset = set(names) - {y} - ignored_columns_set
            else:
                if is_type(x, int, str):
                    x = [x]
                xset = {resolve_column(xi) for xi in x}
            x = list(xset)
            self._check_and_save_parm(parms, "offset_column", offset_column)
            self._check_and_save_parm(parms, "weights_column", weights_column)
            self._check_and_save_parm(parms, "fold_column", fold_column)

        if max_runtime_secs is not None:
            parms["max_runtime_secs"] = max_runtime_secs

        # Overwrites the model_id parameter only if model_id is passed
        if model_id is not None:
            parms["model_id"] = model_id

        # Step 2: consistency checks between the response and the kind of model (kept as a safety net).
        is_unsupervised = is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec",
                                                      "isolationforest"}
        if is_auto_encoder and y is not None:
            raise ValueError("y should not be specified for autoencoder.")
        if not is_unsupervised and y is None and algo not in ["generic"]:
            raise ValueError("Missing response")

        # Step 3: fill in the frame-related parameters.
        if has_training_frame:
            parms["training_frame"] = training_frame
            offset = parms["offset_column"]
            folds = parms["fold_column"]
            weights = parms["weights_column"]

        if validation_frame is not None:
            parms["validation_frame"] = validation_frame
        if y is not None:
            parms["response_column"] = y
        if not isinstance(x, (list, tuple)):
            x = [x]
        # `x` may legitimately be empty here; only peek at the first element when there is one.
        if x and is_type(x[0], int):
            x = [training_frame.names[i] for i in x]
        if has_training_frame:
            # Everything that is not a predictor, the response or a special column gets ignored.
            # NOTE(review): integer offset/fold/weights values are not translated to names here, so such
            # a column would not be kept out of ignored_columns -- confirm that indices are supported.
            used_columns = set(x + [y, offset, folds, weights])
            ignored_columns = list(set(names) - used_columns)
            parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]

        interactions = parms.get("interactions")
        parms["interactions"] = None if interactions is None else [quoted(col) for col in interactions]
        interaction_pairs = parms.get("interaction_pairs")
        parms["interaction_pairs"] = (None if interaction_pairs is None
                                      else [tuple(map(quoted, ip)) for ip in interaction_pairs])

        # internal hook allowing subclasses to extend train parms
        if extend_parms_fn is not None:
            extend_parms_fn(parms)

        parms = {k: H2OEstimator._keyify_if_h2oframe(v) for k, v in parms.items()}
        # The entry may be present but None, in which case the containment test itself would fail.
        stopping_metric = parms.get("stopping_metric")
        if stopping_metric is not None and "r2" in stopping_metric:
            raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
        rest_ver = parms.pop("_rest_version", 3)

        model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, algo), data=parms)
        model = H2OJob(model_builder_json, job_type=(algo + " Model Build"))

        if self._future:
            # Asynchronous mode: remember the job and let the caller wait for it later.
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll(verbose_model_scoring_history=verbose)
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)
Beispiel #22
0
 def _make_parms(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
                 weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
                 model_id=None, verbose=False, extend_parms_fn=None):
     has_default_training_frame = hasattr(self, 'training_frame') and self.training_frame is not None
     training_frame = H2OFrame._validate(training_frame, 'training_frame',
                                         required=self._options_.get('requires_training_frame', True) and not has_default_training_frame)
     validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
     assert_is_type(y, None, int, str)
     assert_is_type(x, None, int, str, [str, int], {str, int})
     assert_is_type(ignored_columns, None, [str, int], {str, int})
     assert_is_type(offset_column, None, int, str)
     assert_is_type(fold_column, None, int, str)
     assert_is_type(weights_column, None, int, str)
     assert_is_type(max_runtime_secs, None, numeric)
     assert_is_type(model_id, None, str)
     assert_is_type(verbose, bool)
     assert_is_type(extend_parms_fn, None, FunctionType)
 
     override_default_training_frame = training_frame is not None
     if not override_default_training_frame:
         self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
         training_frame = self.training_frame if has_default_training_frame else None
 
     if verbose and not self._options_.get('verbose', False):
         raise H2OValueError("Verbose mode is not available for %s" % self.__class__.__name__)
     parms = self._parms.copy()
     names = training_frame.names if training_frame is not None else []
     ncols = training_frame.ncols if training_frame is not None else 0
     types = training_frame.types if training_frame is not None else {}
 
     if self.supervised_learning:
         if y is None: y = "response"
         if is_type(y, int):
             if not (-ncols <= y < ncols):
                 raise H2OValueError("Column %d does not exist in the training frame" % y)
             y = names[y]
         else:
             if y not in names:
                 raise H2OValueError("Column %s does not exist in the training frame" % y)
         self._estimator_type = "classifier" if types[y] == "enum" else "regressor"
     else:
         # If `y` is provided for an unsupervised model we'll simply ignore
         # it. This way an unsupervised model can be used as a step in
         # sklearn's pipeline.
         y = None
         self._estimator_type = "unsupervised"
 
     if override_default_training_frame:
         assert_is_type(y, str, None)
         ignored_columns_set = set()
         if ignored_columns is None and "ignored_columns" in parms:
             ignored_columns = parms['ignored_columns']
         if ignored_columns is not None:
             if x is not None:
                 raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
             for ic in ignored_columns:
                 if is_type(ic, int):
                     if not (-ncols <= ic < ncols):
                         raise H2OValueError("Column %d does not exist in the training frame" % ic)
                     ignored_columns_set.add(names[ic])
                 else:
                     if ic not in names:
                         raise H2OValueError("Column %s not in the training frame" % ic)
                     ignored_columns_set.add(ic)
         if x is None:
             xset = set(names) - {y} - ignored_columns_set
         else:
             xset = set()
             if is_type(x, int, str): x = [x]
             for xi in x:
                 if is_type(xi, int):
                     if not (-ncols <= xi < ncols):
                         raise H2OValueError("Column %d does not exist in the training frame" % xi)
                     xset.add(names[xi])
                 else:
                     if xi not in names:
                         raise H2OValueError("Column %s not in the training frame" % xi)
                     xset.add(xi)
         x = list(xset)
         self._check_and_save_parm(parms, "offset_column", offset_column)
         self._check_and_save_parm(parms, "weights_column", weights_column)
         self._check_and_save_parm(parms, "fold_column", fold_column)
 
     if max_runtime_secs is not None: parms["max_runtime_secs"] = max_runtime_secs
 
     # Overwrites the model_id parameter only if model_id is passed
     if model_id is not None:
         parms["model_id"] = model_id
     if override_default_training_frame:
         parms["training_frame"] = training_frame
         offset = parms["offset_column"]
         folds = parms["fold_column"]
         weights = parms["weights_column"]
 
     if validation_frame is not None:
         parms["validation_frame"] = validation_frame
 
     if is_type(y, int):
         y = names[y]
     if y is not None:
         parms["response_column"] = y
     if not isinstance(x, (list, tuple)):
         x = [x]
     if len(x) > 0 and is_type(x[0], int):
         x = [names[i] for i in x]
     if override_default_training_frame:
         ignored_columns = list(set(names) - set(x + [y, offset, folds, weights]))
         parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
     parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None
                              else [quoted(col) for col in parms["interactions"]])
     parms["interaction_pairs"] = (None if "interaction_pairs" not in parms or parms["interaction_pairs"] is None
                                   else [tuple(map(quoted, ip)) for ip in parms["interaction_pairs"]])
 
     # internal hook allowing subclasses to extend train parms 
     if extend_parms_fn is not None:
         extend_parms_fn(parms)
 
     parms = {k: H2OEstimator._keyify(v) for k, v in parms.items()}
     if "r2" in (parms.get('stopping_metric') or []):
         raise H2OValueError("r2 cannot be used as an early stopping_metric yet.  Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
     return parms
Beispiel #23
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
              weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None):
        """
        Train the H2O model.

        Parameters
        ----------
        x : int, str, list, set, None
            A column name or index, or a collection of them, indicating the predictor
            columns. When None, every column except ``y`` and ``ignored_columns`` is used.

        y : int, str, None
            An index or a column name indicating the response column. For supervised
            models it defaults to "response"; it must be None for unsupervised models.

        training_frame : H2OFrame
            The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).

        offset_column : int, str, optional
            The name or index of the column in training_frame that holds the offsets.

        fold_column : int, str, optional
            The name or index of the column in training_frame that holds the per-row fold
            assignments.

        weights_column : int, str, optional
            The name or index of the column in training_frame that holds the per-row weights.

        validation_frame : H2OFrame, optional
            H2OFrame with validation data to be scored on while training.

        max_runtime_secs : float, optional
            Maximum allowed runtime in seconds for model training. Use 0 to disable.
            When None, a value already present in the estimator's parameters is kept.

        ignored_columns : list, set, None
            Column names or indices to exclude from the predictors. Cannot be combined
            with ``x``.

        Returns
        -------
        None. When the estimator is in "future" mode (``self._future``) the build job is
        stored in ``self._job`` and the call returns without waiting for it to finish;
        otherwise the job is polled and the resulting model is resolved into this object.

        Raises
        ------
        H2OValueError
            If a referenced column does not exist in the training frame, if ``y`` is given
            for an unsupervised model, or if both ``x`` and ``ignored_columns`` are given.
        """
        assert_is_type(training_frame, H2OFrame)
        assert_is_type(validation_frame, None, H2OFrame)
        assert_is_type(y, None, int, str)
        assert_is_type(x, None, int, str, [str, int], {str, int})
        assert_is_type(ignored_columns, None, [str, int], {str, int})
        assert_is_type(offset_column, None, int, str)
        assert_is_type(fold_column, None, int, str)
        assert_is_type(weights_column, None, int, str)
        assert_is_type(max_runtime_secs, None, numeric)
        algo = self.algo
        parms = self._parms.copy()
        if "__class__" in parms:  # FIXME: hack for PY3
            del parms["__class__"]
        is_auto_encoder = bool(parms.get("autoencoder"))
        is_supervised = not (is_auto_encoder or algo in {"pca", "svd", "kmeans", "glrm", "word2vec"})
        ncols = training_frame.ncols
        names = training_frame.names

        def _column_name(col):
            # Validate a predictor / ignored column reference and return its name.
            if is_type(col, int):
                if not (-ncols <= col < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % col)
                return names[col]
            if col not in names:
                raise H2OValueError("Column %s not in the training frame" % col)
            return col

        # Resolve the response column. After this branch `y` is a valid column name for
        # supervised models and None for unsupervised ones.
        if is_supervised:
            if y is None: y = "response"
            if is_type(y, int):
                if not (-ncols <= y < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % y)
                y = names[y]
            else:
                if y not in names:
                    raise H2OValueError("Column %s does not exist in the training frame" % y)
            self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
        elif y is not None:
            raise H2OValueError("y should not be provided for an unsupervised model")
        assert_is_type(y, str, None)

        # Resolve the columns the caller explicitly asked to leave out.
        ignored_columns_set = set()
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            ignored_columns_set = {_column_name(ic) for ic in ignored_columns}

        # Resolve the predictors into a list of column names.
        if x is None:
            xset = set(names) - {y} - ignored_columns_set
        else:
            if is_type(x, int, str): x = [x]
            xset = {_column_name(xi) for xi in x}
        x = list(xset)

        parms["offset_column"] = offset_column
        parms["fold_column"] = fold_column
        parms["weights_column"] = weights_column
        # Do not clobber a runtime limit configured on the estimator with None; the key
        # is still created when it was absent so the request payload keeps its shape.
        if max_runtime_secs is not None or "max_runtime_secs" not in parms:
            parms["max_runtime_secs"] = max_runtime_secs

        parms["training_frame"] = training_frame
        if validation_frame is not None: parms["validation_frame"] = validation_frame
        if y is not None: parms["response_column"] = y

        # Everything that is neither a predictor, the response, nor one of the special
        # columns is reported as ignored. `x` already holds validated names here (and may
        # legitimately be empty), so no index conversion is needed.
        # NOTE(review): offset/fold/weights given as integer indices are compared against
        # column names as-is and therefore do not protect their column -- confirm intent.
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]
        ignored_columns = list(set(names) - set(x + [y, offset, folds, weights]))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
        parms["interactions"] = (None if "interactions" not in parms or parms["interactions"] is None else
                                 [quoted(col) for col in parms["interactions"]])
        # Replace H2OFrame values with their frame ids before building the request.
        parms = {k: H2OEstimator._keyify_if_h2oframe(v) for k, v in parms.items()}
        rest_ver = parms.pop("_rest_version", 3)

        model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms),
                       job_type=(self.algo + " Model Build"))

        if self._future:
            # Asynchronous mode: remember the job and return without waiting for it.
            self._job = model
            self._rest_version = rest_ver
            return

        model.poll()
        model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
        self._resolve_model(model.dest_key, model_json)