def _keyify_if_h2oframe(item):
    """
    Swap H2OFrame objects for their frame ids.

    :param item: an arbitrary parameter value.
    :returns: ``item.frame_id`` when ``item`` is an H2OFrame; a new list of quoted entries when ``item``
        is a list made up only of H2OFrames and/or ``None``; otherwise ``item`` itself, untouched.
    """
    if isinstance(item, H2OFrame):
        return item.frame_id
    if not isinstance(item, list):
        return item
    # A single entry that is neither None nor an H2OFrame means the list is passed through as-is.
    for entry in item:
        if entry is not None and not isinstance(entry, H2OFrame):
            return item
    return [quoted(entry if entry is None else entry.frame_id) for entry in item]
def parse_raw(setup, id=None, first_line_is_header=(-1, 0, 1)):
    """Parse a dataset from a (possibly adjusted) parse-setup dictionary.

    Used together with lazy_import and parse_setup, so that the setup can be altered before parsing.

    Parameters
    ----------
      setup : dict
        Result of h2o.parse_setup; it is updated in place with the overrides below.

      id : str, optional
        An id for the frame. "%" and "&" characters in the quoted id are replaced with ".".

      first_line_is_header : int, optional
        -1, 0 or 1, stored as the setup's "check_header" entry. The default tuple means "not specified".

    Returns
    -------
      H2OFrame

    Raises
    ------
      ValueError
        If first_line_is_header is given but is not one of -1, 0, 1.
    """
    if id:
        frame_key = quoted(id)
        for forbidden in ("%", "&"):
            frame_key = frame_key.replace(forbidden, ".")
        setup["destination_frame"] = frame_key

    header_given = first_line_is_header != (-1, 0, 1)
    if header_given:
        if first_line_is_header not in (-1, 0, 1):
            raise ValueError("first_line_is_header should be -1, 0, or 1")
        setup["check_header"] = first_line_is_header

    frame = H2OFrame()
    frame._parse_raw(setup)
    return frame
def _keyify(item):
    """
    Swap Keyed objects for their keys.

    :param item: an arbitrary parameter value.
    :returns: ``item.key`` when ``item`` is Keyed; a list of quoted, keyified entries when ``item`` is a
        list holding at least one Keyed object; otherwise ``item`` itself.
    """
    if isinstance(item, Keyed):
        return item.key
    if isinstance(item, list):
        for element in item:
            if isinstance(element, Keyed):
                # One Keyed member is enough: every member of the list gets keyified and quoted.
                return [quoted(H2OEstimator._keyify(member)) for member in item]
    return item
def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Submit a grid-search build and, unless running in "future" mode, resolve the resulting grid.

    :param x: predictor column(s): a single name/index, or a list/tuple of names or indices.
    :param y: response column name or index (may be None).
    :param tframe: training frame; its ``names`` are used to translate indices into column names.
    :param vframe: validation frame, or None.
    :param kwargs: model parameters; must contain the keys "offset_column", "fold_column" and
        "weights_column". Entries whose value is None are not sent to the backend.
    :raises ValueError: if the grid search produced no model.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None:
        kwargs["validation_frame"] = vframe
    if is_type(y, int):
        y = tframe.names[y]
    if y is not None:
        kwargs['response_column'] = y

    # Always work on a real list: a tuple would break the list concatenation below.
    x = list(x) if is_type(x, list, tuple) else [x]
    # Guard against an empty predictor list before peeking at the first element.
    if x and is_type(x[0], int):
        x = [tframe.names[i] for i in x]

    offset = kwargs["offset_column"]
    folds = kwargs["fold_column"]
    weights = kwargs["weights_column"]
    ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
    kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
    # Drop unset parameters and replace frames by their ids.
    kwargs = {k: (v.frame_id if isinstance(v, H2OFrame) else v)
              for k, v in kwargs.items() if v is not None}
    algo = self.model._compute_algo()  # unique to grid search
    if self.grid_id is not None:
        kwargs["grid_id"] = self.grid_id

    rest_ver = kwargs.pop("_rest_version", None)

    grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))

    if self._future:
        self._job = grid
        return

    grid.poll()

    grid_json = h2o.api("GET /99/Grids/%s" % (grid.dest_key))
    # Collected so that they can be raised if no grid model is returned at all.
    failure_messages = []
    failure_details = grid_json["failure_details"]
    if len(failure_details) > 0:
        print("Errors/Warnings building gridsearch model\n")
        for error_index, error_message in enumerate(failure_details):
            failed_params = grid_json["failed_params"][error_index]
            if isinstance(failed_params, dict):
                for h_name in grid_json['hyper_names']:
                    print("Hyper-parameter: {0}, {1}".format(h_name, failed_params[h_name]))

            if len(grid_json["failure_stack_traces"]) > error_index:
                print("failure_details: {0}\nfailure_stack_traces: "
                      "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))
                failure_messages.append(error_message + '\n')
    failure_messages_stacks = "".join(failure_messages)

    self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

    # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
    # sometimes no model is returned due to bad parameter values provided by the user.
    if len(grid_json['model_ids']) > 0:
        first_model_json = h2o.api("GET /%d/Models/%s" %
                                   (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
        self._resolve_grid(grid.dest_key, grid_json, first_model_json)
    elif failure_messages_stacks:
        raise ValueError(failure_messages_stacks)
    else:
        raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
def fit(self, fr, **fit_params):
    """
    Run the assembly's steps against a frame.

    :param fr: frame to transform; only its ``frame_id`` is sent to the backend.
    :param fit_params: accepted but not used by this method.
    :returns: the H2OFrame fetched under the name the backend reports as the result.
    """
    rest_calls = [step[1].to_rest(step[0]) for step in self.steps]
    # Double quotes inside each step are downgraded to single quotes before the step itself is quoted.
    encoded = ",".join(quoted(call.replace('"', "'")) for call in rest_calls)
    payload = {"steps": "[" + encoded + "]", "frame": fr.frame_id}
    response = h2o.api("POST /99/Assembly", data=payload)
    self.id = response["assembly"]["name"]
    result_name = response["result"]["name"]
    return H2OFrame.get_frame(result_name)
def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Submit a model build and, unless running in "future" mode, wait for it and resolve the model.

    :param x: predictor column(s): a single name/index, or a list/tuple of names or indices.
    :param y: response column name or index (may be None).
    :param tframe: training frame; its ``names`` are used to translate indices into column names.
    :param vframe: validation frame, or None.
    :param kwargs: model parameters; must contain the keys "offset_column", "fold_column" and
        "weights_column". An optional "_rest_version" entry selects the REST API version (default 3).
    """
    kwargs["training_frame"] = tframe
    if vframe is not None:
        kwargs["validation_frame"] = vframe
    if is_type(y, int):
        y = tframe.names[y]
    if y is not None:
        kwargs["response_column"] = y

    # Always work on a real list: a tuple would break the list concatenation below.
    x = list(x) if isinstance(x, (list, tuple)) else [x]
    # Guard against an empty predictor list before peeking at the first element.
    if x and is_type(x[0], int):
        x = [tframe.names[i] for i in x]

    offset = kwargs["offset_column"]
    folds = kwargs["fold_column"]
    weights = kwargs["weights_column"]
    ignored_columns = list(set(tframe.names) - set(x + [y, offset, folds, weights]))
    kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]

    interactions = kwargs.get("interactions")
    kwargs["interactions"] = None if interactions is None else [quoted(col) for col in interactions]

    kwargs = {k: H2OEstimator._keyify_if_h2oframe(v) for k, v in kwargs.items()}
    rest_ver = kwargs.pop("_rest_version", 3)

    model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=kwargs),
                   job_type=(self.algo + " Model Build"))

    if self._future:
        # The caller is expected to poll the stored job later.
        self._job = model
        return

    model.poll()
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def fit(self, fr, **fit_params):
    """
    Apply every step of the assembly to a frame through the Assembly REST endpoint.

    :param fr: frame to transform; only its ``frame_id`` is sent to the backend.
    :param fit_params: accepted but not used by this method.
    :returns: the H2OFrame fetched under the name the backend reports as the result.
    """
    rest_calls = []
    for step in self.steps:
        rest_calls.append(step[1].to_rest(step[0]))
    quoted_calls = []
    for call in rest_calls:
        # Inner double quotes become single quotes so the quoted step stays well formed.
        quoted_calls.append(quoted(call.replace('"', "'")))
    steps_literal = "[" + ",".join(quoted_calls) + "]"
    reply = h2o.api("POST /99/Assembly", data={"steps": steps_literal, "frame": fr.frame_id})
    self.id = reply["assembly"]["name"]
    return H2OFrame.get_frame(reply["result"]["name"])
def fit(self, fr):
    """
    Run the assembly's munging steps against a frame.

    :param fr: H2OFrame to transform.
    :returns: the H2OFrame fetched under the name the backend reports as the result.
    """
    assert_is_type(fr, H2OFrame)
    encoded_steps = []
    for step in self.steps:
        rest_call = step[1].to_rest(step[0])
        # Inner double quotes become single quotes before the whole step is quoted.
        encoded_steps.append(quoted(rest_call.replace('"', "'")))
    payload = {"steps": "[" + ",".join(encoded_steps) + "]", "frame": fr.frame_id}
    response = h2o.api("POST /99/Assembly", data=payload)
    self.id = response["assembly"]["name"]
    return H2OFrame.get_frame(response["result"]["name"])
def fit(self, fr):
    """
    Apply the munging operations listed in ``steps`` to the frame ``fr``.

    :param fr: H2OFrame the munging operations are applied to.
    :return: H2OFrame produced once the munging operations have completed.
    """
    assert_is_type(fr, H2OFrame)
    pieces = []
    for step in self.steps:
        as_rest = step[1].to_rest(step[0])
        pieces.append(quoted(as_rest.replace('"', "'")))
    steps_literal = "[" + ",".join(pieces) + "]"
    request = {"steps": steps_literal, "frame": fr.frame_id}
    reply = h2o.api("POST /99/Assembly", data=request)
    # Remember the assembly name the backend assigned.
    self.id = reply["assembly"]["name"]
    munged_name = reply["result"]["name"]
    return H2OFrame.get_frame(munged_name)
def fit(self, fr):
    """
    Execute the assembly's munging steps on the frame ``fr``.

    :param fr: H2OFrame the munging operations are applied to.
    :return: H2OFrame produced once the munging operations have completed.
    """
    assert_is_type(fr, H2OFrame)

    def _encode(step):
        # step[0] is handed to the to_rest() method of step[1].
        return quoted(step[1].to_rest(step[0]).replace('"', "'"))

    joined = ",".join(map(_encode, self.steps))
    body = {"steps": "[" + joined + "]", "frame": fr.frame_id}
    answer = h2o.api("POST /99/Assembly", data=body)
    self.id = answer["assembly"]["name"]
    return H2OFrame.get_frame(answer["result"]["name"])
def interaction(data, factors, pairwise, max_factors, min_occurrence, destination_frame=None):
    """
    Categorical Interaction Feature Creation in H2O.

    Creates a frame in H2O with n-th order interaction features between categorical columns, as specified
    by the user.

    Parameters
    ----------
      data : H2OFrame
        The H2OFrame that holds the target categorical columns.

      factors : list
        Factor columns (either indices or column names). Indices are translated through data.names.

      pairwise : bool
        Whether to create pairwise interactions between factors (otherwise create one higher-order
        interaction). Only applicable if there are 3 or more factors.

      max_factors : int
        Max. number of factor levels in pair-wise interaction terms (if enforced, one extra catch-all
        factor will be made).

      min_occurrence : int
        Min. occurrence threshold for factor levels in pair-wise interaction terms.

      destination_frame : str
        A string indicating the destination key. If None, a temporary key is generated.

    Returns
    -------
      H2OFrame
    """
    factor_names = []
    for factor in factors:
        factor_names.append(data.names[factor] if is_int(factor) else factor)

    if destination_frame is None:
        dest_key = py_tmp_key(append=h2oconn.session_id)
    else:
        dest_key = destination_frame

    parms = {
        "dest": dest_key,
        "source_frame": data.frame_id,
        "factor_columns": [quoted(name) for name in factor_names],
        "pairwise": pairwise,
        "max_factors": max_factors,
        "min_occurrence": min_occurrence,
    }
    job = H2OJob(api("POST /3/Interaction", data=parms), "Interactions")
    job.poll()
    return get_frame(parms["dest"])
def fit(self, fr):
    """
    Apply the munging operations listed in ``steps`` to the frame ``fr``.

    :param fr: H2OFrame the munging operations are applied to.
    :return: H2OFrame produced once the munging operations have completed.

    :examples:

    >>> iris = h2o.load_dataset("iris")
    >>> assembly = H2OAssembly(steps=[("col_select",
    ...                                H2OColSelect(["Sepal.Length",
    ...                                              "Petal.Length", "Species"])),
    ...                               ("cos_Sepal.Length",
    ...                                H2OColOp(op=H2OFrame.cos,
    ...                                         col="Sepal.Length",
    ...                                         inplace=True)),
    ...                               ("str_cnt_Species",
    ...                                H2OColOp(op=H2OFrame.countmatches,
    ...                                         col="Species",
    ...                                         inplace=False,
    ...                                         pattern="s"))])
    >>> fit = assembly.fit(iris)
    >>> fit
    """
    assert_is_type(fr, H2OFrame)
    serialized = []
    for step in self.steps:
        rest_text = step[1].to_rest(step[0])
        # Inner double quotes become single quotes before the whole step is quoted.
        rest_text = rest_text.replace('"', "'")
        serialized.append(quoted(rest_text))
    steps_arg = "[" + ",".join(serialized) + "]"
    result = h2o.api("POST /99/Assembly", data={"steps": steps_arg, "frame": fr.frame_id})
    self.id = result["assembly"]["name"]
    return H2OFrame.get_frame(result["result"]["name"])
def extend_parms(parms):
    """
    Hook that adjusts the training parameters in place before they are sent.

    Adds the blending frame (taken from the enclosing scope) when one was given and makes sure the
    metalearner fold column is not listed among the ignored columns.

    :param parms: dict of training parameters; modified in place.
    """
    if blending_frame is not None:
        parms['blending_frame'] = blending_frame
    if self.metalearner_fold_column is not None:
        fold_column = quoted(self.metalearner_fold_column)
        ignored = parms.get('ignored_columns')
        # ``ignored_columns`` may be None or may simply not contain the fold column; in both cases
        # there is nothing to un-ignore, so do not let ``list.remove`` blow up.
        if ignored and fold_column in ignored:
            ignored.remove(fold_column)
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
          model_id=None):
    """
    Train the H2O model.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
    :param ignored_columns: Names or indices of columns to leave out of the predictors. Cannot be given
        together with ``x``.
    :param str model_id: When given, overrides the ``model_id`` entry of the estimator's parameters.
    :raises H2OValueError: if a referenced column does not exist in the training frame, if ``y`` is given
        for an unsupervised model, or if both ``x`` and ``ignored_columns`` are given.
    """
    assert_is_type(training_frame, H2OFrame)
    assert_is_type(validation_frame, None, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    algo = self.algo
    parms = self._parms.copy()
    if "__class__" in parms:  # FIXME: hack for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not(is_auto_encoder or algo in {"pca", "svd", "kmeans", "glrm", "word2vec"})
    ncols = training_frame.ncols
    names = training_frame.names

    # Step 1: resolve the response and predictor columns to validated column names.
    if is_supervised:
        if y is None:
            y = "response"
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
    elif y is not None:
        raise H2OValueError("y should not be provided for an unsupervised model")
    assert_is_type(y, str, None)

    ignored_columns_set = set()
    if ignored_columns is not None:
        if x is not None:
            raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
        for ic in ignored_columns:
            if is_type(ic, int):
                if not (-ncols <= ic < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % ic)
                ignored_columns_set.add(names[ic])
            else:
                if ic not in names:
                    raise H2OValueError("Column %s not in the training frame" % ic)
                ignored_columns_set.add(ic)
    if x is None:
        xset = set(names) - {y} - ignored_columns_set
    else:
        xset = set()
        if is_type(x, int, str):
            x = [x]
        for xi in x:
            if is_type(xi, int):
                if not (-ncols <= xi < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % xi)
                xset.add(names[xi])
            else:
                if xi not in names:
                    raise H2OValueError("Column %s not in the training frame" % xi)
                xset.add(xi)
    x = list(xset)

    parms["offset_column"] = offset_column
    parms["fold_column"] = fold_column
    parms["weights_column"] = weights_column
    parms["max_runtime_secs"] = max_runtime_secs
    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    # Step 2
    is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
    is_unsupervised = is_auto_encoder or self.algo in {"pca", "svd", "kmeans", "glrm", "word2vec"}
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if not is_unsupervised and y is None:
        raise ValueError("Missing response")

    # Step 3
    parms["training_frame"] = training_frame
    if validation_frame is not None:
        parms["validation_frame"] = validation_frame
    if is_type(y, int):
        y = training_frame.names[y]
    if y is not None:
        parms["response_column"] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    # ``x`` may legitimately be empty here (e.g. ``x=[]``), so check before peeking at x[0].
    if x and is_type(x[0], int):
        x = [training_frame.names[i] for i in x]
    offset = parms["offset_column"]
    folds = parms["fold_column"]
    weights = parms["weights_column"]
    ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
    parms["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
    interactions = parms.get("interactions")
    parms["interactions"] = None if interactions is None else [quoted(col) for col in interactions]
    parms = {k: H2OEstimator._keyify_if_h2oframe(v) for k, v in parms.items()}
    rest_ver = parms.pop("_rest_version", 3)

    model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms),
                   job_type=(self.algo + " Model Build"))

    if self._future:
        # Keep the job and REST version around so the result can be collected later.
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll()
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def parse_setup(raw_frames, destination_frame="", header=(-1, 0, 1), separator="", column_names=None,
                column_types=None, na_strings=None):
    """During parse setup, the H2O cluster will make several guesses about the attributes of the data.

    This method allows a user to perform corrective measures by updating the returning dictionary from this
    method. This dictionary is then fed into `parse_raw` to produce the H2OFrame instance.

    Parameters
    ----------
      raw_frames : str or list
        A single id, or a collection of ids, of the imported file frames.

      destination_frame : str, optional
        The unique hex key assigned to the imported file. If none is given, a key will automatically be
        generated. "%" and "&" characters are replaced with ".".

      header : int, optional
        -1 means the first line is data, 0 means guess, 1 means first line is header.

      separator : str, optional
        The field separator character. Values on each line of the file are separated by this character.
        If separator = "", the parser will automatically detect the separator.

      column_names : list, optional
        A list of column names for the file.

      column_types : list or dict, optional
        A list of types or a dictionary of column names to types to specify whether columns should be
        forced to a certain type upon import parsing. If a list, the types for elements that are None will
        be guessed. The possible types a column may have are:
          "unknown" - this will force the column to be parsed as all NA
          "uuid"    - the values in the column must be true UUID or will be parsed as NA
          "string"  - force the column to be parsed as a string
          "numeric" - force the column to be parsed as numeric. H2O will handle the compression of the
                      numeric data in the optimal manner.
          "enum"    - force the column to be parsed as a categorical column.
          "time"    - force the column to be parsed as a time column. H2O will attempt to parse the
                      following list of date time formats
                      date - "yyyy-MM-dd", "yyyy MM dd", "dd-MMM-yy", "dd MMM yy"
                      time - "HH:mm:ss", "HH:mm:ss:SSS", "HH:mm:ss:SSSnnnnnn", "HH.mm.ss" "HH.mm.ss.SSS",
                             "HH.mm.ss.SSSnnnnnn"
                      Times can also contain "AM" or "PM".

      na_strings : list or dict, optional
        A list of strings, or a list of lists of strings (one list per column), or a dictionary of column
        names to strings which are to be interpreted as missing values.

    Returns
    -------
      A dictionary is returned containing all of the guesses made by the H2O back end.

    Raises
    ------
      ValueError
        If any of the arguments has an invalid value, type or length.
    """
    # The H2O backend only accepts things that are quoted
    if is_str(raw_frames):
        raw_frames = [raw_frames]

    # temporary dictionary just to pass the following information to the parser: header, separator
    kwargs = {}
    # set header
    if header != (-1, 0, 1):
        if header not in (-1, 0, 1):
            raise ValueError("header should be -1, 0, or 1")
        kwargs["check_header"] = header

    # set separator
    if separator:
        if not is_str(separator) or len(separator) != 1:
            raise ValueError("separator should be a single character string; got %r" % separator)
        kwargs["separator"] = ord(separator)

    kwargs["source_frames"] = [quoted(frame_id) for frame_id in raw_frames]

    j = api("POST /3/ParseSetup", data=kwargs)
    if "warnings" in j and j["warnings"]:
        for w in j['warnings']:
            warnings.warn(w)

    # TODO: really should be url encoding...
    if destination_frame:
        j["destination_frame"] = destination_frame.replace("%", ".").replace("&", ".")

    if column_names is not None:
        if not isinstance(column_names, list):
            raise ValueError("col_names should be a list")
        if len(column_names) != len(j["column_types"]):
            raise ValueError("length of col_names should be equal to the number of columns")
        j["column_names"] = column_names

    if column_types is not None:
        guessed_types = j["column_types"]
        if isinstance(column_types, dict):
            # overwrite dictionary to ordered list of column types. if user didn't specify column type for
            # all names, use type provided by backend
            if j["column_names"] is None:  # no colnames discovered! (C1, C2, ...)
                j["column_names"] = gen_header(j["number_columns"])
            if not set(column_types.keys()).issubset(set(j["column_names"])):
                raise ValueError("names specified in col_types is not a subset of the column names")
            column_types = [column_types[name] if name in column_types else guessed_types[idx]
                            for idx, name in enumerate(j["column_names"])]
        elif isinstance(column_types, list):
            if len(column_types) != len(guessed_types):
                raise ValueError("length of col_types should be equal to the number of columns")
            # Falsy entries (e.g. None) fall back to the type guessed by the backend.
            column_types = [given if given else guessed
                            for given, guessed in zip(column_types, guessed_types)]
        else:  # not dictionary or list
            raise ValueError("col_types should be a list of types or a dictionary of column names to types")
        j["column_types"] = column_types

    if na_strings is not None:
        if isinstance(na_strings, dict):
            # overwrite dictionary to ordered list of lists of na_strings
            if not j["column_names"]:
                raise ValueError("column names should be specified")
            if not set(na_strings.keys()).issubset(set(j["column_names"])):
                raise ValueError("names specified in na_strings is not a subset of the column names")
            j["na_strings"] = [[] for _ in range(len(j["column_names"]))]
            for name, na in na_strings.items():
                idx = j["column_names"].index(name)
                if is_str(na):
                    na = [na]
                j["na_strings"][idx].extend(quoted(n) for n in na)
        elif is_list_of_lists(na_strings):
            if len(na_strings) != len(j["column_types"]):
                raise ValueError("length of na_strings should be equal to the number of columns")
            j["na_strings"] = [[quoted(na) for na in col] if col is not None else [] for col in na_strings]
        elif isinstance(na_strings, list):
            # Give every column its own list rather than n references to one shared list.
            quoted_nas = [quoted(na) for na in na_strings]
            j["na_strings"] = [list(quoted_nas) for _ in range(len(j["column_types"]))]
        else:  # not a dictionary or list
            raise ValueError(
                "na_strings should be a list, a list of lists (one list per column), or a dictionary of column "
                "names to strings which are to be interpreted as missing values")

    # quote column names and column types also when not specified by user
    if j["column_names"]:
        j["column_names"] = list(map(quoted, j["column_names"]))
    j["column_types"] = list(map(quoted, j["column_types"]))
    return j
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
          model_id=None, verbose=False):
    """
    Train the H2O model.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
    :param ignored_columns: Names or indices of columns to leave out of the predictors. Cannot be given
        together with ``x``.
    :param str model_id: When given, overrides the ``model_id`` entry of the estimator's parameters.
    :param bool verbose: Print scoring history to stdout. Defaults to False.
    :raises H2OValueError: if a required training frame is missing, if ``verbose`` is set for an
        unsupported algorithm, if a referenced column does not exist, or if both ``x`` and
        ``ignored_columns`` are given.
    """
    assert_is_type(training_frame, None, H2OFrame)
    assert_is_type(validation_frame, None, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)

    if self._requires_training_frame() and training_frame is None:
        # Interpolate the algorithm name into the message instead of passing it as a second argument.
        raise H2OValueError("Training frame required for %s algorithm, but none was given." % self.algo)

    no_training_frame = training_frame is None
    if no_training_frame:
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

    algo = self.algo
    if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
        raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
    parms = self._parms.copy()
    if "__class__" in parms:  # FIXME: hack for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"})
    if not no_training_frame:
        names = training_frame.names
        ncols = training_frame.ncols

    if is_supervised:
        if y is None:
            y = "response"
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
    else:
        # If `y` is provided for an unsupervised model we'll simply ignore
        # it. This way an unsupervised model can be used as a step in
        # sklearn's pipeline.
        y = None

    if not no_training_frame:
        assert_is_type(y, str, None)
        ignored_columns_set = set()
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s not in the training frame" % ic)
                    ignored_columns_set.add(ic)
        if x is None:
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str):
                x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)

    parms["offset_column"] = offset_column
    parms["fold_column"] = fold_column
    parms["weights_column"] = weights_column
    parms["max_runtime_secs"] = max_runtime_secs
    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    # Step 2
    is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
    is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"}
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if not is_unsupervised and y is None:
        raise ValueError("Missing response")

    # Step 3
    if not no_training_frame:
        parms["training_frame"] = training_frame
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]

    if validation_frame is not None:
        parms["validation_frame"] = validation_frame
    if is_type(y, int):
        y = training_frame.names[y]
    if y is not None:
        parms["response_column"] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    # ``x`` may legitimately be empty here (e.g. ``x=[]``), so check before peeking at x[0].
    if x and is_type(x[0], int):
        x = [training_frame.names[i] for i in x]
    if not no_training_frame:
        ignored_columns = list(set(training_frame.names) - set(x + [y, offset, folds, weights]))
        parms["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
    interactions = parms.get("interactions")
    parms["interactions"] = None if interactions is None else [quoted(col) for col in interactions]
    interaction_pairs = parms.get("interaction_pairs")
    parms["interaction_pairs"] = (None if interaction_pairs is None
                                  else [tuple(map(quoted, ip)) for ip in interaction_pairs])

    parms = {k: H2OEstimator._keyify_if_h2oframe(v) for k, v in parms.items()}
    rest_ver = parms.pop("_rest_version", 3)

    model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
    model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

    if self._future:
        # Keep the job and REST version around so the result can be collected later.
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll(verbose_model_scoring_history=verbose)
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def _train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
           weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
           model_id=None, verbose=False, extend_parms_fn=None):
    """
    Validate the training arguments, submit the model build and, unless in "future" mode, resolve it.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column.
    :param training_frame: Training frame. When None, ``self.training_frame`` is used if it is set.
    :param offset_column: The name or index of the column that holds the offsets.
    :param fold_column: The name or index of the column that holds the per-row fold assignments.
    :param weights_column: The name or index of the column that holds the per-row weights.
    :param validation_frame: Frame with validation data.
    :param max_runtime_secs: When given, overrides the ``max_runtime_secs`` parameter.
    :param ignored_columns: Names or indices of columns to leave out of the predictors. Cannot be given
        together with ``x``. Falls back to the estimator's own ``ignored_columns`` parameter.
    :param model_id: When given, overrides the ``model_id`` parameter.
    :param bool verbose: Print scoring history while polling; only for drf, gbm, deeplearning, xgboost.
    :param extend_parms_fn: Optional function called with the parameter dict just before submission,
        letting subclasses extend the training parameters.
    :raises H2OValueError: on invalid ``verbose`` usage, unknown columns, conflicting ``x`` /
        ``ignored_columns``, or an "r2" stopping metric.
    :raises ValueError: if the response is missing for a supervised model.
    """
    has_default_training_frame = hasattr(self, 'training_frame') and self.training_frame is not None
    training_frame = H2OFrame._validate(training_frame, 'training_frame',
                                        required=self._requires_training_frame() and not has_default_training_frame)
    validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)
    assert_is_type(extend_parms_fn, None, FunctionType)

    override_default_training_frame = training_frame is not None
    if not override_default_training_frame:
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
        training_frame = self.training_frame if has_default_training_frame else None

    algo = self.algo
    if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
        raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")
    parms = self._parms.copy()
    if algo == "pca" and "k" not in parms:
        parms["k"] = 1
    if "__class__" in parms:  # FIXME: hack for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not(is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec",
                                                    "isolationforest", "generic"})

    names = training_frame.names if training_frame is not None else []
    ncols = training_frame.ncols if training_frame is not None else 0
    types = training_frame.types if training_frame is not None else {}

    if is_supervised:
        if y is None:
            y = "response"
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        else:
            if y not in names:
                raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if types[y] == "enum" else "regressor"
    else:
        # If `y` is provided for an unsupervised model we'll simply ignore
        # it. This way an unsupervised model can be used as a step in
        # sklearn's pipeline.
        y = None

    if override_default_training_frame:
        assert_is_type(y, str, None)
        ignored_columns_set = set()
        if ignored_columns is None and "ignored_columns" in parms:
            ignored_columns = parms['ignored_columns']
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ic in ignored_columns:
                if is_type(ic, int):
                    if not (-ncols <= ic < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % ic)
                    ignored_columns_set.add(names[ic])
                else:
                    if ic not in names:
                        raise H2OValueError("Column %s not in the training frame" % ic)
                    ignored_columns_set.add(ic)
        if x is None:
            xset = set(names) - {y} - ignored_columns_set
        else:
            xset = set()
            if is_type(x, int, str):
                x = [x]
            for xi in x:
                if is_type(xi, int):
                    if not (-ncols <= xi < ncols):
                        raise H2OValueError("Column %d does not exist in the training frame" % xi)
                    xset.add(names[xi])
                else:
                    if xi not in names:
                        raise H2OValueError("Column %s not in the training frame" % xi)
                    xset.add(xi)
        x = list(xset)
        self._check_and_save_parm(parms, "offset_column", offset_column)
        self._check_and_save_parm(parms, "weights_column", weights_column)
        self._check_and_save_parm(parms, "fold_column", fold_column)

    if max_runtime_secs is not None:
        parms["max_runtime_secs"] = max_runtime_secs

    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    # Step 2
    is_auto_encoder = "autoencoder" in parms and parms["autoencoder"]
    is_unsupervised = is_auto_encoder or self.algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec",
                                                       "isolationforest"}
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if not is_unsupervised and y is None and self.algo not in ["generic"]:
        raise ValueError("Missing response")

    # Step 3
    if override_default_training_frame:
        parms["training_frame"] = training_frame
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]

    if validation_frame is not None:
        parms["validation_frame"] = validation_frame
    if is_type(y, int):
        y = names[y]
    if y is not None:
        parms["response_column"] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    # ``x`` may legitimately be empty here (e.g. ``x=[]``), so check before peeking at x[0].
    if x and is_type(x[0], int):
        x = [names[i] for i in x]
    if override_default_training_frame:
        ignored_columns = list(set(names) - set(x + [y, offset, folds, weights] + self._additional_used_columns(parms)))
        parms["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
    interactions = parms.get("interactions")
    parms["interactions"] = None if interactions is None else [quoted(col) for col in interactions]
    interaction_pairs = parms.get("interaction_pairs")
    parms["interaction_pairs"] = (None if interaction_pairs is None
                                  else [tuple(map(quoted, ip)) for ip in interaction_pairs])

    # internal hook allowing subclasses to extend train parms
    if extend_parms_fn is not None:
        extend_parms_fn(parms)

    parms = {k: H2OEstimator._keyify_if_h2oframe(v) for k, v in parms.items()}
    # A stopping_metric that is present but None must not be probed with ``in``.
    stopping_metric = parms.get("stopping_metric")
    if stopping_metric is not None and "r2" in stopping_metric:
        raise H2OValueError("r2 cannot be used as an early stopping_metric yet. Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
    rest_ver = parms.pop("_rest_version", 3)

    model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
    model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

    if self._future:
        # Keep the job and REST version around so the result can be collected later.
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll(poll_updates=self._print_model_scoring_history if verbose else None)
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
          model_id=None, verbose=False):
    """
    Train the H2O model.

    The call returns nothing. When the estimator was created in "future" mode the build job is stored
    on the estimator and the method returns without waiting; otherwise it waits for the job to finish
    and resolves the resulting model on this estimator.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column. For supervised algorithms it
        defaults to the column named "response"; for unsupervised algorithms it is ignored.
    :param H2OFrame training_frame: The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    :param offset_column: The name or index of the column in training_frame that holds the offsets.
    :param fold_column: The name or index of the column in training_frame that holds the per-row fold
        assignments.
    :param weights_column: The name or index of the column in training_frame that holds the per-row weights.
    :param validation_frame: H2OFrame with validation data to be scored on while training.
    :param float max_runtime_secs: Maximum allowed runtime in seconds for model training. Use 0 to disable.
    :param ignored_columns: A list of column names or indices to leave out of the predictors. Cannot be
        specified together with ``x``.
    :param str model_id: When given, overrides the ``model_id`` parameter of the estimator.
    :param bool verbose: Print scoring history to stdout. Defaults to False.

    :raises H2OValueError: if a required training frame is missing, a referenced column does not exist,
        ``x`` and ``ignored_columns`` are both given, ``verbose`` is used with an unsupported algorithm,
        or "r2" is requested as the stopping metric.
    """
    assert_is_type(training_frame, None, H2OFrame)
    assert_is_type(validation_frame, None, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)

    if self._requires_training_frame() and training_frame is None:
        # The algorithm name must be interpolated into the message, not passed as a second argument.
        raise H2OValueError("Training frame required for %s algorithm, but none was given." % self.algo)

    has_training_frame = training_frame is not None
    if not has_training_frame:
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

    algo = self.algo
    if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
        raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")

    parms = self._parms.copy()
    if "__class__" in parms:  # FIXME: hack for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not (is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec"})

    # Without a training frame there are no columns to resolve against; empty defaults make the
    # column checks below fail with a clear error instead of tripping over unbound locals.
    names = training_frame.names if has_training_frame else []
    ncols = training_frame.ncols if has_training_frame else 0

    if is_supervised:
        if y is None:
            y = "response"
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        elif y not in names:
            raise H2OValueError("Column %s does not exist in the training frame" % y)
        # Reaching this line implies `y` is one of the frame's columns, hence the frame exists.
        self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
    else:
        # If `y` is provided for an unsupervised model we'll simply ignore
        # it. This way an unsupervised model can be used as a step in
        # sklearn's pipeline.
        y = None

    if has_training_frame:
        assert_is_type(y, str, None)

        def _column_name(col):
            # Translate a column index or name into a validated column name of the training frame.
            if is_type(col, int):
                if not (-ncols <= col < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % col)
                return names[col]
            if col not in names:
                raise H2OValueError("Column %s not in the training frame" % col)
            return col

        ignored_columns_set = set()
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            ignored_columns_set = {_column_name(ic) for ic in ignored_columns}
        if x is None:
            xset = set(names) - {y} - ignored_columns_set
        else:
            if is_type(x, int, str):
                x = [x]
            xset = {_column_name(xi) for xi in x}
        x = list(xset)

    parms["offset_column"] = offset_column
    parms["fold_column"] = fold_column
    parms["weights_column"] = weights_column

    if max_runtime_secs is not None:
        parms["max_runtime_secs"] = max_runtime_secs

    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    # Defensive re-checks: `y` was already normalised above, so these are not expected to trigger.
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if is_supervised and y is None:
        raise ValueError("Missing response")

    if has_training_frame:
        parms["training_frame"] = training_frame
    if validation_frame is not None:
        parms["validation_frame"] = validation_frame
    if y is not None:
        parms["response_column"] = y
    if has_training_frame:
        # `x` is a (possibly empty) list of column names here, so it is never indexed blindly.
        ignored_columns = list(set(names) - set(x + [y, offset_column, fold_column, weights_column]))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]

    interactions = parms.get("interactions")
    parms["interactions"] = None if interactions is None else [quoted(col) for col in interactions]
    interaction_pairs = parms.get("interaction_pairs")
    parms["interaction_pairs"] = (None if interaction_pairs is None else
                                  [tuple(map(quoted, ip)) for ip in interaction_pairs])

    parms = {k: H2OEstimator._keyify_if_h2oframe(v) for k, v in parms.items()}
    # `stopping_metric` may be present but None; treat that the same as "not set".
    if "r2" in (parms.get("stopping_metric") or []):
        raise H2OValueError("r2 cannot be used as an early stopping_metric yet. Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
    rest_ver = parms.pop("_rest_version", 3)

    model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
    model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

    if self._future:
        # Asynchronous mode: remember the job and hand control back to the caller.
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll(verbose_model_scoring_history=verbose)
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def _train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
           weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
           model_id=None, verbose=False, extend_parms_fn=None):
    """
    Validate the training arguments, assemble the model-builder parameters and launch the build.

    The call returns nothing. When the estimator was created in "future" mode the build job is stored
    on the estimator and the method returns without waiting; otherwise it waits for the job to finish
    and resolves the resulting model on this estimator.

    :param x: A list of column names or indices indicating the predictor columns.
    :param y: An index or a column name indicating the response column. For supervised algorithms it
        defaults to the column named "response"; for unsupervised algorithms it is ignored.
    :param H2OFrame training_frame: The frame holding the columns referenced by the other arguments.
    :param offset_column: The name or index of the offset column.
    :param fold_column: The name or index of the fold-assignment column.
    :param weights_column: The name or index of the weights column.
    :param validation_frame: H2OFrame with validation data.
    :param float max_runtime_secs: When given, overrides the estimator's ``max_runtime_secs`` parameter.
    :param ignored_columns: Column names or indices to leave out of the predictors; falls back to the
        estimator's ``ignored_columns`` parameter. Cannot be specified together with ``x``.
    :param str model_id: When given, overrides the ``model_id`` parameter of the estimator.
    :param bool verbose: Passed to the job poller as ``verbose_model_scoring_history``.
    :param extend_parms_fn: Optional function called with the parameter dict right before it is sent,
        allowing subclasses to extend the train parameters.

    :raises H2OValueError: if a required training frame is missing, a referenced column does not exist,
        ``x`` and ``ignored_columns`` are both given, ``verbose`` is used with an unsupported algorithm,
        or "r2" is requested as the stopping metric.
    """
    assert_is_type(training_frame, None, H2OFrame)
    assert_is_type(validation_frame, None, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)
    assert_is_type(extend_parms_fn, None, FunctionType)

    if self._requires_training_frame() and training_frame is None:
        raise H2OValueError("Training frame required for %s algorithm, but none was given." % self.algo)

    has_training_frame = training_frame is not None
    if not has_training_frame:
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)

    algo = self.algo
    if verbose and algo not in ["drf", "gbm", "deeplearning", "xgboost"]:
        raise H2OValueError("Verbose should only be set to True for drf, gbm, deeplearning, and xgboost models")

    parms = self._parms.copy()
    if "__class__" in parms:  # FIXME: hack for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not (is_auto_encoder or algo in {"aggregator", "pca", "svd", "kmeans", "glrm", "word2vec",
                                                      "isolationforest", "generic"})

    # Without a training frame there are no columns to resolve against; empty defaults make the
    # column checks below fail with a clear error instead of tripping over unbound locals.
    names = training_frame.names if has_training_frame else []
    ncols = training_frame.ncols if has_training_frame else 0

    if is_supervised:
        if y is None:
            y = "response"
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        elif y not in names:
            raise H2OValueError("Column %s does not exist in the training frame" % y)
        # Reaching this line implies `y` is one of the frame's columns, hence the frame exists.
        self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
    else:
        # If `y` is provided for an unsupervised model we'll simply ignore
        # it. This way an unsupervised model can be used as a step in
        # sklearn's pipeline.
        y = None

    if has_training_frame:
        assert_is_type(y, str, None)

        def _column_name(col):
            # Translate a column index or name into a validated column name of the training frame.
            if is_type(col, int):
                if not (-ncols <= col < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % col)
                return names[col]
            if col not in names:
                raise H2OValueError("Column %s not in the training frame" % col)
            return col

        # An explicit argument wins; otherwise fall back to the estimator-level setting.
        if ignored_columns is None and "ignored_columns" in parms:
            ignored_columns = parms['ignored_columns']
        ignored_columns_set = set()
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            ignored_columns_set = {_column_name(ic) for ic in ignored_columns}
        if x is None:
            xset = set(names) - {y} - ignored_columns_set
        else:
            if is_type(x, int, str):
                x = [x]
            xset = {_column_name(xi) for xi in x}
        x = list(xset)

    self._check_and_save_parm(parms, "offset_column", offset_column)
    self._check_and_save_parm(parms, "weights_column", weights_column)
    self._check_and_save_parm(parms, "fold_column", fold_column)

    if max_runtime_secs is not None:
        parms["max_runtime_secs"] = max_runtime_secs

    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    # Defensive re-checks: `y` was already normalised above, so these are not expected to trigger.
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if is_supervised and y is None:
        raise ValueError("Missing response")

    if has_training_frame:
        parms["training_frame"] = training_frame
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]
    if validation_frame is not None:
        parms["validation_frame"] = validation_frame
    if y is not None:
        parms["response_column"] = y
    if has_training_frame:
        # `x` is a (possibly empty) list of column names here, so it is never indexed blindly.
        ignored_columns = list(set(names) - set(x + [y, offset, folds, weights]))
        parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]

    interactions = parms.get("interactions")
    parms["interactions"] = None if interactions is None else [quoted(col) for col in interactions]
    interaction_pairs = parms.get("interaction_pairs")
    parms["interaction_pairs"] = (None if interaction_pairs is None else
                                  [tuple(map(quoted, ip)) for ip in interaction_pairs])

    # internal hook allowing subclasses to extend train parms
    if extend_parms_fn is not None:
        extend_parms_fn(parms)

    parms = {k: H2OEstimator._keyify_if_h2oframe(v) for k, v in parms.items()}
    # `stopping_metric` may be present but None; treat that the same as "not set".
    if "r2" in (parms.get("stopping_metric") or []):
        raise H2OValueError("r2 cannot be used as an early stopping_metric yet. Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
    rest_ver = parms.pop("_rest_version", 3)

    model_builder_json = h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms)
    model = H2OJob(model_builder_json, job_type=(self.algo + " Model Build"))

    if self._future:
        # Asynchronous mode: remember the job and hand control back to the caller.
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll(verbose_model_scoring_history=verbose)
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)
def _make_parms(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
                weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None,
                model_id=None, verbose=False, extend_parms_fn=None):
    """
    Validate the training arguments and turn them into the parameter dict for a model build.

    A copy of the estimator's parameters is completed with the frames, the response column, the
    offset/weights/fold columns and the derived list of ignored columns, then every value is passed
    through ``H2OEstimator._keyify``. Nothing is sent to the server here.

    :param x: Predictor columns, as names or indices.
    :param y: Response column, as a name or an index; defaults to "response" for supervised learning
        and is discarded otherwise.
    :param training_frame: Frame to train on; when omitted the estimator's own ``training_frame``
        attribute is used if it is set.
    :param offset_column: Name or index of the offset column.
    :param fold_column: Name or index of the fold-assignment column.
    :param weights_column: Name or index of the weights column.
    :param validation_frame: Frame with validation data.
    :param max_runtime_secs: When given, overrides the estimator's ``max_runtime_secs`` parameter.
    :param ignored_columns: Columns to leave out of the predictors; falls back to the estimator's
        ``ignored_columns`` parameter. Cannot be specified together with ``x``.
    :param model_id: When given, overrides the estimator's ``model_id`` parameter.
    :param verbose: Only accepted when the estimator's options enable verbose mode.
    :param extend_parms_fn: Optional function called with the parameter dict before keyification.
    :returns: the dict of parameters.
    :raises H2OValueError: on unknown columns, on ``x`` combined with ``ignored_columns``, on an
        unsupported ``verbose`` request, or when "r2" is the stopping metric.
    """
    has_default_frame = hasattr(self, 'training_frame') and self.training_frame is not None
    frame_required = self._options_.get('requires_training_frame', True) and not has_default_frame
    training_frame = H2OFrame._validate(training_frame, 'training_frame', required=frame_required)
    validation_frame = H2OFrame._validate(validation_frame, 'validation_frame')
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)
    assert_is_type(model_id, None, str)
    assert_is_type(verbose, bool)
    assert_is_type(extend_parms_fn, None, FunctionType)

    # True when the caller supplied a frame explicitly, as opposed to relying on the default one.
    frame_given = training_frame is not None
    if not frame_given:
        self._verify_training_frame_params(offset_column, fold_column, weights_column, validation_frame)
        training_frame = self.training_frame if has_default_frame else None

    if verbose and not self._options_.get('verbose', False):
        raise H2OValueError("Verbose mode is not available for %s" % self.__class__.__name__)

    parms = self._parms.copy()
    if training_frame is None:
        names, ncols, types = [], 0, {}
    else:
        names = training_frame.names
        ncols = training_frame.ncols
        types = training_frame.types

    if not self.supervised_learning:
        # If `y` is provided for an unsupervised model we'll simply ignore
        # it. This way an unsupervised model can be used as a step in
        # sklearn's pipeline.
        y = None
        self._estimator_type = "unsupervised"
    else:
        if y is None:
            y = "response"
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        elif y not in names:
            raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if types[y] == "enum" else "regressor"

    if frame_given:
        assert_is_type(y, str, None)

        def _as_name(col):
            # Turn a column index or name into a checked column name.
            if is_type(col, int):
                if not (-ncols <= col < ncols):
                    raise H2OValueError("Column %d does not exist in the training frame" % col)
                return names[col]
            if col not in names:
                raise H2OValueError("Column %s not in the training frame" % col)
            return col

        if ignored_columns is None and "ignored_columns" in parms:
            ignored_columns = parms['ignored_columns']
        ignored_names = set()
        if ignored_columns is not None:
            if x is not None:
                raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
            for ignored in ignored_columns:
                ignored_names.add(_as_name(ignored))
        if x is None:
            predictors = set(names) - {y} - ignored_names
        else:
            if is_type(x, int, str):
                x = [x]
            predictors = set()
            for predictor in x:
                predictors.add(_as_name(predictor))
        x = list(predictors)

    for column_parm, column_value in (("offset_column", offset_column),
                                      ("weights_column", weights_column),
                                      ("fold_column", fold_column)):
        self._check_and_save_parm(parms, column_parm, column_value)

    if max_runtime_secs is not None:
        parms["max_runtime_secs"] = max_runtime_secs

    # Overwrites the model_id parameter only if model_id is passed
    if model_id is not None:
        parms["model_id"] = model_id

    if frame_given:
        parms["training_frame"] = training_frame
        offset = parms["offset_column"]
        folds = parms["fold_column"]
        weights = parms["weights_column"]

    if validation_frame is not None:
        parms["validation_frame"] = validation_frame

    if is_type(y, int):
        y = names[y]
    if y is not None:
        parms["response_column"] = y
    if not isinstance(x, (list, tuple)):
        x = [x]
    if x and is_type(x[0], int):
        x = [names[i] for i in x]
    if frame_given:
        used_columns = set(x + [y, offset, folds, weights])
        unused_columns = list(set(names) - used_columns)
        parms["ignored_columns"] = [quoted(col) for col in unused_columns] if unused_columns else None

    interactions = parms.get("interactions")
    parms["interactions"] = None if interactions is None else [quoted(col) for col in interactions]
    interaction_pairs = parms.get("interaction_pairs")
    if interaction_pairs is None:
        parms["interaction_pairs"] = None
    else:
        parms["interaction_pairs"] = [tuple(map(quoted, pair)) for pair in interaction_pairs]

    # internal hook allowing subclasses to extend train parms
    if extend_parms_fn is not None:
        extend_parms_fn(parms)

    keyified = {}
    for key, value in parms.items():
        keyified[key] = H2OEstimator._keyify(value)
    parms = keyified

    stopping_metric = parms.get('stopping_metric')
    if stopping_metric and "r2" in stopping_metric:
        raise H2OValueError("r2 cannot be used as an early stopping_metric yet. Check this JIRA https://0xdata.atlassian.net/browse/PUBDEV-5381 for progress.")
    return parms
def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None,
          weights_column=None, validation_frame=None, max_runtime_secs=None, ignored_columns=None):
    """
    Train the H2O model.

    The call returns nothing. When the estimator was created in "future" mode the build job is stored
    on the estimator and the method returns without waiting; otherwise it waits for the job to finish
    and resolves the resulting model on this estimator.

    Parameters
    ----------
    x : list, None
        A list of column names or indices indicating the predictor columns.
    y :
        An index or a column name indicating the response column. For supervised algorithms it
        defaults to the column named "response"; it must not be given for unsupervised ones.
    training_frame : H2OFrame
        The H2OFrame having the columns indicated by x and y (as well as any
        additional columns specified by fold, offset, and weights).
    offset_column : str, optional
        The name or index of the column in training_frame that holds the offsets.
    fold_column : str, optional
        The name or index of the column in training_frame that holds the per-row fold
        assignments.
    weights_column : str, optional
        The name or index of the column in training_frame that holds the per-row weights.
    validation_frame : H2OFrame, optional
        H2OFrame with validation data to be scored on while training.
    max_runtime_secs : float
        Maximum allowed runtime in seconds for model training. Use 0 to disable.
        When omitted, the value configured on the estimator is left untouched.
    ignored_columns : list, optional
        Column names or indices to leave out of the predictors. Cannot be specified
        together with x.

    Raises
    ------
    H2OValueError
        If a referenced column does not exist in the training frame, if x and ignored_columns
        are both given, or if y is given for an unsupervised model.
    """
    assert_is_type(training_frame, H2OFrame)
    assert_is_type(validation_frame, None, H2OFrame)
    assert_is_type(y, None, int, str)
    assert_is_type(x, None, int, str, [str, int], {str, int})
    assert_is_type(ignored_columns, None, [str, int], {str, int})
    assert_is_type(offset_column, None, int, str)
    assert_is_type(fold_column, None, int, str)
    assert_is_type(weights_column, None, int, str)
    assert_is_type(max_runtime_secs, None, numeric)

    algo = self.algo
    parms = self._parms.copy()
    if "__class__" in parms:  # FIXME: hack for PY3
        del parms["__class__"]
    is_auto_encoder = bool(parms.get("autoencoder"))
    is_supervised = not (is_auto_encoder or algo in {"pca", "svd", "kmeans", "glrm", "word2vec"})
    ncols = training_frame.ncols
    names = training_frame.names

    if is_supervised:
        if y is None:
            y = "response"
        if is_type(y, int):
            if not (-ncols <= y < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % y)
            y = names[y]
        elif y not in names:
            raise H2OValueError("Column %s does not exist in the training frame" % y)
        self._estimator_type = "classifier" if training_frame.types[y] == "enum" else "regressor"
    elif y is not None:
        raise H2OValueError("y should not be provided for an unsupervised model")
    assert_is_type(y, str, None)

    def _column_name(col):
        # Translate a column index or name into a validated column name of the training frame.
        if is_type(col, int):
            if not (-ncols <= col < ncols):
                raise H2OValueError("Column %d does not exist in the training frame" % col)
            return names[col]
        if col not in names:
            raise H2OValueError("Column %s not in the training frame" % col)
        return col

    ignored_columns_set = set()
    if ignored_columns is not None:
        if x is not None:
            raise H2OValueError("Properties x and ignored_columns cannot be specified simultaneously")
        ignored_columns_set = {_column_name(ic) for ic in ignored_columns}
    if x is None:
        xset = set(names) - {y} - ignored_columns_set
    else:
        if is_type(x, int, str):
            x = [x]
        xset = {_column_name(xi) for xi in x}
    x = list(xset)

    parms["offset_column"] = offset_column
    parms["fold_column"] = fold_column
    parms["weights_column"] = weights_column
    # Only override the estimator-level runtime limit when the caller actually supplied one;
    # assigning None here would silently discard a limit configured on the estimator.
    if max_runtime_secs is not None:
        parms["max_runtime_secs"] = max_runtime_secs

    # Defensive re-checks: `y` was already normalised above, so these are not expected to trigger.
    if is_auto_encoder and y is not None:
        raise ValueError("y should not be specified for autoencoder.")
    if is_supervised and y is None:
        raise ValueError("Missing response")

    parms["training_frame"] = training_frame
    if validation_frame is not None:
        parms["validation_frame"] = validation_frame
    if y is not None:
        parms["response_column"] = y
    # `x` is a (possibly empty) list of column names here, so it is never indexed blindly.
    ignored_columns = list(set(names) - set(x + [y, offset_column, fold_column, weights_column]))
    parms["ignored_columns"] = None if ignored_columns == [] else [quoted(col) for col in ignored_columns]
    interactions = parms.get("interactions")
    parms["interactions"] = None if interactions is None else [quoted(col) for col in interactions]

    parms = {k: H2OEstimator._keyify_if_h2oframe(v) for k, v in parms.items()}
    rest_ver = parms.pop("_rest_version", 3)

    model = H2OJob(h2o.api("POST /%d/ModelBuilders/%s" % (rest_ver, self.algo), data=parms),
                   job_type=(self.algo + " Model Build"))

    if self._future:
        # Asynchronous mode: remember the job and hand control back to the caller.
        self._job = model
        self._rest_version = rest_ver
        return

    model.poll()
    model_json = h2o.api("GET /%d/Models/%s" % (rest_ver, model.dest_key))["models"][0]
    self._resolve_model(model.dest_key, model_json)