def interaction(data, factors, pairwise, max_factors, min_occurrence, destination_frame=None):
  """
  Categorical Interaction Feature Creation in H2O.
  Creates a frame in H2O with n-th order interaction features between categorical columns, as
  specified by the user.

  :param data: the H2OFrame that holds the target categorical columns.
  :param factors: factor columns to interact (either indices or column names).
  :param pairwise: whether to create pairwise interactions between factors (otherwise create one
    higher-order interaction). Only applicable if there are 3 or more factors.
  :param max_factors: max. number of factor levels in pair-wise interaction terms (if enforced, one
    extra catch-all factor will be made).
  :param min_occurrence: min. occurrence threshold for factor levels in pair-wise interaction terms.
  :param destination_frame: a string indicating the destination key. If empty, this will be
    auto-generated by H2O.
  :return: H2OFrame
  """
  data._eager()
  # Resolve integer column indices to column names
  factors = [data.names()[n] if isinstance(n, int) else n for n in factors]
  parms = {"dest": _py_tmp_key() if destination_frame is None else destination_frame,
           "source_frame": data._id,
           "factor_columns": [_quoted(f) for f in factors],
           "pairwise": pairwise,
           "max_factors": max_factors,
           "min_occurrence": min_occurrence,
           }
  H2OJob(H2OConnection.post_json("Interaction", **parms), "Interactions").poll()
  return get_frame(parms["dest"])
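# A minimal usage sketch for interaction(), assuming it is exposed on the top-level h2o module and
# that "cats.csv" (a hypothetical local file) contains categorical columns "a", "b", and "c".
def _example_interaction():
  import h2o
  h2o.init()
  frame = h2o.import_file("cats.csv")  # hypothetical input file
  # Request all pairwise second-order interaction columns between the three factors
  pairs = h2o.interaction(frame, factors=["a", "b", "c"], pairwise=True,
                          max_factors=100, min_occurrence=1)
  pairs.show()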
def _model_build(x, y, vx, vy, algo, offsets, weights, fold_column, kwargs):
  # Basic sanity checking
  if x is None:
    raise ValueError("Missing features")
  x = _check_frame(x, y, y)
  vx = _check_frame(vx, vy, y)

  # Handle optional offsets, weights, and fold column
  if offsets is not None:
    x, vx = _check_col(x, vx, kwargs["validation_frame"], offsets)
  if weights is not None:
    x, vx = _check_col(x, vx, kwargs["validation_frame"], weights)
  if fold_column is not None:
    x, vx = _check_col(x, vx, kwargs["validation_frame"], fold_column)

  # Attach training/validation frame ids and the response column
  kwargs['training_frame'] = x.frame_id
  if vx is not None:
    kwargs['validation_frame'] = vx.frame_id
  if y is not None:
    kwargs['response_column'] = y.names[0]

  # Replace any H2OFrame-valued parameters with their frame ids and drop Nones
  kwargs = dict([(k, kwargs[k].frame_id if isinstance(kwargs[k], H2OFrame) else kwargs[k])
                 for k in kwargs if kwargs[k] is not None])

  # Launch the build job; only resolve the model if do_future is False
  do_future = kwargs.pop("do_future") if "do_future" in kwargs else False
  future_model = H2OModelFuture(H2OJob(H2OConnection.post_json("ModelBuilders/" + algo, **kwargs),
                                       job_type=(algo + " Model Build")), x)
  return future_model if do_future else _resolve_model(future_model, **kwargs)
def _model_build(x, y, validation_x, validation_y, algo_url, kwargs):
  # Basic sanity checking
  if algo_url == "autoencoder":
    if "autoencoder" in kwargs.keys():
      if kwargs["autoencoder"]:
        if y:
          raise ValueError("`y` should not be specified for autoencoder, remove `y` input.")
        algo_url = "deeplearning"
  if not x:
    raise ValueError("Missing features")
  x = _check_frame(x, y, y)
  if validation_x:
    validation_x = _check_frame(validation_x, validation_y, y)

  # Send frame descriptions to H2O cluster
  train_key = x.send_frame()
  kwargs['training_frame'] = train_key
  if validation_x is not None:
    valid_key = validation_x.send_frame()
    kwargs['validation_frame'] = valid_key

  if y:
    kwargs['response_column'] = y._name

  kwargs = dict([(k, kwargs[k]) for k in kwargs if kwargs[k] is not None])

  # launch the job and poll
  job = H2OJob(H2OConnection.post_json("ModelBuilders/" + algo_url, **kwargs),
               job_type=(algo_url + " Model Build")).poll()

  model_json = H2OConnection.get_json("Models/" + job.dest_key)["models"][0]
  model_type = model_json["output"]["model_category"]
  if model_type == "Binomial":
    from model.binomial import H2OBinomialModel
    model = H2OBinomialModel(job.dest_key, model_json)
  elif model_type == "Clustering":
    from model.clustering import H2OClusteringModel
    model = H2OClusteringModel(job.dest_key, model_json)
  elif model_type == "Regression":
    from model.regression import H2ORegressionModel
    model = H2ORegressionModel(job.dest_key, model_json)
  elif model_type == "Multinomial":
    from model.multinomial import H2OMultinomialModel
    model = H2OMultinomialModel(job.dest_key, model_json)
  elif model_type == "AutoEncoder":
    from model.autoencoder import H2OAutoEncoderModel
    model = H2OAutoEncoderModel(job.dest_key, model_json)
  else:
    print model_type
    raise NotImplementedError

  # Cleanup
  h2o.remove(train_key)
  if validation_x:
    h2o.remove(valid_key)

  return model
def parse(setup, h2o_name, first_line_is_header=(-1, 0, 1)):
  """
  Trigger a parse; blocking; remove_frame just keeps the Vecs.

  :param setup: The result of calling parse_setup.
  :param h2o_name: The name of the H2O Frame on the back end.
  :param first_line_is_header: -1 means data, 0 means guess, 1 means header.
  :return: A new parsed object
  """
  # Parse parameters (None values provided by setup)
  p = {'destination_frame': h2o_name,
       'parse_type': None,
       'separator': None,
       'single_quotes': None,
       'check_header': None,
       'number_columns': None,
       'chunk_size': None,
       'delete_on_done': True,
       'blocking': True,
       'remove_frame': True
       }
  if isinstance(first_line_is_header, tuple):
    first_line_is_header = setup["check_header"]

  if setup["column_names"]:
    setup["column_names"] = [_quoted(name) for name in setup["column_names"]]
    p["column_names"] = None

  if setup["column_types"]:
    setup["column_types"] = [_quoted(name) for name in setup["column_types"]]
    p["column_types"] = None

  if setup["na_strings"]:
    setup["na_strings"] = [[_quoted(na) for na in col] if col is not None else []
                           for col in setup["na_strings"]]
    p["na_strings"] = None

  # update the parse parameters with the parse_setup values
  p.update({k: v for k, v in setup.iteritems() if k in p})

  p["check_header"] = first_line_is_header

  # Extract only 'name' from each src in the array of srcs
  p['source_frames'] = [_quoted(src['name']) for src in setup['source_frames']]

  # Request blocking parse
  j = H2OJob(H2OConnection.post_json(url_suffix="Parse", **p), "Parse").poll()
  return j.jobs
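# A hedged usage sketch for parse(), assuming the companion helpers h2o.lazy_import and
# h2o.parse_setup from the same module are available in this version; "data.csv" and the
# destination name "data.hex" are hypothetical.
def _example_parse():
  import h2o
  h2o.init()
  raw = h2o.lazy_import("data.csv")  # import without parsing (assumed helper)
  setup = h2o.parse_setup(raw)       # guess parser settings (assumed helper)
  job = h2o.parse(setup, "data.hex", first_line_is_header=1)
  return job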
def export_file(frame, path, force=False):
  """
  Export a given H2OFrame to a path on the machine this python session is currently connected to.
  To view the current session, call h2o.cluster_info().

  :param frame: The Frame to save to disk.
  :param path: The path to the save point on disk.
  :param force: Overwrite any preexisting file with the same path.
  :return: None
  """
  H2OJob(H2OConnection.post_json("Frames/" + frame._id + "/export/" + path + "/overwrite/" +
                                 ("true" if force else "false")), "Export File").poll()
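# A minimal usage sketch for export_file(), assuming it is exposed on the h2o module;
# "iris.csv" and "/tmp/iris_export.csv" are hypothetical input and output paths.
def _example_export_file():
  import h2o
  h2o.init()
  frame = h2o.import_file("iris.csv")  # hypothetical input file
  h2o.export_file(frame, "/tmp/iris_export.csv", force=True)  # overwrite if the file exists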
def create_frame(id=None, rows=10000, cols=10, randomize=True, value=0, real_range=100,
                 categorical_fraction=0.2, factors=100, integer_fraction=0.2, integer_range=100,
                 binary_fraction=0.1, binary_ones_fraction=0.02, missing_fraction=0.01,
                 response_factors=2, has_response=False, seed=None):
  """
  Data Frame Creation in H2O.
  Creates a data frame in H2O with real-valued, categorical, integer, and binary columns specified
  by the user.

  :param id: A string indicating the destination key. If empty, this will be auto-generated by H2O.
  :param rows: The number of rows of data to generate.
  :param cols: The number of columns of data to generate. Excludes the response column if
    has_response == True.
  :param randomize: A logical value indicating whether data values should be randomly generated.
    This must be True if either categorical_fraction or integer_fraction is non-zero.
  :param value: If randomize == False, then all real-valued entries will be set to this value.
  :param real_range: The range of randomly generated real values.
  :param categorical_fraction: The fraction of total columns that are categorical.
  :param factors: The number of (unique) factor levels in each categorical column.
  :param integer_fraction: The fraction of total columns that are integer-valued.
  :param integer_range: The range of randomly generated integer values.
  :param binary_fraction: The fraction of total columns that are binary-valued.
  :param binary_ones_fraction: The fraction of values in a binary column that are set to 1.
  :param missing_fraction: The fraction of total entries in the data frame that are set to NA.
  :param response_factors: If has_response == True, then this is the number of factor levels in the
    response column.
  :param has_response: A logical value indicating whether an additional response column should be
    pre-pended to the final H2O data frame. If set to True, the total number of columns will be
    cols+1.
  :param seed: A seed used to generate random values when randomize == True.
  :return: the H2OFrame that was created
  """
  parms = {"dest": _py_tmp_key() if id is None else id,
           "rows": rows,
           "cols": cols,
           "randomize": randomize,
           "value": value,
           "real_range": real_range,
           "categorical_fraction": categorical_fraction,
           "factors": factors,
           "integer_fraction": integer_fraction,
           "integer_range": integer_range,
           "binary_fraction": binary_fraction,
           "binary_ones_fraction": binary_ones_fraction,
           "missing_fraction": missing_fraction,
           "response_factors": response_factors,
           "has_response": has_response,
           "seed": -1 if seed is None else seed,
           }
  H2OJob(H2OConnection.post_json("CreateFrame", **parms), "Create Frame").poll()
  return get_frame(parms["dest"])
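# A minimal usage sketch for create_frame(), assuming it is exposed on the h2o module; the
# parameter values shown are illustrative only.
def _example_create_frame():
  import h2o
  h2o.init()
  frame = h2o.create_frame(rows=1000, cols=5, categorical_fraction=0.4, factors=10,
                           integer_fraction=0.2, binary_fraction=0.0, missing_fraction=0.0,
                           has_response=True, seed=1234)
  frame.describe()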
def _model_build(x, y, validation_x, validation_y, algo_url, kwargs):
  # Basic sanity checking
  if algo_url == "autoencoder":
    if "autoencoder" in kwargs.keys():
      if kwargs["autoencoder"]:
        if y:
          raise ValueError("`y` should not be specified for autoencoder, remove `y` input.")
        algo_url = "deeplearning"
  if not x:
    raise ValueError("Missing features")
  x = _check_frame(x, y, y)
  if validation_x is not None:
    validation_x = _check_frame(validation_x, validation_y, y)

  if "weights_column" in kwargs.keys():
    x, validation_x = _add_col_to_x_and_validation_x(kwargs["weights_column"], x, validation_x, kwargs)
  if "offset_column" in kwargs.keys():
    x, validation_x = _add_col_to_x_and_validation_x(kwargs["offset_column"], x, validation_x, kwargs)
  if "fold_column" in kwargs.keys():
    x, validation_x = _add_col_to_x_and_validation_x(kwargs["fold_column"], x, validation_x, kwargs, xval=True)

  # Send frame descriptions to H2O cluster
  kwargs['training_frame'] = x._id
  if validation_x is not None:
    kwargs['validation_frame'] = validation_x._id

  if y is not None:
    kwargs['response_column'] = y._col_names[0]

  kwargs = dict([(k, kwargs[k]._frame()._id if isinstance(kwargs[k], H2OFrame) else kwargs[k])
                 for k in kwargs if kwargs[k] is not None])

  # launch the job (only resolve the model if do_future is False)
  do_future = "do_future" in kwargs.keys() and kwargs["do_future"]
  if "do_future" in kwargs.keys():
    kwargs.pop("do_future")
  future_model = H2OModelFuture(H2OJob(H2OConnection.post_json("ModelBuilders/" + algo_url, **kwargs),
                                       job_type=(algo_url + " Model Build")), x)
  if do_future:
    return future_model
  else:
    return _resolve_model(future_model, **kwargs)
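# A hedged sketch of how a caller might drive the _model_build variant above; the algorithm name
# "gbm" and the hyper-parameters shown are illustrative, and x/y/validation_x/validation_y are
# assumed to be column selections prepared by a higher-level wrapper in this module.
def _example_model_build(x, y, validation_x, validation_y):
  kwargs = {"ntrees": 50, "max_depth": 5, "do_future": False}
  return _model_build(x, y, validation_x, validation_y, "gbm", kwargs)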
def _model_build(x, y, validation_x, validation_y, algo_url, kwargs):
  # Basic sanity checking
  if not x:
    raise ValueError("Missing features")
  x = _check_frame(x, y, y)
  if validation_x:
    validation_x = _check_frame(validation_x, validation_y, y)

  # Send frame descriptions to H2O cluster
  train_key = x.send_frame()
  kwargs['training_frame'] = train_key
  if validation_x:
    valid_key = validation_x.send_frame()
    kwargs['validation_frame'] = valid_key

  if y:
    kwargs['response_column'] = y._name

  # launch the job and poll
  job = H2OJob(H2OConnection.post_json("ModelBuilders/" + algo_url, **kwargs),
               job_type=(algo_url + " Model Build")).poll()

  model_json = H2OConnection.get_json("Models/" + job.dest_key)["models"][0]
  model_type = model_json["output"]["model_category"]
  if model_type == "Binomial":
    from model.binomial import H2OBinomialModel
    model = H2OBinomialModel(job.dest_key, model_json)
  elif model_type == "Clustering":
    from model.clustering import H2OClusteringModel
    model = H2OClusteringModel(job.dest_key, model_json)
  elif model_type == "Regression":
    from model.regression import H2ORegressionModel
    model = H2ORegressionModel(job.dest_key, model_json)
  else:
    print model_type
    raise NotImplementedError

  # Cleanup
  h2o.remove(train_key)
  if validation_x:
    h2o.remove(valid_key)

  return model
def parse(setup, h2o_name, first_line_is_header=(-1, 0, 1)):
  """
  Trigger a parse; blocking; removeFrame just keeps the Vec keys.

  :param setup: The result of calling parse_setup.
  :param h2o_name: The name of the H2O Frame on the back end.
  :param first_line_is_header: -1 means data, 0 means guess, 1 means header.
  :return: A new parsed object
  """
  if isinstance(first_line_is_header, tuple):
    first_line_is_header = 0

  # Parse parameters (None values provided by setup)
  p = {'delete_on_done': True,
       'blocking': True,
       'removeFrame': True,
       'hex': h2o_name,
       'ncols': None,
       'sep': None,
       'pType': None,
       'singleQuotes': None,
       'checkHeader': None,
       }
  if setup["columnNames"]:
    setup["columnNames"] = [_quoted(name) for name in setup["columnNames"]]
    p["columnNames"] = None

  # update the parse parameters with the parse_setup values
  p.update({k: v for k, v in setup.iteritems() if k in p})

  p["checkHeader"] = first_line_is_header

  # Extract only 'name' from each src in the array of srcs
  p['srcs'] = [_quoted(src['name']) for src in setup['srcs']]

  # Request blocking parse
  j = H2OJob(H2OConnection.post_json(url_suffix="Parse", params=p), "Parse").poll()
  return j.jobs