def interaction(data, factors, pairwise, max_factors, min_occurrence, destination_frame=None):
    """
    Categorical Interaction Feature Creation in H2O.

    Builds a frame in H2O holding n-th order interaction features between the categorical
    columns chosen by the caller.

    :param data: the H2OFrame that holds the target categorical columns.
    :param factors: Factor columns, given either as integer indices or as column names.
      Integer entries are translated to names through ``data.names()``.
    :param pairwise: Whether to create pairwise interactions between factors (otherwise create
      one higher-order interaction). Only applicable if there are 3 or more factors.
    :param max_factors: Max. number of factor levels in pair-wise interaction terms (if enforced,
      one extra catch-all factor will be made).
    :param min_occurrence: Min. occurrence threshold for factor levels in pair-wise interaction
      terms.
    :param destination_frame: A string indicating the destination key. When None, a key obtained
      from ``_py_tmp_key()`` is used instead.
    :return: H2OFrame looked up with ``get_frame`` under the destination key.
    """
    # NOTE(review): presumably forces any pending computation on the frame so that
    # data._id refers to materialized data -- confirm against H2OFrame._eager.
    data._eager()

    # Normalize the column selection: integer positions become column names,
    # everything else is passed through untouched.
    column_names = []
    for col in factors:
        if isinstance(col, int):
            col = data.names()[col]
        column_names.append(col)

    if destination_frame is None:
        dest_key = _py_tmp_key()
    else:
        dest_key = destination_frame

    parms = {
        "dest": dest_key,
        "source_frame": data._id,
        "factor_columns": [_quoted(name) for name in column_names],
        "pairwise": pairwise,
        "max_factors": max_factors,
        "min_occurrence": min_occurrence,
    }

    # Submit the request, poll the resulting job, then fetch the new frame by key.
    job = H2OJob(H2OConnection.post_json("Interaction", **parms), "Interactions")
    job.poll()
    return get_frame(dest_key)
def _do_it(self, top):
    """
    Produce the string that stands for this node inside an expression.

    Resolution order:
      1. If the cache is not empty, return ``str`` of the cached data when the cache is
         scalar, otherwise the cached id.
      2. If the cache already carries an id, return that id.
      3. Otherwise build ``"(op arg arg ...)"`` from ``self._op`` and the children, each
         child rendered by ``ExprNode._arg_to_expr``.

    In case 3, when ``top`` is truthy or the number of gc referrers of this node reaches
    ``ExprNode.MAGIC_REF_COUNT``, a fresh id from ``frame._py_tmp_key()`` is stored in
    ``self._cache._id`` and the expression is wrapped as ``"(tmp= id expr)"``.

    :param top: truthy to always assign a temporary id to this node.
    :return: a string: cached scalar text, an id, or an expression.
    """
    if not self._cache.is_empty():
        # Data already computed and cached. Emptiness is asked of the cache itself,
        # so a "false-like" cached value still counts as cached.
        if self._cache.is_scalar():
            return str(self._cache._data)
        return self._cache._id

    if self._cache._id is not None:
        # Data already computed under an id, but not cached locally.
        return self._cache._id

    # NOTE(review): an isinstance(self._children, tuple) assertion was disabled here.
    # Keep this as a list comprehension evaluated in place: each child's own _do_it
    # inspects gc referrers, so changing what holds references during this step could
    # change their counts.
    expr = "({} {})".format(
        self._op,
        " ".join([ExprNode._arg_to_expr(ast) for ast in self._children]),
    )

    # The count includes every gc-tracked object referring to self; do not introduce
    # extra references to self above this line.
    n_referrers = len(gc.get_referrers(self))
    if top or n_referrers >= ExprNode.MAGIC_REF_COUNT:
        self._cache._id = frame._py_tmp_key()
        expr = "(tmp= {} {})".format(self._cache._id, expr)
    return expr
def upload_file(path, destination_frame=""):
    """
    Upload a dataset at the path given from the local machine to the H2O cluster.

    :param path: A path specifying the location of the data to upload. It is converted to an
      absolute path with ``os.path.abspath`` before being sent.
    :param destination_frame: The name of the H2O Frame in the H2O Cluster. When this is the
      empty string or None, a key obtained from ``_py_tmp_key()`` is used instead.
    :return: A new H2OFrame constructed with ``raw_id`` set to the destination key.
    """
    fui = {"file": os.path.abspath(path)}

    # Accept None as well as "" for "no name given": other frame-creating helpers in this
    # module use None as that sentinel, and forwarding None would post a null key and then
    # build an H2OFrame around raw_id=None.
    if destination_frame is None or destination_frame == "":
        destination_frame = _py_tmp_key()

    H2OConnection.post_json(url_suffix="PostFile", file_upload_info=fui,
                            destination_frame=destination_frame)
    return H2OFrame(raw_id=destination_frame)
def _do_it(self, top):
    """
    Produce the string that stands for this node inside an expression.

    Resolution order:
      1. If ``self._data`` is not None, return ``self._id`` when the data is a dict,
         otherwise ``str(self._data)``.
      2. If ``self._id`` is truthy, return it.
      3. Otherwise build ``"(op arg arg ...)"`` from ``self._op`` and ``self._ast``, each
         entry rendered by ``ExprNode._arg_to_expr``.

    In case 3, when ``top`` is truthy or the number of gc referrers of this node reaches
    ``ExprNode.MAGIC_REF_COUNT``, a fresh id from ``frame._py_tmp_key()`` is stored in
    ``self._id`` and the expression is wrapped as ``"(tmp= id expr)"``.

    :param top: truthy to always assign a temporary id to this node.
    :return: a string: cached scalar text, an id, or an expression.
    """
    if self._data is not None:
        # Data already computed and cached. The test is against None on purpose so that
        # a "false-like" cached value is still treated as cached.
        if isinstance(self._data, dict):
            return self._id
        return str(self._data)

    if self._id:
        # Data already computed under an id, but not cached locally.
        return self._id

    # From here on self._id is either None or "".
    # Build the eval expression.
    assert isinstance(self._ast, tuple)

    # Keep this as a list comprehension evaluated in place: each child's own _do_it
    # inspects gc referrers, so changing what holds references during this step could
    # change their counts.
    opening = "(" + self._op + " "
    expr = opening + " ".join([ExprNode._arg_to_expr(ast) for ast in self._ast]) + ")"

    # The count includes every gc-tracked object referring to self; do not introduce
    # extra references to self above this line.
    n_referrers = len(gc.get_referrers(self))
    if top or n_referrers >= ExprNode.MAGIC_REF_COUNT:
        self._id = frame._py_tmp_key()
        expr = "(tmp= " + self._id + " " + expr + ")"
    return expr
def create_frame(id=None, rows=10000, cols=10, randomize=True, value=0, real_range=100,
                 categorical_fraction=0.2, factors=100, integer_fraction=0.2, integer_range=100,
                 binary_fraction=0.1, binary_ones_fraction=0.02, missing_fraction=0.01,
                 response_factors=2, has_response=False, seed=None):
    """
    Data Frame Creation in H2O.

    Asks H2O to generate a data frame made of real-valued, categorical, integer, and binary
    columns according to the settings below.

    :param id: A string indicating the destination key. When None, a key obtained from
      ``_py_tmp_key()`` is used instead.
    :param rows: The number of rows of data to generate.
    :param cols: The number of columns of data to generate. Excludes the response column if
      has_response == True.
    :param randomize: A logical value indicating whether data values should be randomly
      generated. This must be True if either categorical_fraction or integer_fraction is
      non-zero.
    :param value: If randomize == False, then all real-valued entries will be set to this value.
    :param real_range: The range of randomly generated real values.
    :param categorical_fraction: The fraction of total columns that are categorical.
    :param factors: The number of (unique) factor levels in each categorical column.
    :param integer_fraction: The fraction of total columns that are integer-valued.
    :param integer_range: The range of randomly generated integer values.
    :param binary_fraction: The fraction of total columns that are binary-valued.
    :param binary_ones_fraction: The fraction of values in a binary column that are set to 1.
    :param missing_fraction: The fraction of total entries in the data frame that are set to NA.
    :param response_factors: If has_response == True, then this is the number of factor levels in
      the response column.
    :param has_response: A logical value indicating whether an additional response column should
      be pre-pended to the final H2O data frame. If set to True, the total number of columns
      will be cols+1.
    :param seed: A seed used to generate random values when randomize = True. None is sent to
      H2O as -1.
    :return: the H2OFrame that was created
    """
    if id is None:
        dest_key = _py_tmp_key()
    else:
        dest_key = id

    # The request always carries a seed; -1 stands in for "not specified".
    seed_value = seed if seed is not None else -1

    parms = {
        "dest": dest_key,
        "rows": rows,
        "cols": cols,
        "randomize": randomize,
        "value": value,
        "real_range": real_range,
        "categorical_fraction": categorical_fraction,
        "factors": factors,
        "integer_fraction": integer_fraction,
        "integer_range": integer_range,
        "binary_fraction": binary_fraction,
        "binary_ones_fraction": binary_ones_fraction,
        "missing_fraction": missing_fraction,
        "response_factors": response_factors,
        "has_response": has_response,
        "seed": seed_value,
    }

    # Submit the request, poll the resulting job, then fetch the new frame by key.
    job = H2OJob(H2OConnection.post_json("CreateFrame", **parms), "Create Frame")
    job.poll()
    return get_frame(dest_key)