Esempio n. 1
0
File: h2o.py Progetto: moidin/h2o-3
def interaction(data, factors, pairwise, max_factors, min_occurrence, destination_frame=None):
  """
  Categorical Interaction Feature Creation in H2O.

  Creates a frame in H2O with n-th order interaction features between categorical columns, as specified by
  the user.

  :param data: the H2OFrame that holds the target categorical columns.
  :param factors: Factor columns (either integer column indices or column names).
  :param pairwise: Whether to create pairwise interactions between factors (otherwise create one
    higher-order interaction). Only applicable if there are 3 or more factors.
  :param max_factors: Max. number of factor levels in pair-wise interaction terms (if enforced, one extra
    catch-all factor will be made).
  :param min_occurrence: Min. occurrence threshold for factor levels in pair-wise interaction terms.
  :param destination_frame: A string indicating the destination key. If None or empty, a temporary key is
    generated with _py_tmp_key().
  :return: H2OFrame fetched back from the cluster under the destination key.
  """
  data._eager()
  # Integer entries are column positions; translate them to column names, leave names untouched.
  factors = [data.names()[n] if isinstance(n,int) else n for n in factors]
  # Any falsy key (None or "") counts as "not supplied"; an empty key must never be posted or looked up.
  parms = {"dest": destination_frame if destination_frame else _py_tmp_key(),
           "source_frame": data._id,
           "factor_columns": [_quoted(f) for f in factors],
           "pairwise": pairwise,
           "max_factors": max_factors,
           "min_occurrence": min_occurrence,
           }
  # Start the job and poll it before reading the frame back.
  H2OJob(H2OConnection.post_json("Interaction", **parms), "Interactions").poll()
  return get_frame(parms["dest"])
Esempio n. 2
0
def interaction(data, factors, pairwise, max_factors, min_occurrence, destination_frame=None):
  """
  Categorical Interaction Feature Creation in H2O.

  Creates a frame in H2O with n-th order interaction features between categorical columns, as specified by
  the user.

  :param data: the H2OFrame that holds the target categorical columns.
  :param factors: Factor columns (either integer column indices or column names).
  :param pairwise: Whether to create pairwise interactions between factors (otherwise create one
    higher-order interaction). Only applicable if there are 3 or more factors.
  :param max_factors: Max. number of factor levels in pair-wise interaction terms (if enforced, one extra
    catch-all factor will be made).
  :param min_occurrence: Min. occurrence threshold for factor levels in pair-wise interaction terms.
  :param destination_frame: A string indicating the destination key. If None or empty, a temporary key is
    generated with _py_tmp_key().
  :return: H2OFrame fetched back from the cluster under the destination key.
  """
  data._eager()
  # Integer entries are column positions; translate them to column names, leave names untouched.
  factors = [data.names()[n] if isinstance(n,int) else n for n in factors]
  # Any falsy key (None or "") counts as "not supplied"; an empty key must never be posted or looked up.
  parms = {"dest": destination_frame if destination_frame else _py_tmp_key(),
           "source_frame": data._id,
           "factor_columns": [_quoted(f) for f in factors],
           "pairwise": pairwise,
           "max_factors": max_factors,
           "min_occurrence": min_occurrence,
           }
  # Start the job and poll it before reading the frame back.
  H2OJob(H2OConnection.post_json("Interaction", **parms), "Interactions").poll()
  return get_frame(parms["dest"])
Esempio n. 3
0
 def _do_it(self,top):
   """
   Return the expression string (or an already-known value/ID) that represents this node.

   :param top: truthy when this node should be assigned a temporary ID regardless of its referrer count.
   :return: str(cached data) for a cached scalar, the cached ID when one is already set, otherwise the
     freshly built "(op arg ...)" string, wrapped in "(tmp= <id> ...)" when a new ID is assigned.
   """
   # Cache holds a result: a scalar is inlined as text, anything else is referred to by its ID.
   if not self._cache.is_empty():    # Data already computed and cached; could be a "false-like" cached value
     return str(self._cache._data) if self._cache.is_scalar() else self._cache._id
   if self._cache._id is not None: return self._cache._id  # Data already computed under ID, but not cached
   # assert isinstance(self._children,tuple)
   # Build "(op child1 child2 ...)" from the string form of every child.
   exec_str = "({} {})".format(self._op," ".join([ExprNode._arg_to_expr(ast) for ast in self._children]))
   # Number of objects the garbage collector reports as referring to this node.
   # NOTE(review): presumably a heuristic for "this node is also held elsewhere"; the threshold is
   # sensitive to how this method is called and written -- confirm before restructuring.
   gc_ref_cnt = len(gc.get_referrers(self))
   if top or gc_ref_cnt >= ExprNode.MAGIC_REF_COUNT:
     # Record a fresh temp key on the cache and have the expression assign its result to that key.
     self._cache._id = frame._py_tmp_key()
     exec_str = "(tmp= {} {})".format(self._cache._id, exec_str)
   return exec_str
Esempio n. 4
0
 def _do_it(self,top):
   """
   Return the expression string (or an already-known value/ID) that represents this node.

   :param top: truthy when this node should be assigned a temporary ID regardless of its referrer count.
   :return: str(cached data) for a cached scalar, the cached ID when one is already set, otherwise the
     freshly built "(op arg ...)" string, wrapped in "(tmp= <id> ...)" when a new ID is assigned.
   """
   # Cache holds a result: a scalar is inlined as text, anything else is referred to by its ID.
   if not self._cache.is_empty():    # Data already computed and cached; could be a "false-like" cached value
     return str(self._cache._data) if self._cache.is_scalar() else self._cache._id
   if self._cache._id is not None: return self._cache._id  # Data already computed under ID, but not cached
   # assert isinstance(self._children,tuple)
   # Build "(op child1 child2 ...)" from the string form of every child.
   exec_str = "({} {})".format(self._op," ".join([ExprNode._arg_to_expr(ast) for ast in self._children]))
   # Number of objects the garbage collector reports as referring to this node.
   # NOTE(review): presumably a heuristic for "this node is also held elsewhere"; the threshold is
   # sensitive to how this method is called and written -- confirm before restructuring.
   gc_ref_cnt = len(gc.get_referrers(self))
   if top or gc_ref_cnt >= ExprNode.MAGIC_REF_COUNT:
     # Record a fresh temp key on the cache and have the expression assign its result to that key.
     self._cache._id = frame._py_tmp_key()
     exec_str = "(tmp= {} {})".format(self._cache._id, exec_str)
   return exec_str
Esempio n. 5
0
def upload_file(path, destination_frame=""):
  """
  Upload a dataset at the path given from the local machine to the H2O cluster.

  :param path: A path specifying the location of the data to upload; it is made absolute before posting.
  :param destination_frame: The name of the H2O Frame in the H2O Cluster. If empty or None, a temporary
    key is generated with _py_tmp_key().
  :return: A new H2OFrame wrapping the destination key.
  """
  fui = {"file": os.path.abspath(path)}
  # None is treated like "" so a missing key is never posted nor handed to H2OFrame(raw_id=...).
  if not destination_frame:
    destination_frame = _py_tmp_key()
  H2OConnection.post_json(url_suffix="PostFile", file_upload_info=fui,destination_frame=destination_frame)
  return H2OFrame(raw_id=destination_frame)
Esempio n. 6
0
File: h2o.py Progetto: moidin/h2o-3
def upload_file(path, destination_frame=""):
  """
  Upload a dataset at the path given from the local machine to the H2O cluster.

  :param path: A path specifying the location of the data to upload; it is made absolute before posting.
  :param destination_frame: The name of the H2O Frame in the H2O Cluster. If empty or None, a temporary
    key is generated with _py_tmp_key().
  :return: A new H2OFrame wrapping the destination key.
  """
  fui = {"file": os.path.abspath(path)}
  # None is treated like "" so a missing key is never posted nor handed to H2OFrame(raw_id=...).
  if not destination_frame:
    destination_frame = _py_tmp_key()
  H2OConnection.post_json(url_suffix="PostFile", file_upload_info=fui,destination_frame=destination_frame)
  return H2OFrame(raw_id=destination_frame)
Esempio n. 7
0
 def _do_it(self,top):
   """
   Return the expression string (or an already-known value/ID) that represents this node.

   :param top: truthy when this node should be assigned a temporary ID regardless of its referrer count.
   :return: self._id when the cached data is a dict, str(self._data) for any other cached data, the
     existing ID when one is set, otherwise the freshly built "(op arg ...)" string, wrapped in
     "(tmp= <id> ...)" when a new ID is assigned.
   """
   # "is not None" (not truthiness) so that cached values such as 0 or "" still count as cached.
   if self._data is not None:    # Data already computed and cached; could be a "false-like" cached value
     return self._id if isinstance(self._data,dict) else str(self._data)
   if self._id: return self._id  # Data already computed under ID, but not cached
   # Here self._id is either None or ""
   # Build the eval expression
   assert isinstance(self._ast,tuple)
   exec_str = "("+self._op+" "+" ".join([ExprNode._arg_to_expr(ast) for ast in self._ast])+")"
   # Number of objects the garbage collector reports as referring to this node.
   # NOTE(review): presumably a heuristic for "this node is also held elsewhere"; the threshold is
   # sensitive to how this method is called and written -- confirm before restructuring.
   gc_ref_cnt = len(gc.get_referrers(self))
   #print(gc_ref_cnt,self._op)
   if top or gc_ref_cnt >= ExprNode.MAGIC_REF_COUNT:
     # Record a fresh temp key and have the expression assign its result to that key.
     self._id = frame._py_tmp_key()
     exec_str = "(tmp= "+self._id+" "+exec_str+")"
   return exec_str
Esempio n. 8
0
def create_frame(id = None, rows = 10000, cols = 10, randomize = True, value = 0, real_range = 100,
                 categorical_fraction = 0.2, factors = 100, integer_fraction = 0.2, integer_range = 100,
                 binary_fraction = 0.1, binary_ones_fraction = 0.02, missing_fraction = 0.01, response_factors = 2,
                 has_response = False, seed=None):
  """
  Data Frame Creation in H2O.

  Creates a data frame in H2O with real-valued, categorical, integer, and binary columns specified by the user.

  :param id: A string indicating the destination key. If None or empty, a temporary key is generated with
    _py_tmp_key().
  :param rows: The number of rows of data to generate.
  :param cols: The number of columns of data to generate. Excludes the response column if has_response == True.
  :param randomize: A logical value indicating whether data values should be randomly generated. This must be
    True if either categorical_fraction or integer_fraction is non-zero.
  :param value: If randomize == False, then all real-valued entries will be set to this value.
  :param real_range: The range of randomly generated real values.
  :param categorical_fraction: The fraction of total columns that are categorical.
  :param factors: The number of (unique) factor levels in each categorical column.
  :param integer_fraction: The fraction of total columns that are integer-valued.
  :param integer_range: The range of randomly generated integer values.
  :param binary_fraction: The fraction of total columns that are binary-valued.
  :param binary_ones_fraction: The fraction of values in a binary column that are set to 1.
  :param missing_fraction: The fraction of total entries in the data frame that are set to NA.
  :param response_factors: If has_response == True, then this is the number of factor levels in the response
    column.
  :param has_response: A logical value indicating whether an additional response column should be pre-pended
    to the final H2O data frame. If set to True, the total number of columns will be cols+1.
  :param seed: A seed used to generate random values when randomize == True. None is sent to the server as -1.
  :return: the H2OFrame that was created
  """
  # Any falsy key (None or "") counts as "not supplied"; an empty key must never be posted or looked up.
  parms = {"dest": id if id else _py_tmp_key(),
           "rows": rows,
           "cols": cols,
           "randomize": randomize,
           "value": value,
           "real_range": real_range,
           "categorical_fraction": categorical_fraction,
           "factors": factors,
           "integer_fraction": integer_fraction,
           "integer_range": integer_range,
           "binary_fraction": binary_fraction,
           "binary_ones_fraction": binary_ones_fraction,
           "missing_fraction": missing_fraction,
           "response_factors": response_factors,
           "has_response": has_response,
           "seed": -1 if seed is None else seed,
           }
  # Start the job and poll it before reading the frame back.
  H2OJob(H2OConnection.post_json("CreateFrame", **parms), "Create Frame").poll()
  return get_frame(parms["dest"])
Esempio n. 9
0
File: h2o.py Progetto: moidin/h2o-3
def create_frame(id = None, rows = 10000, cols = 10, randomize = True, value = 0, real_range = 100,
                 categorical_fraction = 0.2, factors = 100, integer_fraction = 0.2, integer_range = 100,
                 binary_fraction = 0.1, binary_ones_fraction = 0.02, missing_fraction = 0.01, response_factors = 2,
                 has_response = False, seed=None):
  """
  Data Frame Creation in H2O.

  Creates a data frame in H2O with real-valued, categorical, integer, and binary columns specified by the user.

  :param id: A string indicating the destination key. If None or empty, a temporary key is generated with
    _py_tmp_key().
  :param rows: The number of rows of data to generate.
  :param cols: The number of columns of data to generate. Excludes the response column if has_response == True.
  :param randomize: A logical value indicating whether data values should be randomly generated. This must be
    True if either categorical_fraction or integer_fraction is non-zero.
  :param value: If randomize == False, then all real-valued entries will be set to this value.
  :param real_range: The range of randomly generated real values.
  :param categorical_fraction: The fraction of total columns that are categorical.
  :param factors: The number of (unique) factor levels in each categorical column.
  :param integer_fraction: The fraction of total columns that are integer-valued.
  :param integer_range: The range of randomly generated integer values.
  :param binary_fraction: The fraction of total columns that are binary-valued.
  :param binary_ones_fraction: The fraction of values in a binary column that are set to 1.
  :param missing_fraction: The fraction of total entries in the data frame that are set to NA.
  :param response_factors: If has_response == True, then this is the number of factor levels in the response
    column.
  :param has_response: A logical value indicating whether an additional response column should be pre-pended
    to the final H2O data frame. If set to True, the total number of columns will be cols+1.
  :param seed: A seed used to generate random values when randomize == True. None is sent to the server as -1.
  :return: the H2OFrame that was created
  """
  # Any falsy key (None or "") counts as "not supplied"; an empty key must never be posted or looked up.
  parms = {"dest": id if id else _py_tmp_key(),
           "rows": rows,
           "cols": cols,
           "randomize": randomize,
           "value": value,
           "real_range": real_range,
           "categorical_fraction": categorical_fraction,
           "factors": factors,
           "integer_fraction": integer_fraction,
           "integer_range": integer_range,
           "binary_fraction": binary_fraction,
           "binary_ones_fraction": binary_ones_fraction,
           "missing_fraction": missing_fraction,
           "response_factors": response_factors,
           "has_response": has_response,
           "seed": -1 if seed is None else seed,
           }
  # Start the job and poll it before reading the frame back.
  H2OJob(H2OConnection.post_json("CreateFrame", **parms), "Create Frame").poll()
  return get_frame(parms["dest"])