Example #1
0
File: h2o.py Project: moidin/h2o-3
def interaction(data, factors, pairwise, max_factors, min_occurrence, destination_frame=None):
  """
  Create categorical interaction features in H2O.

  Posts an "Interaction" request for the given frame and factor columns, polls the resulting job,
  and then fetches the frame stored under the destination key.

  :param data: the H2OFrame that holds the target categorical columns.
  :param factors: the factor columns, given either as integer indices or as column names.
  :param pairwise: whether to create pairwise interactions between factors (otherwise one
  higher-order interaction is created). Only applicable if there are 3 or more factors.
  :param max_factors: max. number of factor levels in pair-wise interaction terms (if enforced, one
  extra catch-all factor will be made).
  :param min_occurrence: min. occurrence threshold for factor levels in pair-wise interaction terms.
  :param destination_frame: a string naming the destination key. When None, a temporary key is
  generated with _py_tmp_key().
  :return: H2OFrame
  """
  data._eager()

  # Integer entries are column positions; translate them to names, keep everything else as given.
  column_names = []
  for factor in factors:
    if isinstance(factor, int):
      factor = data.names()[factor]
    column_names.append(factor)

  if destination_frame is None:
    dest_key = _py_tmp_key()
  else:
    dest_key = destination_frame

  parms = {}
  parms["dest"] = dest_key
  parms["source_frame"] = data._id
  parms["factor_columns"] = [_quoted(name) for name in column_names]
  parms["pairwise"] = pairwise
  parms["max_factors"] = max_factors
  parms["min_occurrence"] = min_occurrence

  job = H2OJob(H2OConnection.post_json("Interaction", **parms), "Interactions")
  job.poll()
  return get_frame(dest_key)
Example #2
0
def _model_build(x, y, vx, vy, algo, offsets, weights, fold_column, kwargs):
    if x is None: raise ValueError("Missing features")
    x = _check_frame(x, y, y)
    vx = _check_frame(vx, vy, y)
    if offsets is not None:
        x, vx = _check_col(x, vx, kwargs["validation_frame"], offsets)
    if weights is not None:
        x, vx = _check_col(x, vx, kwargs["validation_frame"], weights)
    if fold_column is not None:
        x, vx = _check_col(x, vx, kwargs["validation_frame"], fold_column)

    kwargs['training_frame'] = x.frame_id
    if vx is not None: kwargs['validation_frame'] = vx.frame_id
    if y is not None: kwargs['response_column'] = y.names[0]

    kwargs = dict([
        (k,
         kwargs[k].frame_id if isinstance(kwargs[k], H2OFrame) else kwargs[k])
        for k in kwargs if kwargs[k] is not None
    ])

    do_future = kwargs.pop("do_future") if "do_future" in kwargs else False
    future_model = H2OModelFuture(
        H2OJob(H2OConnection.post_json("ModelBuilders/" + algo, **kwargs),
               job_type=(algo + " Model Build")), x)
    return future_model if do_future else _resolve_model(
        future_model, **kwargs)
def _model_build(x,y,validation_x,validation_y,algo_url,kwargs):
  # Basic sanity checking
  if algo_url == "autoencoder":
    if "autoencoder" in kwargs.keys():
      if kwargs["autoencoder"]:
        if y:
          raise ValueError("`y` should not be specified for autoencoder, remove `y` input.")
        algo_url="deeplearning"
  if not x:  raise ValueError("Missing features")
  x = _check_frame(x,y,y)
  if validation_x:
    validation_x = _check_frame(validation_x,validation_y,y)

  # Send frame descriptions to H2O cluster
  train_key = x.send_frame()
  kwargs['training_frame']=train_key
  if validation_x is not None:
    valid_key = validation_x.send_frame()
    kwargs['validation_frame']=valid_key

  if y:
    kwargs['response_column']=y._name

  kwargs = dict([(k, kwargs[k]) for k in kwargs if kwargs[k] is not None])

  # launch the job and poll
  job = H2OJob(H2OConnection.post_json("ModelBuilders/"+algo_url, **kwargs), job_type=(algo_url+" Model Build")).poll()
  model_json = H2OConnection.get_json("Models/"+job.dest_key)["models"][0]
  model_type = model_json["output"]["model_category"]
  if model_type=="Binomial":
    from model.binomial import H2OBinomialModel
    model = H2OBinomialModel(job.dest_key,model_json)

  elif model_type=="Clustering":
    from model.clustering import H2OClusteringModel
    model = H2OClusteringModel(job.dest_key,model_json)

  elif model_type=="Regression":
    from model.regression import H2ORegressionModel
    model = H2ORegressionModel(job.dest_key,model_json)

  elif model_type=="Multinomial":
    from model.multinomial import H2OMultinomialModel
    model = H2OMultinomialModel(job.dest_key,model_json)

  elif model_type=="AutoEncoder":
    from model.autoencoder import H2OAutoEncoderModel
    model = H2OAutoEncoderModel(job.dest_key,model_json)

  else:
    print model_type
    raise NotImplementedError

  # Cleanup
  h2o.remove(train_key)
  if validation_x:
    h2o.remove(valid_key)

  return model
Example #4
0
def parse(setup, h2o_name, first_line_is_header=(-1, 0, 1)):
    """
    Post a blocking "Parse" request built from a parse_setup result and poll the job.

    Note that ``setup`` is modified in place: its column names, column types and NA strings
    are replaced by their quoted forms.

    :param setup: The result of calling parse_setup.
    :param h2o_name: The name of the H2O Frame on the back end.
    :param first_line_is_header: -1 means data, 0 means guess, 1 means header. When left at
        its tuple default, the value of setup["check_header"] is used.
    :return: The ``jobs`` attribute of the polled parse job.
    """
    # The untouched tuple default marks "not specified"; defer to what parse_setup found.
    if isinstance(first_line_is_header, tuple):
        first_line_is_header = setup["check_header"]

    # Parse parameters (None values are filled in from setup below)
    p = dict(
        destination_frame=h2o_name,
        parse_type=None,
        separator=None,
        single_quotes=None,
        check_header=None,
        number_columns=None,
        chunk_size=None,
        delete_on_done=True,
        blocking=True,
        remove_frame=True,
    )

    # Quote the per-column lists in setup and register their keys in p so that they are
    # picked up by the copy step below.
    for field in ("column_names", "column_types"):
        if setup[field]:
            setup[field] = [_quoted(entry) for entry in setup[field]]
            p[field] = None

    if setup["na_strings"]:
        quoted_na = []
        for col in setup["na_strings"]:
            if col is None:
                quoted_na.append([])
            else:
                quoted_na.append([_quoted(na) for na in col])
        setup["na_strings"] = quoted_na
        p["na_strings"] = None

    # Copy every parse_setup value whose key is a known parse parameter.
    for key in p:
        if key in setup:
            p[key] = setup[key]

    p["check_header"] = first_line_is_header

    # Only the 'name' of each source frame is sent.
    p['source_frames'] = [_quoted(src['name']) for src in setup['source_frames']]

    # Request blocking parse
    job = H2OJob(H2OConnection.post_json(url_suffix="Parse", **p), "Parse")
    finished = job.poll()
    return finished.jobs
Example #5
0
File: h2o.py Project: moidin/h2o-3
def export_file(frame,path,force=False):
  """
  Export an H2OFrame to a path by posting an export request and polling the resulting job.

  The path is resolved on the machine this python session is currently connected to. To view
  the current session, call h2o.cluster_info().

  :param frame: The Frame to save to disk.
  :param path: The path to the save point on disk.
  :param force: Overwrite any preexisting file with the same path
  :return: None
  """
  overwrite = "true" if force else "false"
  url = "".join(["Frames/", frame._id, "/export/", path, "/overwrite/", overwrite])
  job = H2OJob(H2OConnection.post_json(url), "Export File")
  job.poll()
Example #6
0
File: h2o.py Project: moidin/h2o-3
def create_frame(id = None, rows = 10000, cols = 10, randomize = True, value = 0, real_range = 100,
                 categorical_fraction = 0.2, factors = 100, integer_fraction = 0.2, integer_range = 100,
                 binary_fraction = 0.1, binary_ones_fraction = 0.02, missing_fraction = 0.01, response_factors = 2,
                 has_response = False, seed=None):
  """
  Create a data frame in H2O.

  Posts a "CreateFrame" request describing a frame with real-valued, categorical, integer, and
  binary columns, polls the job, and fetches the frame stored under the destination key.

  :param id: A string indicating the destination key. When None, a temporary key is generated with
  _py_tmp_key().
  :param rows: The number of rows of data to generate.
  :param cols: The number of columns of data to generate. Excludes the response column if
  has_response == True.
  :param randomize: A logical value indicating whether data values should be randomly generated.
  This must be TRUE if either categorical_fraction or integer_fraction is non-zero.
  :param value: If randomize == FALSE, then all real-valued entries will be set to this value.
  :param real_range: The range of randomly generated real values.
  :param categorical_fraction: The fraction of total columns that are categorical.
  :param factors: The number of (unique) factor levels in each categorical column.
  :param integer_fraction: The fraction of total columns that are integer-valued.
  :param integer_range: The range of randomly generated integer values.
  :param binary_fraction: The fraction of total columns that are binary-valued.
  :param binary_ones_fraction: The fraction of values in a binary column that are set to 1.
  :param missing_fraction: The fraction of total entries in the data frame that are set to NA.
  :param response_factors: If has_response == TRUE, then this is the number of factor levels in the
  response column.
  :param has_response: A logical value indicating whether an additional response column should be
  pre-pended to the final H2O data frame. If set to TRUE, the total number of columns will be cols+1.
  :param seed: A seed used to generate random values when randomize = TRUE. None is sent as -1.
  :return: the H2OFrame that was created
  """
  if id is None:
    dest_key = _py_tmp_key()
  else:
    dest_key = id

  if seed is None:
    seed_value = -1
  else:
    seed_value = seed

  parms = dict(dest=dest_key,
               rows=rows,
               cols=cols,
               randomize=randomize,
               value=value,
               real_range=real_range,
               categorical_fraction=categorical_fraction,
               factors=factors,
               integer_fraction=integer_fraction,
               integer_range=integer_range,
               binary_fraction=binary_fraction,
               binary_ones_fraction=binary_ones_fraction,
               missing_fraction=missing_fraction,
               response_factors=response_factors,
               has_response=has_response,
               seed=seed_value)

  job = H2OJob(H2OConnection.post_json("CreateFrame", **parms), "Create Frame")
  job.poll()
  return get_frame(dest_key)
Example #7
0
def _model_build(x, y, validation_x, validation_y, algo_url, kwargs):
    # Basic sanity checking
    if algo_url == "autoencoder":
        if "autoencoder" in kwargs.keys():
            if kwargs["autoencoder"]:
                if y:
                    raise ValueError(
                        "`y` should not be specified for autoencoder, remove `y` input."
                    )
                algo_url = "deeplearning"
    if not x: raise ValueError("Missing features")
    x = _check_frame(x, y, y)
    if validation_x is not None:
        validation_x = _check_frame(validation_x, validation_y, y)

    if "weights_column" in kwargs.keys():
        x, validation_x = _add_col_to_x_and_validation_x(
            kwargs["weights_column"], x, validation_x, kwargs)
    if "offset_column" in kwargs.keys():
        x, validation_x = _add_col_to_x_and_validation_x(
            kwargs["offset_column"], x, validation_x, kwargs)
    if "fold_column" in kwargs.keys():
        x, validation_x = _add_col_to_x_and_validation_x(kwargs["fold_column"],
                                                         x,
                                                         validation_x,
                                                         kwargs,
                                                         xval=True)

    # Send frame descriptions to H2O cluster
    kwargs['training_frame'] = x._id
    if validation_x is not None: kwargs['validation_frame'] = validation_x._id

    if y is not None: kwargs['response_column'] = y._col_names[0]

    kwargs = dict([(k, kwargs[k]._frame()._id if isinstance(
        kwargs[k], H2OFrame) else kwargs[k]) for k in kwargs
                   if kwargs[k] is not None])

    # launch the job (only resolve the model if do_future is False)
    do_future = "do_future" in kwargs.keys() and kwargs["do_future"]
    if "do_future" in kwargs.keys(): kwargs.pop("do_future")
    future_model = H2OModelFuture(
        H2OJob(H2OConnection.post_json("ModelBuilders/" + algo_url, **kwargs),
               job_type=(algo_url + " Model Build")), x)
    if do_future: return future_model
    else: return _resolve_model(future_model, **kwargs)
Example #8
0
def _model_build(x, y, validation_x, validation_y, algo_url, kwargs):
    # Basic sanity checking
    if not x: raise ValueError("Missing features")
    x = _check_frame(x, y, y)
    if validation_x:
        validation_x = _check_frame(validation_x, validation_y, y)

    # Send frame descriptions to H2O cluster
    train_key = x.send_frame()
    kwargs['training_frame'] = train_key
    if validation_x:
        valid_key = validation_x.send_frame()
        kwargs['validation_frame'] = valid_key

    if y:
        kwargs['response_column'] = y._name

    # launch the job and poll
    job = H2OJob(H2OConnection.post_json("ModelBuilders/" + algo_url,
                                         **kwargs),
                 job_type=(algo_url + " Model Build")).poll()
    model_json = H2OConnection.get_json("Models/" + job.dest_key)["models"][0]
    model_type = model_json["output"]["model_category"]
    if model_type == "Binomial":
        from model.binomial import H2OBinomialModel
        model = H2OBinomialModel(job.dest_key, model_json)

    elif model_type == "Clustering":
        from model.clustering import H2OClusteringModel
        model = H2OClusteringModel(job.dest_key, model_json)

    elif model_type == "Regression":
        from model.regression import H2ORegressionModel
        model = H2ORegressionModel(job.dest_key, model_json)

    else:
        print model_type
        raise NotImplementedError

    # Cleanup
    h2o.remove(train_key)
    if validation_x:
        h2o.remove(valid_key)

    return model
Example #9
0
def parse(setup, h2o_name, first_line_is_header=(-1, 0, 1)):
  """
  Trigger a parse; blocking; removeFrame just keep the Vec keys.

  Note that ``setup`` is modified in place: its column names are replaced by their quoted forms.

  :param setup: The result of calling parse_setup
  :param h2o_name: The name of the H2O Frame on the back end.
  :param first_line_is_header: -1 means data, 0 means guess, 1 means header. When left at its
  tuple default, 0 (guess) is used.
  :return: The ``jobs`` attribute of the polled parse job.
  """
  # The untouched tuple default marks "not specified"; fall back to guessing.
  if isinstance(first_line_is_header, tuple):
    first_line_is_header = 0

  # Parse parameters (None values provided by setup).
  # This must be built unconditionally: it is needed for every value of first_line_is_header,
  # not only when the default was left in place.
  p = {'delete_on_done': True,
       'blocking': True,
       'removeFrame': True,
       'hex': h2o_name,
       'ncols': None,
       'sep': None,
       'pType': None,
       'singleQuotes': None,
       'checkHeader' : None,
      }
  if setup["columnNames"]:
    setup["columnNames"] = [_quoted(name) for name in setup["columnNames"]]
    # Register the key so the update below copies the quoted names across.
    p["columnNames"] = None

  # update the parse parameters with the parse_setup values
  p.update({k: v for k, v in setup.items() if k in p})

  p["checkHeader"] = first_line_is_header

  # Extract only 'name' from each src in the array of srcs
  p['srcs'] = [_quoted(src['name']) for src in setup['srcs']]

  # Request blocking parse
  j = H2OJob(H2OConnection.post_json(url_suffix="Parse", params=p), "Parse").poll()
  return j.jobs