Example #1
  def ddply(self,cols,fun):
    """
    :param cols: Column names used to control grouping
    :param fun: Function to execute on each group.  Right now limited to textual Rapids expression
    :return: New frame with 1 row per-group, of results from 'fun'
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    # Confirm all names present in dataset; collect column indices
    rapids_series = "(llist #"+" #".join([str(self._find_idx(name)) for name in cols])+")"

    # Eagerly eval and send the cbind'd frame over
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    expr = "(= !{} (h2o.ddply %{} {} {}))".format(tmp_key,key,rapids_series,fun)
    h2o.rapids(expr) # ddply in h2o
    # Remove h2o temp frame after ddply
    h2o.remove(key)
    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key) # Fetch the frame as JSON
    fr = j['frames'][0]    # Just the first (only) frame
    rows = fr['rows']      # Row count
    veckeys = fr['vec_ids']# List of h2o vec keys
    cols = fr['columns']   # List of columns
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
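A hedged usage sketch for the method above, with `fr` an H2OFrame from this legacy client; the textual Rapids lambda below is an assumed syntax, not a verified API:

# Hypothetical: one result row per group of the "cyl" column.
fun = "(def anon {x} (mean (cols x #1) %TRUE))"  # assumed Rapids function syntax
means_by_cyl = fr.ddply(["cyl"], fun)
means_by_cyl.show()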
Example #2
  def predict(self, test_data):
    """
    Predict on a dataset.

    :param test_data: Data to be predicted on.
    :return: A new H2OFrame filled with predictions.
    """
    if not test_data: raise ValueError("Must specify test data")
    # cbind the test_data vecs together and produce a temp key
    test_data_key = H2OFrame.send_frame(test_data)
    # get the predictions
    # this job call is blocking
    j = H2OConnection.post_json("Predictions/models/" + self._key + "/frames/" + test_data_key)
    # retrieve the prediction frame
    prediction_frame_key = j["model_metrics"][0]["predictions"]["frame_id"]["name"]
    # get the actual frame metadata
    pred_frame_meta = h2o.frame(prediction_frame_key)["frames"][0]
    # collect the vec_ids
    vec_ids = pred_frame_meta["vec_ids"]
    # get the number of rows
    rows = pred_frame_meta["rows"]
    # get the column names
    cols = [col["label"] for col in pred_frame_meta["columns"]]
    # create a set of H2OVec objects
    vecs = H2OVec.new_vecs(zip(cols, vec_ids), rows)
    # toast the cbound frame
    h2o.remove(test_data_key)
    # return a new H2OFrame object
    return H2OFrame(vecs=vecs)
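For orientation, a minimal usage sketch of the flow above, assuming `model` is a trained model object and `test` an H2OFrame in this same legacy client:

preds = model.predict(test)   # POSTs Predictions/models/.../frames/... as above
preds.head()                  # one prediction row per test row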
Example #3
  def tail(self, rows=10, cols=200, **kwargs):
    """
    Analogous to R's `tail` call on a data.frame. Display a digestible chunk of the H2OFrame starting from the end.

    :param rows: Number of rows to display.
    :param cols: Number of columns to display.
    :param kwargs: Extra arguments passed from other methods.
    :return: None
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    nrows = min(self.nrow(), rows)
    ncols = min(self.ncol(), cols)
    colnames = self.names()[0:ncols]

    exprs = [self[c][(self.nrow()-nrows):(self.nrow())] for c in range(ncols)]
    print "Last", str(nrows), "rows and first", str(ncols), "columns: "
    if nrows != 1:
      fr = H2OFrame.py_tmp_key()
      cbind = "(= !" + fr + " (cbind %"
      cbind += " %".join([expr.eager() for expr in exprs]) + "))"
      res = h2o.rapids(cbind)
      h2o.remove(fr)
      tail_rows = [range(self.nrow()-nrows+1, self.nrow() + 1, 1)]
      tail_rows += [rows[0:nrows] for rows in res["head"][0:ncols]]
      tail = zip(*tail_rows)
      print tabulate.tabulate(tail, headers=["Row ID"] + colnames)
    else:
      print tabulate.tabulate([[self.nrow()] + [expr.eager() for expr in exprs]], headers=["Row ID"] + colnames)
    print
Example #4
def glrm_iris():
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()

    for trans in ["NONE", "DEMEAN", "DESCALE", "STANDARDIZE"]:
        rank = random.randint(1, 7)
        gx = random.uniform(0, 1)
        gy = random.uniform(0, 1)

        print(
            "H2O GLRM with rank k = "
            + str(rank)
            + ", gamma_x = "
            + str(gx)
            + ", gamma_y = "
            + str(gy)
            + ", transform = "
            + trans
        )
        glrm_h2o = h2o.glrm(x=irisH2O, k=rank, loss="Quadratic", gamma_x=gx, gamma_y=gy, transform=trans)
        glrm_h2o.show()

        print("Impute original data from XY decomposition")
        pred_h2o = glrm_h2o.predict(irisH2O)
        pred_h2o.describe()
        h2o.remove(glrm_h2o._model_json["output"]["representation_name"])
Example #5
def test_load_glrm():
  print("Importing iris_wheader.csv data...")
  irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
  irisH2O.describe()

  g_model = H2OGeneralizedLowRankEstimator(k=3)
  g_model.train(x=irisH2O.names, training_frame=irisH2O)
  yarch_old = g_model.archetypes()
  x_old = h2o.get_frame(g_model._model_json["output"]["representation_name"])
  predOld = g_model.predict(irisH2O)
  TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "../..", "results"))

  try:
    TMPDIR = pyunit_utils.locate("results")    # find directory path to results folder
  except:
    os.makedirs(TMPDIR)
  h2o.save_model(g_model, path=TMPDIR, force=True)       # save model
  full_path_filename = os.path.join(TMPDIR, g_model._id)

  h2o.remove(g_model)
  model_reloaded = h2o.load_model(full_path_filename)
  pred = model_reloaded.predict(irisH2O)
  yarch = model_reloaded.archetypes()
  x = h2o.get_frame(model_reloaded._model_json["output"]["representation_name"])

  # assert difference between old and new are close, archetypes should be the same
  pyunit_utils.compare_frames_local(x, x_old, tol=1e-6)
  pyunit_utils.compare_frames_local(pred[0], predOld[0], tol=1)
  for k in range(3):
    pyunit_utils.equal_two_arrays(yarch_old[k], yarch[k], eps = 1e-4, tolerance=1e-10)

  print("glrm model successfully loaded...")
Example #6
  def head(self, rows=10, cols=200, **kwargs):
    """
    Analogous to R's `head` call on a data.frame. Display a digestible chunk of the H2OFrame starting from the beginning.

    :param rows: Number of rows to display.
    :param cols: Number of columns to display.
    :param kwargs: Extra arguments passed from other methods.
    :return: None
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    nrows = min(self.nrow(), rows)
    ncols = min(self.ncol(), cols)
    colnames = self.names()[0:ncols]

    fr = H2OFrame.py_tmp_key()
    cbind = "(= !" + fr + " (cbind %"
    cbind += " %".join([vec._expr.eager() for vec in self]) + "))"
    res = h2o.rapids(cbind)
    h2o.remove(fr)
    head_rows = [range(1, nrows + 1, 1)]
    head_rows += [rows[0:nrows] for rows in res["head"][0:ncols]]
    head = zip(*head_rows)
    print "First", str(nrows), "rows and first", str(ncols), "columns: "
    print tabulate.tabulate(head, headers=["Row ID"] + colnames)
    print
Example #7
  def describe(self):
    """
    Generate an in-depth description of this H2OFrame.

    The description is a tabular print of the type, min, max, sigma, number of zeros,
    and number of missing elements for each H2OVec in this H2OFrame.

    :return: None (print to stdout)
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    print "Rows:", len(self._vecs[0]), "Cols:", len(self)
    headers = [vec._name for vec in self._vecs]
    table = [
      self._row('type', None),
      self._row('mins', 0),
      self._row('mean', None),
      self._row('maxs', 0),
      self._row('sigma', None),
      self._row('zero_count', None),
      self._row('missing_count', None)
    ]

    chunk_summary_tmp_key = H2OFrame.send_frame(self)

    chunk_summary = h2o.frame(chunk_summary_tmp_key)["frames"][0]["chunk_summary"]

    h2o.remove(chunk_summary_tmp_key)

    print tabulate.tabulate(table, headers)
    print
    print chunk_summary
    print
Example #8
  def quantile(self, prob=None, combine_method="interpolate"):
    """
    Compute quantiles over a given H2OFrame.

    :param prob: A list of probabilities; the default is [0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99]. You may provide any sequence of any length.
    :param combine_method: For even samples, how to combine quantiles. Must be one of ["interpolate", "average", "low", "high"]
    :return: an H2OFrame containing the quantiles and probabilities.
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    if len(self) == 0: return self
    if not prob: prob=[0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99]
    if not isinstance(prob, list): raise ValueError("prob must be a list")
    probs = "(dlist #"+" #".join([str(p) for p in prob])+")"
    if combine_method not in ["interpolate","average","low","high"]:
      raise ValueError("combine_method must be one of: [" + ",".join(["interpolate","average","low","high"])+"]")

    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    expr = "(= !{} (quantile '{}' {} '{}'".format(tmp_key,key,probs,combine_method)
    h2o.rapids(expr)
    # Remove h2o temp frame after quantile
    h2o.remove(key)
    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]       # Just the first (only) frame
    rows = fr['rows']         # Row count
    veckeys = fr['vec_ids']  # List of h2o vec keys
    cols = fr['columns']      # List of columns
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
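A short usage sketch, assuming `fr` is a numeric H2OFrame in this legacy client:

# Hypothetical: deciles for every column; the result is a small H2OFrame
# with one row per requested probability.
deciles = fr.quantile(prob=[0.1 * i for i in range(1, 10)], combine_method="interpolate")
deciles.show()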
Example #9
def cumsumminprodmax():
    # TODO PUBDEV-1748
    foo = h2o.H2OFrame(python_obj=[[x, y] for x, y in zip(range(10), range(9, -1, -1))])
    foo.show()

    cumsum1 = foo[0].cumsum()
    cummin1 = foo[0].cummin()
    cumprod1 = foo[1:10, 0].cumprod()
    cummax1 = foo[0].cummax()

    cumsum2 = foo[1].cumsum()
    cummin2 = foo[1].cummin()
    cumprod2 = foo[0:9, 1].cumprod()
    cummax2 = foo[1].cummax()

    assert cumsum1[9, 0] == cumsum2[9, 0] == 45, "expected cumsums to be 45, but got {0} and {1}".format(
        cumsum1[9, 0], cumsum2[9, 0]
    )

    assert cummin1[9, 0] == cummin2[9, 0] == 0, "expected cummin to be 0, but got {0} and {1}".format(
        cummin1[9, 0], cummin2[9, 0]
    )

    assert cummax1[9, 0] == cummax2[9, 0] == 9, "expected cummax to be 9, but got {0} and {1}".format(
        cummax1[9, 0], cummax2[9, 0]
    )

    cumprod1.show()
    print cumprod1.dim
    assert cumprod1[8, 0] == cumprod2[8, 0] == 362880, "expected cumprod to be 362880, but got {0} and " "{1}".format(
        cumprod1[8, 0], cumprod2[8, 0]
    )

    h2o.remove(foo)
Example #10
  def model_performance(self, test_data=None, train=False, valid=False):
    """
    Generate model metrics for this model on test_data.

    :param test_data: Data set against which model metrics shall be computed. Both train and valid arguments are ignored if test_data is not None.
    :param train: Report the training metrics for the model. If test_data is the training data, the training metrics are returned.
    :param valid: Report the validation metrics for the model. If both train and valid are True, the training metrics are returned (train takes precedence).
    :return: An object of class H2OModelMetrics.
    """
    if test_data is None:
      if not train and not valid:
        train = True  # default to train

      if train:
        return self._model_json["output"]["training_metrics"]

      if valid:
        return self._model_json["output"]["validation_metrics"]

    else:  # cases dealing with test_data not None
      if not isinstance(test_data, H2OFrame):
        raise ValueError("`test_data` must be of type H2OFrame.  Got: " + type(test_data))
      fr_key = H2OFrame.send_frame(test_data)
      res = H2OConnection.post_json("ModelMetrics/models/" + self._key + "/frames/" + fr_key)
      h2o.remove(fr_key)

      # FIXME need to do the client-side filtering...  PUBDEV-874:   https://0xdata.atlassian.net/browse/PUBDEV-874
      raw_metrics = None
      for mm in res["model_metrics"]:
        if mm["frame"]["name"] == fr_key:
          raw_metrics = mm
          break
      return self._metrics_class(raw_metrics,algo=self._model_json["algo"])
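The three branches above correspond to three call styles; a hedged sketch, assuming `model` was trained with a validation frame and `test` is a separate H2OFrame:

perf_train = model.model_performance(train=True)      # stored training metrics
perf_valid = model.model_performance(valid=True)      # stored validation metrics
perf_test  = model.model_performance(test_data=test)  # computed against test data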
Example #11
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_airline_05_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/air05_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/air05_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
            endcsv = time.time()

            csv_type_dict = multi_file_csv.types

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            col_ind_name = dict()
            # change column types from real to enum according to multi_file_csv column types
            for key_name in list(csv_type_dict):
                col_ind = key_name.split('C')
                new_ind = int(str(col_ind[1]))-1
                col_ind_name[new_ind] = key_name

            col_types = []
            for ind in range(len(col_ind_name)):
                col_types.append(csv_type_dict[col_ind_name[ind]])

            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)

            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc,col_types=col_types)
            endorc = time.time()

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            print("************** CSV parse time is {0}".format(endcsv-startcsv))
            print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1))
            print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc))
            # compare the ORC frame (parsed with forced column types) against the CSV frame
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError("Hadoop namenode is not accessible")
Example #12
def test_in():
    iris = h2o.import_file(tests.locate("smalldata/iris/iris.csv"))

    assert 5.1 in iris[0], "expected 5.1 to be in the first column, but it wasn't"
    assert 1.7 in iris, "expected 1.7 to be in the dataset, but it wasn't"
    assert not 99 in iris, "didn't expect 99 to be in the dataset, but it was"
    assert "Iris-setosa" in iris[4], "expected Iris-setosa to be in the dataset, but it wasn't"

    h2o.remove(iris)
Example #13
def _model_build(x,y,validation_x,validation_y,algo_url,kwargs):
  # Basic sanity checking
  if not x:  raise ValueError("Missing features")
  x = _check_frame(x,y,y)
  if validation_x:
    validation_x = _check_frame(validation_x,validation_y,y)

  # Send frame descriptions to H2O cluster
  train_key = x.send_frame()
  kwargs['training_frame']=train_key
  if validation_x:
    valid_key = validation_x.send_frame()
    kwargs['validation_frame']=valid_key

  if y:
    kwargs['response_column']=y._name

  kwargs = dict([(k, kwargs[k]) for k in kwargs if kwargs[k] is not None])

  # launch the job and poll
  job = H2OJob(H2OConnection.post_json("ModelBuilders/"+algo_url, **kwargs), job_type=(algo_url+" Model Build")).poll()
  model_json = H2OConnection.get_json("Models/"+job.dest_key)["models"][0]
  model_type = model_json["output"]["model_category"]
  if model_type=="Binomial":
    from model.binomial import H2OBinomialModel
    model = H2OBinomialModel(job.dest_key,model_json)

  elif model_type=="Clustering":
    from model.clustering import H2OClusteringModel
    model = H2OClusteringModel(job.dest_key,model_json)

  elif model_type=="Regression":
    from model.regression import H2ORegressionModel
    model = H2ORegressionModel(job.dest_key,model_json)

  elif model_type=="Multinomial":
    from model.multinomial import H2OMultinomialModel
    model = H2OMultinomialModel(job.dest_key,model_json)

  elif model_type=="AutoEncoder":
    from model.autoencoder import H2OAutoEncoderModel
    model = H2OAutoEncoderModel(job.dest_key,model_json)

  else:
    print model_type
    raise NotImplementedError

  # Cleanup
  h2o.remove(train_key)
  if validation_x:
    h2o.remove(valid_key)

  return model
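One detail worth noting above is the dict comprehension that strips None-valued entries from kwargs, so unset parameters never reach the REST payload. A standalone illustration of the same idiom (values made up):

kwargs = {"ntrees": 50, "max_depth": None, "seed": 42}
kwargs = dict((k, v) for k, v in kwargs.items() if v is not None)
print(kwargs)  # {'ntrees': 50, 'seed': 42}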
Example #14
def test_prod():

    data = [[random.uniform(1,10)] for c in range(10)]
    h2o_data = h2o.H2OFrame(list(zip(*data)))
    np_data = np.array(data)

    h2o_prod = h2o_data.prod()
    np_prod = np.prod(np_data)

    assert abs(h2o_prod - np_prod) < 1e-06, "check unsuccessful! h2o computed {0} and numpy computed {1}. expected " \
                                            "equal prod values between h2o and numpy".format(h2o_prod,np_prod)
    h2o.remove(h2o_data)
Example #15
def h2oremove():
    """
    Python API test: h2o.remove(x)
    """
    # call with no arguments
    try:
        training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
        assert_is_type(training_data, H2OFrame)
        h2o.remove(training_data)
        training_data.nrow  # this command should generate an error since training_data should have been deleted.
    except Exception as e:
        assert_is_type(e, AttributeError)
        assert "object has no attribute" in e.args[0], "h2o.remove() command is not working."
Example #16
def models_stress_test():
    data = h2o.import_file(pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    # The ulimit for stress tests should be set low enough (e.g. 100) so that this test fails if file descriptors leak.
    num_models = 1000
    start = timer()

    for i in range(0,num_models):
        xgb = H2OXGBoostEstimator(ntrees = 1, max_depth= 2)
        xgb.train(x = ["Origin","Distance"],y="IsDepDelayed", training_frame=data)
        h2o.remove(xgb)

    end = timer()
    print ('Trained {} models in {} seconds.'.format(num_models, end - start))
Example #17
 def model_performance(self, test_data):
   """
   Generate model metrics for this model on test_data.
   :param test_data: Data set for which model metrics shall be computed against.
   :return: An object of class H2OModelMetrics.
   """
   if not test_data:  raise ValueError("Missing `test_data`.")
   if not isinstance(test_data, H2OFrame):
     raise ValueError("`test_data` must be of type H2OFrame.  Got: " + type(test_data))
   fr_key = H2OFrame.send_frame(test_data)
   res = H2OConnection.post_json("ModelMetrics/models/" + self._key + "/frames/" + fr_key)
   h2o.remove(fr_key)
   raw_metrics = res["model_metrics"][0]
   return self._metrics_class(raw_metrics)
Example #18
def pca_car():
  num_runs = 10
  run_time_c = []

  car = h2o.import_file(path=pyunit_utils.locate("smalldata/pca_test/car.arff.txt"))  # Nidhi: import may not work
  for run_index in range(num_runs):  # multiple runs to get an idea of run time info
    carPCA = H2OPCA(k=car.ncols, transform="STANDARDIZE")
    carPCA.train(x=list(range(0, car.ncols)), training_frame=car)
    run_time_c.append(carPCA._model_json['output']['end_time']-carPCA._model_json['output']['start_time'])
    print("PCA model training time with car.arff.txt data in ms is {0}".format(run_time_c[run_index]))

    h2o.remove(carPCA)

  assert (max(run_time_c)) < 60000, "PCA runs for car.arff.txt take too much time!"
Example #19
 def __del__(self):
   # Dead pending op or local data; nothing to delete
   if self.is_pending() or self.is_local(): return
   assert self.is_remote(), "Data wasn't remote. Hrm..."
   global __CMD__
   if __CMD__ is None:
     h2o.remove(self._data)
   else:
     s = " (del '" + self._data + "' #0)"
     global __TMPS__
     if __TMPS__ is None:
       print "Lost deletes: ", s
     else:
       __TMPS__ += s
Example #20
def glm_solvers():
    predictors = ["displacement","power","weight","acceleration","year"]

    for solver in ["AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE", "COORDINATE_DESCENT"]:
        print("Solver = {0}".format(solver))
        for family in ["binomial", "gaussian", "poisson", "tweedie", "gamma"]:
            if   family == 'binomial': response_col = "economy_20mpg"
            elif family == 'gaussian': response_col = "economy"
            else:                      response_col = "cylinders"
            print("Family = {0}".format(family))
            training_data = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
            if   family == 'binomial': training_data[response_col] = training_data[response_col].asfactor()
            else:                      training_data[response_col] = training_data[response_col].asnumeric()
            model = h2o.glm(x=training_data[predictors], y=training_data[response_col], family=family, alpha=[0], Lambda=[1e-5], solver=solver)
            h2o.remove(training_data)
Example #21
def glrm_prostate_miss():
    missing_ratios = np.arange(0.1, 1, 0.1).tolist()
    
    print("Importing prostate_cat.csv data and saving for validation...")
    prostate_full = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"), na_strings=["NA"]*8)
    prostate_full.describe()
    totnas = 0
    for i in range(prostate_full.ncol):
        totnas = totnas + prostate_full[i].isna().sum()
    totobs = prostate_full.nrow * prostate_full.ncol - totnas
    
    train_numerr = [0]*len(missing_ratios)
    valid_numerr = [0]*len(missing_ratios)
    train_caterr = [0]*len(missing_ratios)
    valid_caterr = [0]*len(missing_ratios)
    
    for i in range(len(missing_ratios)):
        ratio = missing_ratios[i]
        print("Importing prostate_cat.csv and inserting {0}% missing entries".format(100*ratio))
        prostate_miss = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
        prostate_miss = prostate_miss.insert_missing_values(fraction=ratio)
        prostate_miss.describe()
        
        print("H2O GLRM with {0}% missing entries".format(100*ratio))
        prostate_glrm = h2o.glrm(x=prostate_miss, validation_frame=prostate_full, k=8, ignore_const_cols=False, loss="Quadratic", gamma_x=0.5, gamma_y=0.5, regularization_x="L1", regularization_y="L1", init="SVD", max_iterations=2000, min_step_size=1e-6)
        prostate_glrm.show()
        
        # Check imputed data and error metrics
        train_numcnt = prostate_glrm._model_json['output']['training_metrics']._metric_json['numcnt']
        valid_numcnt = prostate_glrm._model_json['output']['validation_metrics']._metric_json['numcnt']
        train_catcnt = prostate_glrm._model_json['output']['training_metrics']._metric_json['catcnt']
        valid_catcnt = prostate_glrm._model_json['output']['validation_metrics']._metric_json['catcnt']
        assert valid_numcnt >= train_numcnt, "Number of non-missing numeric entries in training data should be less than or equal to validation data"
        assert valid_catcnt >= train_catcnt, "Number of non-missing categorical entries in training data should be less than or equal to validation data"
        assert (train_numcnt + valid_numcnt) < totobs, "Total non-missing numeric entries in training and validation data was {0}, but should be less than {1}".format(train_numcnt + valid_numcnt, totobs)
        assert (valid_numcnt + valid_catcnt) == totobs, "Number of non-missing entries in validation data was {0}, but should be {1}".format(valid_numcnt + valid_catcnt, totobs)

        train_numerr[i] = prostate_glrm._model_json['output']['training_metrics']._metric_json['numerr']
        valid_numerr[i] = prostate_glrm._model_json['output']['validation_metrics']._metric_json['numerr']
        train_caterr[i] = prostate_glrm._model_json['output']['training_metrics']._metric_json['caterr']
        valid_caterr[i] = prostate_glrm._model_json['output']['validation_metrics']._metric_json['caterr']
        h2o.remove(prostate_glrm._model_json['output']['representation_name'])
    
    for i in range(len(missing_ratios)):
        print("Missing ratio: {0}% --> Training numeric error: {1}\tValidation numeric error: {2}".format(missing_ratios[i]*100, train_numerr[i], valid_numerr[i]))
        
    for i in range(len(missing_ratios)):
        print("Missing ratio: {0}% --> Training categorical error: {1}\tValidation categorical error: {2}".format(missing_ratios[i]*100, train_caterr[i], valid_caterr[i]))
Example #22
File: expr.py Project: hdfeos/h2o-3
 def __del__(self):
   # Dead pending op or local data; nothing to delete
   if self.is_pending() or self.is_local(): return
   assert self.is_remote(), "Data wasn't remote. Hrm..."
   global __CMD__
   if __CMD__ is None:
     if h2o is not None:
       h2o.remove(self._data)
   else:
     # Hard/deep remove of a Vec, built into a rapids expression
     s = " (del '" + self._data + "')"
     global __TMPS__
     if __TMPS__ is None:
       print "Lost deletes: ", s
     else:
       __TMPS__ += s
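Two details carry the logic here: the `h2o is not None` guard protects against destructor runs during interpreter teardown, and deletes issued while a Rapids expression is being assembled are appended to __TMPS__ so they ship with the next command instead of firing mid-build. A minimal standalone sketch of that deferral pattern, with every name hypothetical:

# Hypothetical sketch of the deferred-delete pattern used above.
pending = []       # stands in for __TMPS__
building = True    # stands in for __CMD__ being non-None

def delete_key(key):
    if building:
        pending.append(" (del '" + key + "')")  # batch the delete for later
    else:
        print("remove now: " + key)             # would remove immediately

delete_key("py_tmp_1")
print("queued:" + "".join(pending))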
Example #23
  def head(self, rows=10, cols=200, **kwargs):
    nrows = min(self.nrow(), rows)
    ncols = min(self.ncol(), cols)
    colnames = self.names()[0:ncols]

    fr = H2OFrame.py_tmp_key()
    cbind = "(= !" + fr + " (cbind %"
    cbind += " %".join([vec._expr.eager() for vec in self]) + "))"
    res = h2o.rapids(cbind)
    h2o.remove(fr)
    head_rows = [range(1, nrows + 1, 1)]
    head_rows += [rows[0:nrows] for rows in res["head"][0:ncols]]
    head = zip(*head_rows)
    print "First", str(nrows), "rows and first", str(ncols), "columns: "
    print tabulate.tabulate(head, headers=["Row ID"] + colnames)
    print
Example #24
def import_folder():
    """
    This test builds an H2O frame by importing the files in bigdata/laptop/parser/orc/airlines_05p_orc_csv
    and builds another H2O frame with the multi-file orc parser from the orc files saved in the
    directory bigdata/laptop/parser/orc/airlines_05p_orc.  It then compares the two frames to make
    sure they are equal.
    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    startcsv = time.time()
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_csv"),
                                     na_strings=['\\N'])
    endcsv = time.time()

    csv_type_dict = multi_file_csv.types

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

    col_ind_name = dict()
    # change column types from real to enum according to multi_file_csv column types
    for key_name in list(csv_type_dict):
        col_ind = key_name.split('C')
        new_ind = int(str(col_ind[1]))-1
        col_ind_name[new_ind] = key_name

    col_types = []
    for ind in range(len(col_ind_name)):
        col_types.append(csv_type_dict[col_ind_name[ind]])

    startorc1 = time.time()
    multi_file_orc1 = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc"))
    endorc1 = time.time()
    h2o.remove(multi_file_orc1)

    startorc = time.time()
    multi_file_orc = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc"),
                                     col_types=col_types)
    endorc = time.time()

    multi_file_orc.summary()
    orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

    print("************** CSV parse time is {0}".format(endcsv-startcsv))
    print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1))
    print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc))
    # compare the ORC frame (parsed with forced column types) against the CSV frame
    pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
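The reordering loop above exists because `frame.types` is an unordered mapping keyed by default column names ("C1", "C2", ...), while import_file's col_types wants a positional list. A standalone illustration with a made-up types dict:

# Hypothetical: order a {name: type} dict by the numeric suffix of
# default H2O column names.
csv_type_dict = {"C2": "enum", "C1": "real", "C3": "int"}
col_types = [csv_type_dict["C" + str(i + 1)] for i in range(len(csv_type_dict))]
print(col_types)  # ['real', 'enum', 'int']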
Example #25
 def var(self):
   """
   :return: The covariance matrix of the columns in this H2OFrame.
   """
   key = self.send_frame()
   tmp_key = H2OFrame.py_tmp_key()
   expr = "(= !{} (var %{} \"null\" %FALSE \"everything\"))".format(tmp_key,key)
   h2o.rapids(expr)
   # Remove h2o temp frame after var
   h2o.remove(key)
   j = h2o.frame(tmp_key)
   fr = j['frames'][0]
   rows = fr['rows']
   veckeys = fr['vec_keys']
   cols = fr['columns']
   colnames = [col['label'] for col in cols]
   return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
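A one-line usage sketch, assuming `fr` is an all-numeric H2OFrame in this legacy client:

cov = fr.var()  # covariance matrix: one row and one column per input column
cov.show()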
Example #26
  def eager(self):
    """
    This forces a top-level execution, as needed, and produces a top-level result
    locally. Frames are returned and truncated to the standard preview response
    provided by rapids - 100 rows X 200 cols.

    :return: A key pointing to the big data object
    """
    if self.is_computed(): return self._data
    # Gather the computation path for remote work, or doit locally for local work
    global __CMD__, __TMPS__
    assert not __CMD__ and not __TMPS__
    __CMD__ = ""
    __TMPS__ = ""    # Begin gathering rapids commands
    dummy = self     # Force extra refcnt so we get a top-level assignment in do_it
    self._do_it()    # Symbolically execute the command
    cmd = __CMD__
    tmps = __TMPS__  # Stop  gathering rapids commands
    __CMD__ = None
    __TMPS__ = None
    if self.is_local():  return self._data  # Local computation, all done

    # Remote computation - ship Rapids over wire, assigning key to result
    if tmps:
      cmd = "(, " + cmd + tmps + ")"
    j = h2o.rapids(cmd)
    if j['result_type'] == 0:
      pass  # Big Data Key is the result
    # Small data result pulled locally
    elif j['num_rows']:   # basically checks if num_rows is nonzero... sketchy.
      self._data = j['head']
    elif j['result'] in [u'TRUE', u'FALSE']:
      self._data = (j['result'] == u'TRUE')
    elif j['result_type'] in [1,2,3,4]:
      if isinstance(j['string'], str):
        self._data = j['string']
      elif isinstance(j['string'], unicode):
        self._data = j['string'].encode('utf-8')
      else:
        if not hasattr(j['scalar'], '__len__'): self._data = j['scalar']

    if j['result_type'] in [3,4]:
      for key in j['vec_ids']:
        h2o.remove(key['name'])

    return self._data
Example #27
def h2o_H2OFrame_prod():
    """
    Python API test: h2o.frame.H2OFrame.prod(na_rm=False)

    Copied from pyunit_prod.py
    """
    data = [[random.uniform(1,10)] for c in range(10)]
    h2o_data = h2o.H2OFrame(data)
    np_data = np.array(data)

    h2o_prod = h2o_data.prod(na_rm=True)
    np_prod = np.prod(np_data)

    assert abs(h2o_prod - np_prod) < 1e-06, "check unsuccessful! h2o computed {0} and numpy computed {1}. expected " \
                                            "equal prod values between h2o and numpy".format(h2o_prod,np_prod)

    h2o.remove(h2o_data)
Example #28
  def group_by(self,cols,a):
    """
    Group this H2OFrame by the given columns and compute the requested aggregates.
    :param cols: The columns to group on.
    :param a: A dictionary of aggregates having the following shape: \
    {"colname":[aggregate, column, naMethod]}\
    e.g.: {"bikes":["count", 0, "all"]}\

    The naMethod is one of "all", "ignore", or "rm", which specifies how to handle
    NAs that appear in columns that are being aggregated.

    "all" - include NAs
    "rm"  - exclude NAs
    "ignore" - ignore NAs in aggregates, but count them (e.g. in denominators for mean, var, sd, etc.)
    :return: The group by frame.
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    rapids_series = "(llist #"+" #".join([str(self._find_idx(name)) for name in cols])+")"
    aggregates = copy.deepcopy(a)
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()

    aggs = []

    # transform cols in aggregates to their indices...
    for k in aggregates:
      if isinstance(aggregates[k][1],str):
        aggregates[k][1] = '#'+str(self._find_idx(aggregates[k][1]))
      else:
        aggregates[k][1] = '#'+str(aggregates[k][1])
      aggs+=["\"{1}\" {2} \"{3}\" \"{0}\"".format(str(k),*aggregates[k])]
    aggs = "(agg {})".format(" ".join(aggs))

    expr = "(= !{} (GB %{} {} {}))".format(tmp_key,key,rapids_series,aggs)
    h2o.rapids(expr)  # group by
    # Remove h2o temp frame after groupby
    h2o.remove(key)
    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]       # Just the first (only) frame
    rows = fr['rows']         # Row count
    veckeys = fr['vec_ids']  # List of h2o vec keys
    cols = fr['columns']      # List of columns
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
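A usage sketch following the aggregate shape documented above, with `fr` an H2OFrame whose column names here ("Days", column 0) are assumptions:

# Hypothetical: count column 0 within each "Days" group, naming the result
# "bikes" and including NAs ("all"), per {"colname": [aggregate, column, naMethod]}.
grouped = fr.group_by(["Days"], {"bikes": ["count", 0, "all"]})
grouped.show()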
Example #29
def remove_obj_client():

  training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
  
  Y = 3
  X = range(3) + range(4,11)
  
  from h2o.estimators.glm import H2OGeneralizedLinearEstimator
  model = H2OGeneralizedLinearEstimator(family="binomial", alpha=0, Lambda=1e-5)
  print model.model_id
  print model
  model.train(x=X,y=Y, training_frame=training_data)
  print model
  h2o.remove(model)
  print model
  
  h2o.remove(training_data)
  print training_data
Example #30
def h2o_H2OFrame_all():
    """
    Python API test: h2o.frame.H2OFrame.all(), h2o.frame.H2OFrame.any()
    """
    python_lists=[[True, False], [False, True], [True, True], [True, 'NA']]
    h2oframe = h2o.H2OFrame(python_obj=python_lists, na_strings=['NA']) # contains True, False and NA
    assert not(h2oframe.all()), "h2o.H2OFrame.all() command is not working." # not all elements are True
    assert h2oframe.any(), "h2o.H2OFrame.any() command is not working." # at least one element is True
    h2o.remove(h2oframe)
    python_lists=[[True, True], [True, True], [True, True], [True, 'NA']]   # check with one boolean level only
    h2oframe = h2o.H2OFrame(python_obj=python_lists, na_strings=['NA']) # contains only True and NA
    assert h2oframe.all(), "h2o.H2OFrame.all() command is not working." # all elements are True or NA
    assert h2oframe.any(), "h2o.H2OFrame.any() command is not working." # at least one element is True
    h2o.remove(h2oframe)
    python_lists=[[False, False], [False, False], [False, False], [False, 'NA']] # check with one boolean level only
    h2oframe = h2o.H2OFrame(python_obj=python_lists, na_strings=['NA']) # contains only False and NA
    assert not(h2oframe.all()), "h2o.H2OFrame.all() command is not working." # all elements are False or NA
    assert h2oframe.any(), "h2o.H2OFrame.any() command is not working." # any() comes back True here because of the NA entry
Example #31
def pubdev_6603():
    hf = h2o.H2OFrame(pd.DataFrame([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]))
    s1, s2 = hf.split_frame(ratios=[0.5], seed=1)
    h2o.remove([hf, s1, s2])
    assert len(h2o.ls()) == 0
Example #32
def run(dataset, config):
    log.info(f"\n**** H2O AutoML [v{h2o.__version__}] ****\n")
    save_metadata(config, version=h2o.__version__)
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           r2='r2',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping[
        config.metric] if config.metric in metrics_mapping else None
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.",
                    config.metric)

    try:
        training_params = {
            k: v
            for k, v in config.framework_params.items()
            if not k.startswith('_')
        }
        nthreads = config.framework_params.get('_nthreads', config.cores)
        jvm_memory = str(
            round(config.max_mem_size_mb * 2 /
                  3)) + "M"  # leaving 1/3rd of available memory for XGBoost

        log.info("Starting H2O cluster with %s cores, %s memory.", nthreads,
                 jvm_memory)
        max_port_range = 49151
        min_port_range = 1024
        rnd_port = os.getpid() % (max_port_range -
                                  min_port_range) + min_port_range
        port = config.framework_params.get('_port', rnd_port)

        init_params = config.framework_params.get('_init', {})
        if "logs" in config.framework_params.get('_save_artifacts', []):
            init_params['ice_root'] = output_subdir("logs", config)

        h2o.init(nthreads=nthreads,
                 port=port,
                 min_mem_size=jvm_memory,
                 max_mem_size=jvm_memory,
                 **init_params)

        import_kwargs = {}
        # Load train as an H2O Frame, but test as a Pandas DataFrame
        log.debug("Loading train data from %s.", dataset.train.path)
        train = None
        if version.parse(h2o.__version__) >= version.parse(
                "3.32.0.3"
        ):  # previous versions may fail to parse correctly some rare arff files using single quotes as enum/string delimiters (pandas also fails on same datasets)
            import_kwargs['quotechar'] = '"'
            train = h2o.import_file(dataset.train.path,
                                    destination_frame=frame_name(
                                        'train', config),
                                    **import_kwargs)
            if train.nlevels() != dataset.domains.cardinalities:
                h2o.remove(train)
                train = None
                import_kwargs['quotechar'] = "'"

        if not train:
            train = h2o.import_file(dataset.train.path,
                                    destination_frame=frame_name(
                                        'train', config),
                                    **import_kwargs)
            # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path,
                               destination_frame=frame_name('test', config),
                               **import_kwargs)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name,
                 config.fold)
        log.debug(
            "Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
            config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        monitor = (
            BackendMemoryMonitoring(
                frequency_seconds=config.ext.monitoring.frequency_seconds,
                check_on_exit=True,
                verbosity=config.ext.monitoring.verbosity)
            if config.framework_params.get('_monitor_backend', False)
            # else contextlib.nullcontext  # Py 3.7+ only
            else contextlib.contextmanager(lambda: (_ for _ in (0, )))())
        with utils.Timer() as training:
            with monitor:
                aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise FrameworkError(
                "H2O could not produce any model in the requested time.")

        with utils.Timer() as predict:
            preds = aml.predict(test)

        preds = extract_preds(preds, test, dataset=dataset)
        save_artifacts(aml, dataset=dataset, config=config)

        return result(output_file=config.output_predictions_file,
                      predictions=preds.predictions,
                      truth=preds.truth,
                      probabilities=preds.probabilities,
                      probabilities_labels=preds.probabilities_labels,
                      models_count=len(aml.leaderboard),
                      training_duration=training.duration,
                      predict_duration=predict.duration)

    finally:
        con = h2o.connection()
        if con:
            # h2o.remove_all()
            con.close()
            if con.local_server:
                con.local_server.shutdown()