Python rapids Examples, h2o.rapids Python Examples

Example #1

0

Show file

File: frame.py Project: VenkatForkedRepos/h2o-dev

  def cbind(self,data):
    """
    Column-bind `data` after this object and wrap the result in a new H2OFrame.

    This object is used as a single column: its name() and _expr are read
    exactly like those of the other columns being bound.

    :param data: H2OFrame or H2OVec
    :return: new H2OFrame with data cbinded to the end
    :raises ValueError: if data is neither an H2OFrame nor an H2OVec
    """
    # This object always comes first, followed by the column(s) taken from `data`.
    if isinstance(data,H2OFrame):
      vecs = [self]
      vecs.extend(data._vecs)
    elif isinstance(data,H2OVec):
      vecs = [self, data]
    else:
      raise ValueError("data parameter must be H2OVec or H2OFrame")
    names = [vec.name() for vec in vecs]

    # Evaluate each column eagerly and bind the results under a temporary key.
    tmp_key = H2OFrame.py_tmp_key()
    expr = "(= !" + tmp_key + " (cbind %"
    expr += " %".join([vec._expr.eager() for vec in vecs]) + "))"
    h2o.rapids(expr)

    # Read the bound frame's description back and rebuild local column handles.
    frame_json = h2o.frame(tmp_key)['frames'][0]
    rows = frame_json['rows']
    veckeys = frame_json['vec_ids']
    colnames = [col['label'] for col in frame_json['columns']]
    result = H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
    # Apply the names that were collected from the input columns.
    result.setNames(names)
    return result

Example #2

0

Show file

def gbm_reweight_tree():
    """
    Check how GBM contributions react to updating the tree weights.

    Scaling every row weight by the same constant must leave the predicted
    contributions unchanged, while re-weighting on a 10-row subset is expected
    to move the minimum of the BiasTerm column.
    """
    weights_col = "weights"
    update_weights = '(sharedtree.update.weights {} {} "{}")'

    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    for categorical in ("RACE", "CAPSULE"):
        frame[categorical] = frame[categorical].asfactor()

    predictors = ["AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]
    response = 'CAPSULE'

    model = H2OGradientBoostingEstimator()
    model.train(x=predictors, y=response, training_frame=frame)

    # 1. Baseline contributions on the untouched model
    baseline = model.predict_contributions(frame)
    expected_columns = [
        u'AGE', u'RACE', u'DPROS', u'DCAPS', u'PSA', u'VOL', u'GLEASON',
        u'BiasTerm'
    ]
    assert baseline.col_names == expected_columns

    # 2. A constant weight of 2 on every row => contributions should stay the same
    frame[weights_col] = 2
    h2o.rapids(update_weights.format(model.model_id, frame.frame_id,
                                     weights_col))
    rescaled = model.predict_contributions(frame)
    assert_frame_equal(rescaled.as_data_frame(), baseline.as_data_frame())

    # 3. Re-weight on the first 10 rows only => contributions are expected to change
    subset = frame.head(10)
    h2o.rapids(update_weights.format(model.model_id, subset.frame_id,
                                     weights_col))
    subset_contribs = model.predict_contributions(subset)
    subset_bias = subset_contribs["BiasTerm"].min()
    baseline_bias = baseline["BiasTerm"].min()
    assert subset_bias != baseline_bias

Example #3

0

Show file

File: frame.py Project: VenkatForkedRepos/h2o-dev

  def ddply(self,cols,fun):
    """
    Run `fun` over the groups defined by the given columns.

    :param cols: Names of the columns that control the grouping
    :param fun: Function to run on each group; spliced into the Rapids call as text
    :return: New H2OFrame built from the frame the ddply call produces
    :raises ValueError: if this frame has no vecs (it was removed)
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    # Resolving every name through _find_idx doubles as the "is it present" check
    group_idxs = [str(self._find_idx(name)) for name in cols]
    rapids_series = "(llist #" + " #".join(group_idxs) + ")"

    # Ship this frame over, then run ddply against it under a fresh result key
    src_key = self.send_frame()
    out_key = H2OFrame.py_tmp_key()
    h2o.rapids("(= !{} (h2o.ddply %{} {} {}))".format(out_key,src_key,rapids_series,fun))
    # The shipped copy is only needed for the call itself
    h2o.remove(src_key)

    # Describe the result frame and wrap its remote vecs locally
    frame_json = h2o.frame(out_key)['frames'][0]
    nrows = frame_json['rows']
    vec_keys = frame_json['vec_ids']
    labels = [column['label'] for column in frame_json['columns']]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(labels, vec_keys), nrows))

Example #4

0

Show file

def test_workaround_for_distribution():
    """
    Train AutoML with distribution/family 'poisson' passed through
    algo_parameters and print the distribution (or family) of every model on
    the leaderboard.

    The `sys.ai.h2o.automl.algo_parameters.all.enabled` property is set to
    "true" for the run and is always set back to "false" afterwards.
    """
    set_property = "(setproperty \"{}\" \"{}\")"
    flag = "sys.ai.h2o.automl.algo_parameters.all.enabled"
    try:
        h2o.rapids(set_property.format(flag, "true"))
        ds = import_dataset('regression')
        aml = H2OAutoML(project_name="py_test",
                        algo_parameters=dict(
                            distribution='poisson',
                            family='poisson',
                        ),
                        exclude_algos=['StackedEnsemble'],
                        max_runtime_secs=60,
                        seed=1)
        aml.train(y=ds.target, training_frame=ds.train)
        model_names = [
            aml.leaderboard[i, 0] for i in range(aml.leaderboard.nrows)
        ]
        for mn in model_names:
            m = h2o.get_model(mn)
            # Prefer 'distribution'; fall back to 'family'; otherwise report None
            if 'distribution' in m.params:
                dist = m.params['distribution']
            elif 'family' in m.params:
                dist = m.params['family']
            else:
                dist = None
            print("{}: distribution = {}".format(mn, dist))
    finally:
        # `finally` instead of a bare `except`: the property is reset on success
        # as well as on failure, and failures are no longer silently swallowed.
        h2o.rapids(set_property.format(flag, "false"))

Example #5

0

Show file

File: frame.py Project: VenkatForkedRepos/h2o-dev

  def quantile(self, prob=None, combine_method="interpolate"):
    """
    Compute quantiles over this H2OFrame.

    :param prob: A list of probabilities; when empty or None the default
      [0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99] is used.
    :param combine_method: How to combine quantiles for even samples. Must be one of
      ["interpolate", "average", "low", "high"].
    :return: This frame itself when it is empty, otherwise an H2OFrame built from the
      frame the quantile call produces.
    :raises ValueError: if the frame was removed, prob is not a list, or combine_method
      is not one of the accepted values.
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    if len(self) == 0:
      return self
    if not prob:
      prob = [0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99]
    if not isinstance(prob, list):
      raise ValueError("prob must be a list")
    probs = "(dlist #" + " #".join([str(p) for p in prob]) + ")"
    allowed_methods = ["interpolate","average","low","high"]
    if combine_method not in allowed_methods:
      raise ValueError("combine_method must be one of: [" + ",".join(allowed_methods)+"]")

    src_key = self.send_frame()
    out_key = H2OFrame.py_tmp_key()
    # NOTE(review): this expression is sent without its closing parentheses, exactly as
    # before -- confirm that the Rapids parser on the server side accepts that.
    h2o.rapids("(= !{} (quantile '{}' {} '{}'".format(out_key,src_key,probs,combine_method))
    # The shipped copy of this frame is not needed once the quantiles are computed
    h2o.remove(src_key)

    # Describe the result frame and wrap its remote vecs locally
    frame_json = h2o.frame(out_key)['frames'][0]
    nrows = frame_json['rows']
    vec_keys = frame_json['vec_ids']
    labels = [column['label'] for column in frame_json['columns']]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(labels, vec_keys), nrows))

Example #6

0

Show file

File: pyunit_effective_parameters_isofor.py Project: zoudongyang/h2o-3

def test_isolation_forrest_effective_parameters():
    """
    Compare an Isolation Forest trained with default (AUTO) stopping_metric and
    categorical_encoding against one trained with explicit values.

    First pass: the AUTO model must report the same actual values and the same
    training mean_score as the explicit model. Second pass: with the
    `sys.ai.h2o.algos.evaluate_auto_model_parameters` property set to "false",
    the actual values must stay 'AUTO' while mean_score still matches. The
    property is set back to "true" when the second pass ends.
    """
    frame = h2o.import_file(
        pyunit_utils.locate("smalldata/anomaly/ecg_discord_train.csv"))

    auto_params = dict(ntrees=7,
                       seed=12,
                       sample_size=5,
                       stopping_rounds=3,
                       score_each_iteration=True)
    explicit_params = dict(auto_params,
                           stopping_metric='anomaly_score',
                           categorical_encoding="Enum")

    def fit(params):
        # Build and train one estimator on the shared training frame
        model = H2OIsolationForestEstimator(**params)
        model.train(training_frame=frame)
        return model

    def mean_score(model):
        metrics = model._model_json['output']['training_metrics']
        return metrics._metric_json['mean_score']

    auto_model = fit(auto_params)
    explicit_model = fit(explicit_params)

    assert auto_model.parms['stopping_metric']['input_value'] == 'AUTO'
    assert (auto_model.parms['stopping_metric']['actual_value'] ==
            explicit_model.parms['stopping_metric']['actual_value'])
    assert mean_score(auto_model) == mean_score(explicit_model)
    assert auto_model.parms['categorical_encoding']['input_value'] == 'AUTO'
    assert (auto_model.parms['categorical_encoding']['actual_value'] ==
            explicit_model.parms['categorical_encoding']['actual_value'])

    set_property = "(setproperty \"{}\" \"{}\")"
    flag = "sys.ai.h2o.algos.evaluate_auto_model_parameters"
    try:
        h2o.rapids(set_property.format(flag, "false"))
        auto_model = fit(auto_params)
        explicit_model = fit(explicit_params)

        assert auto_model.parms['stopping_metric']['input_value'] == 'AUTO'
        assert auto_model.parms['stopping_metric']['actual_value'] == 'AUTO'
        assert mean_score(auto_model) == mean_score(explicit_model)
        assert auto_model.parms['categorical_encoding']['input_value'] == 'AUTO'
        assert auto_model.parms['categorical_encoding']['actual_value'] == 'AUTO'
    finally:
        h2o.rapids(set_property.format(flag, "true"))

Example #7

0

Show file

File: frame.py Project: Vishnu24/h2o-3

 def frame_id(self, value):
   """
   Give this frame the id `value`.

   When the frame has no pending `_ast`, the change is delegated to
   h2o.assign. Otherwise the local `_id` is updated and a Rapids `rename`
   from the previous id to the new one is issued.

   :param value: The new frame id
   """
   previous_id = self.frame_id
   if self._ast is None:
     h2o.assign(self,value)
     return
   self._id = value
   h2o.rapids("(rename \"{}\" \"{}\")".format(previous_id, value))

Example #8

0

Show file

def pubdev_4863():
    """
    Check that a Rapids expression referencing a name that starts with digits
    is rejected with a "Name lookup ... failed" error.
    """
    bad_expr = "(tmp= digi_temp (cols_py 123STARTSWITHDIGITS 'a'))"
    expected_fragment = 'Error: Name lookup of \'123STARTSWITHDIGITS\' failed'

    try:
        h2o.rapids(bad_expr)
        # Reaching this line means the server accepted the expression
        assert False
    except H2OResponseError as error:
        print(error)
        message = str(error)
        assert expected_fragment in message

Example #9

0

Show file

File: pyunit_xgboost_reweight_tree.py Project: stjordanis/h2o-3

def xgboost_reweight_tree():
    """
    Check how XGBoost contributions and the exported MOJO react to updating
    the tree weights.

    A constant weight on every row must leave contributions (approximately)
    unchanged; re-weighting on a 10-row subset is expected to change the
    BiasTerm minimum, the printed MOJO, and the root weight of the first tree.
    """
    weights_col = "weights"
    update_weights = '(tree.update.weights {} {} "{}")'

    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    for categorical in ("RACE", "CAPSULE"):
        frame[categorical] = frame[categorical].asfactor()

    predictors = ["AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]
    response = 'CAPSULE'

    model = H2OXGBoostEstimator()
    model.train(x=predictors, y=response, training_frame=frame)

    # 0. Keep a printed copy of the MOJO before any weights are touched
    original_mojo_path = model.download_mojo()
    original_mojo = h2o.print_mojo(original_mojo_path)

    # 1. Baseline contributions
    baseline = model.predict_contributions(frame)
    expected_columns = [
        u'RACE.0', u'RACE.1', u'RACE.2', u'RACE.missing(NA)', u'AGE', u'DPROS', u'DCAPS', u'PSA', u'VOL', u'GLEASON',
        u'BiasTerm'
    ]
    assert baseline.col_names == expected_columns

    # 2. The same weight on every row => contributions should stay the same
    weights_scale = 2
    frame[weights_col] = weights_scale
    h2o.rapids(update_weights.format(model.model_id, frame.frame_id, weights_col))
    rescaled = model.predict_contributions(frame)
    assert_frame_equal(rescaled.as_data_frame(), baseline.as_data_frame(), check_less_precise=3)

    # 3. Re-weight on the first 10 rows only => contributions are expected to change
    subset = frame.head(10)
    h2o.rapids(update_weights.format(model.model_id, subset.frame_id, weights_col))
    subset_contribs = model.predict_contributions(subset)
    subset_bias = subset_contribs["BiasTerm"].min()
    baseline_bias = baseline["BiasTerm"].min()
    assert subset_bias != baseline_bias

    # 4. Print the MOJO again now that the weights were updated
    updated_mojo_path = model.download_mojo()
    updated_mojo = h2o.print_mojo(updated_mojo_path)

    # Sanity check: the update must be visible in the MOJO
    assert original_mojo != updated_mojo

    # Root weight of the first tree, before and after the update
    init_f = 1 / (1 + math.exp(0))
    hess_coef = init_f * (1 - init_f)
    original_root = json.loads(original_mojo)["trees"][0]["root"]
    assert original_root["weight"] == frame.nrow * hess_coef

    updated_root = json.loads(updated_mojo)["trees"][0]["root"]
    assert updated_root["weight"] == subset.nrow * hess_coef * weights_scale

Example #10

0

Show file

File: frame.py Project: OspreyX/h2o-dev

  def group_by(self,cols,a):
    """
    Group the rows of this frame and compute one set of aggregates per group.

    :param cols: The columns to group on.
    :param a: A dictionary of aggregates having the following shape:
      {"colname":[aggregate, column, naMethod]}
      e.g.: {"bikes":["count", 0, "all"]}
      `column` may be given as a column name (resolved through _find_idx) or as an index.

      The naMethod is one of "all", "ignore", or "rm", which specifies how to handle
      NAs that appear in columns that are being aggregated.

      "all" - include NAs
      "rm"  - exclude NAs
      "ignore" - ignore NAs in aggregates, but count them (e.g. in denominators for mean, var, sd, etc.)
    :return: The group by frame.
    :raises ValueError: if this frame has no vecs (it was removed)
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    group_idxs = [str(self._find_idx(name)) for name in cols]
    rapids_series = "(llist #" + " #".join(group_idxs) + ")"
    # Work on a copy so that the caller's dictionary is never modified
    specs = copy.deepcopy(a)
    src_key = self.send_frame()
    out_key = H2OFrame.py_tmp_key()

    # Rewrite the column of every aggregate as a '#index' token and render it
    agg_terms = []
    for out_name in specs:
      spec = specs[out_name]
      column = spec[1]
      if isinstance(column,str):
        column = self._find_idx(column)
      spec[1] = '#'+str(column)
      agg_terms.append("\"{1}\" {2} \"{3}\" \"{0}\"".format(str(out_name),*spec))
    aggs = "(agg {})".format(" ".join(agg_terms))

    h2o.rapids("(= !{} (GB %{} {} {}))".format(out_key,src_key,rapids_series,aggs))
    # The shipped copy of this frame is not needed after the group by
    h2o.delete(src_key)

    # Describe the result frame and peel its vecs out before deleting the frame key
    frame_json = h2o.frame(out_key)['frames'][0]
    nrows = frame_json['rows']
    vec_keys = frame_json['vec_ids']
    labels = [column['label'] for column in frame_json['columns']]
    vecs = H2OVec.new_vecs(zip(labels, vec_keys), nrows)
    h2o.delete(out_key)
    return H2OFrame(vecs=vecs)

Example #11

0

Show file

File: frame.py Project: vikasgoel2000/h2o-dev

 def quantile(self, prob=None):
   """
   Compute quantiles over this H2OFrame.

   :param prob: A list of probabilities; when empty or None the default
     [0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99] is used.
   :return: This frame itself when it is empty, otherwise an H2OFrame built
     from the frame the quantile call produces.
   :raises ValueError: if prob is not a list
   """
   if len(self) == 0:
     return self
   if not prob:
     prob = [0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99]
   if not isinstance(prob, list):
     raise ValueError("prob must be a list")
   probs = "(dlist #" + " #".join([str(p) for p in prob]) + ")"

   src_key = self.send_frame()
   out_key = H2OFrame.py_tmp_key()
   # NOTE(review): this expression is sent without its closing parentheses, exactly
   # as before -- confirm that the Rapids parser on the server side accepts that.
   h2o.rapids("(= !{} (quantile '{}' {}".format(out_key, src_key, probs))

   # Describe the result frame and wrap its remote vecs locally
   frame_json = h2o.frame(out_key)['frames'][0]
   nrows = frame_json['rows']
   vec_keys = frame_json['vec_keys']
   labels = [column['label'] for column in frame_json['columns']]
   return H2OFrame(vecs=H2OVec.new_vecs(zip(labels, vec_keys), nrows))

Example #12

0

Show file

File: frame.py Project: VenkatForkedRepos/h2o-dev

  def head(self, rows=10, cols=200, **kwargs):
    """
    Analogous to R's `head` call on a data.frame. Display a digestible chunk of the H2OFrame starting from the beginning.

    The chunk is printed as a table with a leading "Row ID" column that counts from 1.

    :param rows: Number of rows to display.
    :param cols: Number of columns to display.
    :param kwargs: Extra arguments passed from other methods; not read by this method.
    :return: None
    :raises ValueError: if this frame has no vecs (it was removed)
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    # Never ask for more rows/columns than the frame actually has
    nrows = min(self.nrow(), rows)
    ncols = min(self.ncol(), cols)
    colnames = self.names()[0:ncols]

    # cbind every vec of this frame under a temporary key; the Rapids response is
    # what gets displayed, so the temporary frame is removed right away
    fr = H2OFrame.py_tmp_key()
    cbind = "(= !" + fr + " (cbind %"
    cbind += " %".join([vec._expr.eager() for vec in self]) + "))"
    res = h2o.rapids(cbind)
    h2o.remove(fr)
    # First display column: the 1-based row ids
    head_rows = [range(1, nrows + 1, 1)]
    # One entry per displayed column, cut down to nrows values.
    # NOTE(review): the loop variable reuses the name of the `rows` parameter;
    # the parameter is not read again after this line.
    head_rows += [rows[0:nrows] for rows in res["head"][0:ncols]]
    # Transpose the per-column lists into per-row tuples for tabulate
    head = zip(*head_rows)
    print "First", str(nrows), "rows and first", str(ncols), "columns: "
    print tabulate.tabulate(head, headers=["Row ID"] + colnames)
    print

Example #13

0

Show file

File: frame.py Project: darraghdog/h2o-dev

 def send_frame(self):
   """
   Assemble this frame on the H2O side under a temporary key.

   Two Rapids calls are made: a `cbind` of every eagerly evaluated vec, then
   a `colnames=` that names columns 0 .. len(self)-1 after the local vecs.

   :return: The temporary key the frame was stored under
   """
   key = H2OFrame.py_tmp_key()

   # Bind the evaluated vecs together under the temporary key
   vec_keys = [vec._expr.eager() for vec in self._vecs]
   h2o.rapids("(= !" + key + " (cbind '" + "' '".join(vec_keys) + "'))")

   # Then name every column of the new frame
   last_col = str(len(self) - 1)
   names = ';'.join([vec._name for vec in self._vecs])
   h2o.rapids("(colnames= %" + key + " {(: #0 #" + last_col + ")} {" + names + "})")
   return key

Example #14

0

Show file

File: expr.py Project: StephaneFeniar/h2o-dev

  def eager(self):
    """
    This forces a top-level execution, as needed, and produces a top-level result
    locally. Frames are returned and truncated to the standard preview response
    provided by rapids - 100 rows X 200 cols.

    The module-level __CMD__ / __TMPS__ strings collect the Rapids text while
    _do_it() runs and are reset to None afterwards.

    :return: self._data. Already computed or local results are returned as they are.
      After a remote call it is either left untouched (when it is a unicode string,
      i.e. a key), or replaced by the response's 'head', a boolean, or its 'scalar'.
    """
    # Nothing to do when the result is already available
    if self.is_computed(): return self._data
    # Gather the computation path for remote work, or doit locally for local work
    global __CMD__, __TMPS__
    # Both collectors must be empty/None here, i.e. no other gathering is in progress
    assert not __CMD__ and not __TMPS__
    __CMD__ = ""
    __TMPS__ = ""  # Begin gathering rapids commands
    self._do_it()   # Symbolically execute the command
    # Take a local copy of what was gathered before resetting the collectors
    cmd = __CMD__
    tmps = __TMPS__  # Stop  gathering rapids commands
    __CMD__ = None
    __TMPS__ = None
    if self.is_local():  return self._data  # Local computation, all done

    # Remote computation - ship Rapids over wire, assigning key to result
    if tmps:
      # Wrap the command and the gathered temporaries in a single "(, ...)" expression
      cmd = "(, " + cmd + tmps + ")"
    j = h2o.rapids(cmd)
    if isinstance(self._data, unicode):
      pass  # Big Data Key is the result
    # Small data result pulled locally
    elif j['num_rows']:   # basically checks if num_rows is nonzero... sketchy.
      self._data = j['head']
    elif j['result'] in [u'TRUE', u'FALSE']:
      # Turn the textual TRUE/FALSE answer into a Python bool
      self._data = (j['result'] == u'TRUE')
    else:
      self._data = j['scalar']
    return self._data

Example #15

0

Show file

File: frame.py Project: VenkatForkedRepos/h2o-dev

  def tail(self, rows=10, cols=200, **kwargs):
    """
    Analogous to R's `tail` call on a data.frame. Display a digestible chunk of the H2OFrame starting from the end.

    The chunk is printed as a table with a leading "Row ID" column that ends at self.nrow().

    :param rows: Number of rows to display.
    :param cols: Number of columns to display.
    :param kwargs: Extra arguments passed from other methods; not read by this method.
    :return: None
    :raises ValueError: if this frame has no vecs (it was removed)
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    # Never ask for more rows/columns than the frame actually has
    nrows = min(self.nrow(), rows)
    ncols = min(self.ncol(), cols)
    colnames = self.names()[0:ncols]

    # One row-slice expression per displayed column, covering the last nrows rows
    exprs = [self[c][(self.nrow()-nrows):(self.nrow())] for c in range(ncols)]
    print "Last", str(nrows), "rows and first", str(ncols), "columns: "
    if nrows != 1:
      # cbind the sliced columns under a temporary key; the Rapids response is
      # what gets displayed, so the temporary frame is removed right away
      fr = H2OFrame.py_tmp_key()
      cbind = "(= !" + fr + " (cbind %"
      cbind += " %".join([expr.eager() for expr in exprs]) + "))"
      res = h2o.rapids(cbind)
      h2o.remove(fr)
      # First display column: the 1-based ids of the last nrows rows
      tail_rows = [range(self.nrow()-nrows+1, self.nrow() + 1, 1)]
      # NOTE(review): the loop variable reuses the name of the `rows` parameter;
      # the parameter is not read again after this line.
      tail_rows += [rows[0:nrows] for rows in res["head"][0:ncols]]
      # Transpose the per-column lists into per-row tuples for tabulate
      tail = zip(*tail_rows)
      print tabulate.tabulate(tail, headers=["Row ID"] + colnames)
    else:
      # Single row: the eagerly evaluated slices are printed directly, without a cbind
      print tabulate.tabulate([[self.nrow()] + [expr.eager() for expr in exprs]], headers=["Row ID"] + colnames)
    print

Example #16

0

Show file

File: expr.py Project: darraghdog/h2o-dev

    def eager(self):
        """
        Force a top-level execution, as needed, and produce the result locally.

        The module-level __CMD__ / __TMPS__ strings collect the Rapids text
        while _do_it() runs and are reset to None afterwards. Frames are
        returned truncated to the standard preview response provided by
        rapids - 100 rows X 200 cols.

        :return: self._data. After a remote call it is either left untouched
          (when it is a unicode string, i.e. a key), or replaced by the
          response's 'head', a boolean, or its 'scalar'.
        """
        global __CMD__, __TMPS__

        if self.is_computed():
            return self._data

        # Collect the Rapids text produced by a symbolic run of this expression
        assert not __CMD__ and not __TMPS__
        __CMD__, __TMPS__ = "", ""
        self._do_it()
        cmd, tmps = __CMD__, __TMPS__
        __CMD__ = __TMPS__ = None

        if self.is_local():
            # Local computation, all done
            return self._data

        # Remote computation - ship Rapids over the wire
        if tmps:
            cmd = "(, " + cmd + tmps + ")"
        reply = h2o.rapids(cmd)

        # A unicode _data is a big-data key and is kept; otherwise pull the small result
        if not isinstance(self._data, unicode):
            if reply['num_rows']:
                self._data = reply['head']
            elif reply['result'] in [u'TRUE', u'FALSE']:
                self._data = (reply['result'] == u'TRUE')
            else:
                self._data = reply['scalar']
        return self._data

Example #17

0

Show file

File: frame.py Project: vikasgoel2000/h2o-dev

 def var(self):
   """
   Compute the covariance matrix of this frame's columns on the H2O side.

   :return: The covariance matrix of the columns in this H2OFrame.
   """
   src_key = self.send_frame()
   out_key = H2OFrame.py_tmp_key()
   h2o.rapids("(= !{} (var %{} \"null\" %FALSE \"everything\"))".format(out_key,src_key))
   # The shipped copy of this frame is not needed after var
   h2o.remove(src_key)

   # Describe the result frame and wrap its remote vecs locally
   frame_json = h2o.frame(out_key)['frames'][0]
   nrows = frame_json['rows']
   vec_keys = frame_json['vec_keys']
   labels = [column['label'] for column in frame_json['columns']]
   return H2OFrame(vecs=H2OVec.new_vecs(zip(labels, vec_keys), nrows))

Example #18

0

Show file

File: frame.py Project: letsflykite/h2o-dev

 def send_frame(dataset):
     """
     Assemble `dataset` on the H2O side under a temporary key.

     Two Rapids calls are made: a `cbind` of every eagerly evaluated vec, then
     a `colnames=` that names columns 0 .. len(dataset)-1 after the vecs.

     :param dataset: An H2OFrame object
     :return: The temporary key the frame was stored under
     """
     key = H2OFrame.py_tmp_key()

     # Bind the evaluated vecs together under the temporary key
     vec_keys = [vec.get_expr().eager() for vec in dataset.vecs()]
     h2o.rapids("(= !" + key + " (cbind %" + " %".join(vec_keys) + "))")

     # Then name every column of the new frame
     last_col = str(len(dataset) - 1)
     names = ';'.join([vec.name() for vec in dataset.vecs()])
     h2o.rapids("(colnames= %" + key + " {(: #0 #" + last_col + ")} {" + names + "})")
     return key

Example #19

0

Show file

File: frame.py Project: VenkatForkedRepos/h2o-dev

  def group_by(self,cols,a):
    """
    Group the rows of this frame and compute one set of aggregates per group.

    :param cols: The columns to group on.
    :param a: A dictionary of aggregates having the following shape:
      {"colname":[aggregate, column, naMethod]}
      e.g.: {"bikes":["count", 0, "all"]}
      `column` may be given as a column name (resolved through _find_idx) or as an index.

      The naMethod is one of "all", "ignore", or "rm", which specifies how to handle
      NAs that appear in columns that are being aggregated.

      "all" - include NAs
      "rm"  - exclude NAs
      "ignore" - ignore NAs in aggregates, but count them (e.g. in denominators for mean, var, sd, etc.)
    :return: The group by frame.
    :raises ValueError: if this frame has no vecs (it was removed)
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    rapids_series = "(llist #" + " #".join(str(self._find_idx(name)) for name in cols) + ")"
    # Work on a copy so that the caller's dictionary is never modified
    agg_specs = copy.deepcopy(a)
    frame_key = self.send_frame()
    result_key = H2OFrame.py_tmp_key()

    def as_index_token(column):
      # Names are resolved to their index; everything else is taken as the index itself
      if isinstance(column,str):
        return '#'+str(self._find_idx(column))
      return '#'+str(column)

    rendered = []
    for result_name in agg_specs:
      agg_specs[result_name][1] = as_index_token(agg_specs[result_name][1])
      rendered += ["\"{1}\" {2} \"{3}\" \"{0}\"".format(str(result_name),*agg_specs[result_name])]
    aggs = "(agg {})".format(" ".join(rendered))

    h2o.rapids("(= !{} (GB %{} {} {}))".format(result_key,frame_key,rapids_series,aggs))
    # The shipped copy of this frame is not needed after the group by
    h2o.remove(frame_key)

    # Describe the result frame and wrap its remote vecs locally
    described = h2o.frame(result_key)['frames'][0]
    row_count = described['rows']
    vec_ids = described['vec_ids']
    labels = [entry['label'] for entry in described['columns']]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(labels, vec_ids), row_count))

Example #20

0

Show file

File: pyunit_h2orapids.py Project: ysj89/h2o-3

def h2orapids():
    """
    Python API test: h2o.rapids(expr)

    Runs the "(getTimeZone)" expression, prints the "string" entry of the
    response, and fails the test if anything in that sequence raises.
    """
    try:
        response = h2o.rapids("(getTimeZone)")
        time_zone = response["string"]
        print(str(time_zone))
    except Exception:
        assert False, "h2o.rapids() command is not working."

Example #21

0

Show file

File: frame.py Project: darraghdog/h2o-dev

  def groupby(self,cols,a):
    """
    Group the rows of this frame and compute one set of aggregates per group.

    :param cols: The columns to group on.
    :param a: A dictionary of aggregates having the following shape:
              {"colname":[aggregate, column, naMethod]}
              e.g.: {"bikes":["count", 0, "all"]}
              `column` may be given as a column name (resolved through
              _find_idx) or as an index.

              The naMethod is one of "all", "ignore", or "rm", which specifies how to handle
              NAs that appear in columns that are being aggregated.

              "all" - include NAs
              "rm"  - exclude NAs
              "ignore" - ignore NAs in aggregates, but count them (e.g. in denominators for mean, var, sd, etc.)
    :return: The group by frame.
    """
    rapids_series = "{" + ";".join([str(self._find_idx(name)) for name in cols]) + "}"
    # Work on a copy so that the caller's dictionary is never modified
    specs = copy.deepcopy(a)
    src_key = self.send_frame()
    out_key = H2OFrame.py_tmp_key()

    # The aggregate count is sent along with the rendered aggregates
    agg_count = len(specs)

    # Rewrite the column of every aggregate as a '#index' token and render it
    agg_terms = []
    for out_name in specs:
      spec = specs[out_name]
      column = spec[1]
      if isinstance(column,str):
        column = self._find_idx(column)
      spec[1] = '#'+str(column)
      agg_terms.append("\"{1}\" {2} \"{3}\" \"{0}\"".format(str(out_name),*spec))
    aggs = "(agg #{} {})".format(agg_count, " ".join(agg_terms))

    h2o.rapids("(= !{} (GB %{} {} {}))".format(out_key,src_key,rapids_series,aggs))

    # Describe the result frame and wrap its remote vecs locally
    frame_json = h2o.frame(out_key)['frames'][0]
    nrows = frame_json['rows']
    vec_keys = frame_json['vec_keys']
    labels = [column['label'] for column in frame_json['columns']]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(labels, vec_keys), nrows))

Example #22

0

Show file

def test_gam_effective_parameters():
    """
    Check the input/actual values reported for the AUTO parameters `solver`
    and `fold_assignment` of a binomial GAM.

    First pass: solver resolves to "IRLSM" and fold_assignment to None.
    Second pass: with the `sys.ai.h2o.algos.evaluate_auto_model_parameters`
    property set to "false", both actual values must stay 'AUTO'. The property
    is set back to "true" when the second pass ends.
    """
    frame = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    for column in ("C1", "C2", "C21"):
        frame[column] = frame[column].asfactor()

    def train_gam():
        # Same configuration for both passes; only the system property differs
        model = H2OGeneralizedAdditiveEstimator(
            family='binomial',
            gam_columns=["C11", "C12", "C13"],
            scale=[1, 1, 1],
            num_knots=[5, 6, 7],
            standardize=True,
            Lambda=[0],
            alpha=[0],
            max_iterations=3)
        model.train(x=["C1", "C2"], y="C21", training_frame=frame)
        return model

    gam = train_gam()
    assert gam.parms['solver']['input_value'] == 'AUTO'
    assert gam.parms['solver']['actual_value'] == "IRLSM"
    assert gam.parms['fold_assignment']['input_value'] == 'AUTO'
    assert gam.parms['fold_assignment']['actual_value'] is None

    set_property = "(setproperty \"{}\" \"{}\")"
    flag = "sys.ai.h2o.algos.evaluate_auto_model_parameters"
    try:
        h2o.rapids(set_property.format(flag, "false"))
        gam = train_gam()

        assert gam.parms['solver']['input_value'] == 'AUTO'
        assert gam.parms['solver']['actual_value'] == 'AUTO'
        assert gam.parms['fold_assignment']['input_value'] == 'AUTO'
        assert gam.parms['fold_assignment']['actual_value'] == 'AUTO'
    finally:
        h2o.rapids(set_property.format(flag, "true"))

Example #23

0

Show file

 def _eval_driver(self,top):
   """
   Execute this expression through Rapids and record the outcome in the cache.

   'scalar' and 'string' responses are stored as the cached data (a 'string'
   overrides a 'scalar' when both are present); a 'key' response records the
   row and column counts instead.

   :param top: Passed through to _do_it when building the Rapids text
   :return: self
   :raises NotImplementedError: if the response contains 'funstr'
   """
   reply = h2o.rapids(self._do_it(top))
   for field in ('scalar', 'string'):
     if field in reply:
       self._cache._data = reply[field]
   if 'funstr' in reply:
     raise NotImplementedError
   if 'key' in reply:
     self._cache.nrows = reply['num_rows']
     self._cache.ncols = reply['num_cols']
   return self

Example #24

0

Show file

File: frame.py Project: vikasgoel2000/h2o-dev

 def send_frame(self):
   """
   Assemble this frame on the H2O side under a temporary key.

   The `cbind` of every eagerly evaluated vec and the `colnames=` that names
   columns 0 .. len(self)-1 are folded into a single "(, ...)" Rapids call.

   :return: The temporary key the frame was stored under
   """
   key = H2OFrame.py_tmp_key()

   # Part 1: bind the evaluated vecs together under the temporary key
   vec_keys = [vec._expr.eager() for vec in self._vecs]
   cbind_part = "(= !" + key + " (cbind '" + "' '".join(vec_keys) + "')) "

   # Part 2: name every column of the new frame
   col_range = "(: #0 #" + str(len(self) - 1) + ") "
   name_list = "(slist \"" + '" "'.join([vec._name for vec in self._vecs]) + "\")"
   colnames_part = "(colnames= %" + key + " " + col_range + name_list

   # NOTE(review): the "(colnames=" and "(," groups are sent without closing
   # parentheses, exactly as before -- confirm that the Rapids parser accepts that.
   h2o.rapids("(, " + cbind_part + colnames_part)
   return key

Example #25

0

Show file

File: frame.py Project: darraghdog/h2o-dev

  def _simple_frames_bin_op(self, data, op, r=False):
    if len(self) == 0: return self
    if isinstance(data, (H2OVec, H2OFrame)): self._len_check(data)

    # Construct rapids expression
    tmp_key = H2OFrame.py_tmp_key()
    key1 = self.send_frame()
    key2 = None
    if isinstance(data, H2OFrame):
      key2 = data.send_frame()
      arg2 = "%" + str(key2)

    elif isinstance(data, H2OVec):
      tmp_frame = H2OFrame(vecs=[data])
      key2 = tmp_frame.send_frame()
      arg2 = "%" + str(key2)

    elif isinstance(data, Expr):
      raise NotImplementedError

    elif isinstance(data, (int, float)):
      arg2 = "#" + str(data)

    elif isinstance(data, str):
      arg2 = "\"" + data + "\""

    else: raise NotImplementedError
    expr = "(= !{} (".format(tmp_key) + op + " %{} {}))".format(key1,arg2) if not r else \
      "(= !{} (".format(tmp_key) + op + " {} %{}))".format(arg2,key1)

    h2o.rapids(expr)
    # Remove h2o temp frames
    h2o.remove(key1)
    if key2: h2o.remove(key2)
    # Construct H2OFrame result
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]
    rows = fr['rows']
    veckeys = fr['vec_keys']
    cols = fr['columns']
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))

Example #26

0

Show file

File: expr.py Project: ncatedra/h2o-3

 def _eval_driver(self,top):
   """
   Execute this expression through Rapids and record the outcome in the cache.

   A 'scalar' response is stored as a float (or a list of floats when the
   response holds a list); a 'string' response is stored as it is and
   overrides a 'scalar' when both are present; a 'key' response records the
   row and column counts instead.

   :param top: Passed through to _do_it when building the Rapids text
   :return: self
   :raises NotImplementedError: if the response contains 'funstr'
   """
   reply = h2o.rapids(self._do_it(top))
   if 'scalar' in reply:
     scalar = reply['scalar']
     if isinstance(scalar, list):
       self._cache._data = [float(item) for item in scalar]
     else:
       self._cache._data = float(scalar)
   if 'string' in reply:
     self._cache._data = reply['string']
   if 'funstr' in reply:
     raise NotImplementedError
   if 'key' in reply:
     self._cache.nrows = reply['num_rows']
     self._cache.ncols = reply['num_cols']
   return self

Example #27

0

Show file

File: frame.py Project: ChiahungTai/h2o-dev

    def ddply(self, cols, fun):
        """
        Run `fun` over the groups defined by the given columns.

        :param cols: Names of the columns that control the grouping
        :param fun: Function to run on each group; spliced into the Rapids call as text
        :return: New H2OFrame built from the frame the ddply call produces
        """
        # Resolving every name through _find_idx doubles as the "is it present" check
        group_idxs = [str(self._find_idx(name)) for name in cols]
        rapids_series = "{" + ";".join(group_idxs) + "}"

        # Ship this frame over, then run ddply against it under a fresh result key
        src_key = self.send_frame()
        out_key = H2OFrame.py_tmp_key()
        h2o.rapids("(= !{} (h2o.ddply %{} {} {}))".format(
            out_key, src_key, rapids_series, fun))
        # The shipped copy is only needed for the call itself
        h2o.remove(src_key)

        # Describe the result frame and wrap its remote vecs locally
        frame_json = h2o.frame(out_key)['frames'][0]
        nrows = frame_json['rows']
        vec_keys = frame_json['veckeys']
        labels = [column['label'] for column in frame_json['columns']]
        return H2OFrame(vecs=H2OVec.new_vecs(zip(labels, vec_keys), nrows))

Example #28

0

Show file

File: frame.py Project: OspreyX/h2o-dev

 def var(self):
   """
   Compute the covariance matrix of this frame's columns on the H2O side.

   :return: The covariance matrix of the columns in this H2OFrame.
   :raises ValueError: if this frame has no vecs (it was removed)
   """
   if self._vecs is None or self._vecs == []:
     raise ValueError("Frame Removed")
   src_key = self.send_frame()
   out_key = H2OFrame.py_tmp_key()
   h2o.rapids("(= !{} (var %{} () %FALSE \"everything\"))".format(out_key,src_key))
   # The shipped copy of this frame is not needed after var
   h2o.delete(src_key)

   # Describe the result frame and peel its vecs out before deleting the frame key
   frame_json = h2o.frame(out_key)['frames'][0]
   nrows = frame_json['rows']
   vec_keys = frame_json['vec_ids']
   labels = [column['label'] for column in frame_json['columns']]
   vecs = H2OVec.new_vecs(zip(labels, vec_keys), nrows)
   h2o.delete(out_key)
   return H2OFrame(vecs=vecs)

Example #29

0

Show file

File: pyunit_effective_parameters_glm.py Project: zoudongyang/h2o-3

def test_glm_effective_parameters():
    """
    Check what GLM reports for `fold_assignment` when it is left at AUTO.

    With default settings the actual value is expected to be 'Random'. After the
    `evaluate_auto_model_parameters` property is set to "false" the actual value
    is expected to stay 'AUTO'. The property is set back to "true" afterwards,
    whether or not the assertions pass.
    """
    property_name = "sys.ai.h2o.algos.evaluate_auto_model_parameters"
    set_property = "(setproperty \"{}\" \"{}\")"

    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    predictors = ["displacement","power","weight","acceleration","year"]
    response_col = "economy_20mpg"
    family = "binomial"
    cars[response_col] = cars[response_col].asfactor()
    nfolds = random.randint(3,10)

    def train_glm():
        # Fresh estimator each time so the second run sees the changed property
        model = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family)
        model.train(x=predictors, y=response_col, training_frame=cars)
        return model

    glm = train_glm()
    assert glm.parms['fold_assignment']['input_value'] == 'AUTO'
    assert glm.parms['fold_assignment']['actual_value'] == 'Random'

    try:
        h2o.rapids(set_property.format(property_name, "false"))
        glm = train_glm()
        assert glm.parms['fold_assignment']['input_value'] == 'AUTO'
        assert glm.parms['fold_assignment']['actual_value'] == 'AUTO'
    finally:
        h2o.rapids(set_property.format(property_name, "true"))

Example #30

0

Show file

File: frame.py Project: OspreyX/h2o-dev

  def send_frame(self):
    """
    Describe this frame to H2O in a single rapids call and return its key.

    The call binds the frame's vecs under a fresh temp key (cbind with the
    no-deep-copy flag) and then assigns the column names.

    :return: The temp key the frame was stored under
    :raises ValueError: If this frame has no vecs ("Frame Removed")
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    key = H2OFrame.py_tmp_key()
    vec_keys = [vec._expr.eager() for vec in self._vecs]
    last_col = str(len(self) - 1)
    labels = '" "'.join([vec._name for vec in self._vecs])
    # NOTE(review): the assembled text leaves the "(," and "(colnames=" forms
    # unclosed; it is reproduced exactly as before -- confirm the server side
    # tolerates this.
    pieces = [
      "(, ",  # fold into a single rapids call
      "(gput ", key, " (cbind %FALSE '", "' '".join(vec_keys), "')) ",  # %FALSE: no deep copy
      "(colnames= %", key, " (: #0 #", last_col, ") ",
      "(slist \"", labels, "\")",
    ]
    h2o.rapids("".join(pieces))
    return key

Example #31

0

Show file

File: frame.py Project: OspreyX/h2o-dev

  def merge(self, other, allLeft=False, allRite=False):
    """
    Merge two datasets based on common column names

    :param other: Other dataset to merge.  Must have at least one column in common with self, and all columns in common are used as the merge key.  If you want to use only a subset of the columns in common, rename the other columns so the columns are unique in the merged result.
    :param allLeft: If true, include all rows from the left/self frame
    :param allRite: If true, include all rows from the right/other frame
    :return: A new H2OFrame built from the vecs of the merged result
    :raises ValueError: If either frame has no vecs ("Frame Removed"), or if the two frames have no column name in common
    """
    # Validate both operands before anything is sent to H2O.  Checking `other`
    # here keeps an empty right-hand frame from reaching the name comparison,
    # where it used to fail with an unbound-variable error instead of a ValueError.
    for frame in (self, other):
      if frame._vecs is None or frame._vecs == []:
        raise ValueError("Frame Removed")
    if not any(v0._name==v1._name for v0 in self._vecs for v1 in other._vecs):
      raise ValueError("frames must have some columns in common to merge on")
    # Eagerly eval and send the cbind'd frame over
    lkey = self .send_frame()
    rkey = other.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    expr = "(= !{} (merge %{} %{} %{} %{}))".format(tmp_key,lkey,rkey,
                                                    "TRUE" if allLeft else "FALSE",
                                                    "TRUE" if allRite else "FALSE")
    # Remove h2o temp frame after merge
    expr2 = "(, "+expr+" (del %"+lkey+" #0) (del %"+rkey+" #0) )"

    h2o.rapids(expr2)       # merge in h2o
    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)  # Fetch the frame as JSON
    fr = j['frames'][0]     # Just the first (only) frame
    rows = fr['rows']       # Row count
    veckeys = fr['vec_ids']# List of h2o vec keys
    cols = fr['columns']    # List of columns
    colnames = [col['label'] for col in cols]
    vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows) # Peel the Vecs out of the returned Frame
    h2o.delete(tmp_key)
    return H2OFrame(vecs=vecs)

Example #32

0

Show file

File: expr.py Project: sudarshan4455/h2o-3

 def _eval_driver(self, top):
     """
     Execute this expression through h2o.rapids and cache what comes back.

     A 'scalar' reply is stored as a float (a list of floats for a list, None
     for None); a 'string' reply is stored as-is; a 'key' reply records the
     row and column counts on the cache.

     :param top: Passed through to self._do_it when building the rapids text.
     :return: self
     :raises NotImplementedError: If the reply contains 'funstr'.
     """
     res = h2o.rapids(self._do_it(top))
     if 'scalar' in res:
         scalar = res['scalar']
         if isinstance(scalar, list):
             self._cache._data = [float(x) for x in scalar]
         elif scalar is None:
             self._cache._data = None
         else:
             self._cache._data = float(scalar)
     if 'string' in res:
         self._cache._data = res['string']
     if 'funstr' in res:
         raise NotImplementedError
     if 'key' in res:
         self._cache.nrows = res['num_rows']
         self._cache.ncols = res['num_cols']
     return self

Example #33

0

Show file

File: expr.py Project: Vishnu24/h2o-3

 def _eval_driver(self,top):
   """
   Execute this expression through h2o.rapids and store the reply locally.

   :param top: Passed through to self._do_it when building the rapids text.
   :return: self
   :raises NotImplementedError: If the reply contains 'funstr'.
   """
   res = h2o.rapids(self._do_it(top))
   # 'string' is checked after 'scalar', so it wins when both are present
   for field in ('scalar', 'string'):
     if field in res:
       self._data = res[field]
   if 'funstr' in res:
     raise NotImplementedError
   if 'key' in res:
     self._set_rows(res['num_rows'])
     self._set_cols(res['num_cols'])
   # Now clear all internal DAG nodes, allowing GC to reclaim them
   self._clear_impl()
   # Enable this GC to trigger rapid R GC cycles, and rapid R clearing of
   # temps... to help debug GC issues.
   #gc.collect()
   return self

Example #34

0

Show file

File: frame.py Project: VenkatForkedRepos/h2o-dev

  def merge(self, other, allLeft=False, allRite=False):
    """
    Merge two datasets based on common column names

    :param other: Other dataset to merge.  Must have at least one column in common with self, and all columns in common are used as the merge key.  If you want to use only a subset of the columns in common, rename the other columns so the columns are unique in the merged result.
    :param allLeft: If true, include all rows from the left/self frame
    :param allRite: If true, include all rows from the right/other frame
    :return: A new H2OFrame built from the vecs of the merged result
    :raises ValueError: If either frame has no vecs ("Frame Removed"), or if the two frames have no column name in common
    """
    # Validate both operands before anything is sent to H2O.  Checking `other`
    # here keeps an empty right-hand frame from reaching the name comparison,
    # where it used to fail with an unbound-variable error instead of a ValueError.
    for frame in (self, other):
      if frame._vecs is None or frame._vecs == []:
        raise ValueError("Frame Removed")
    if not any(v0._name==v1._name for v0 in self._vecs for v1 in other._vecs):
      raise ValueError("frames must have some columns in common to merge on")
    # Eagerly eval and send the cbind'd frame over
    lkey = self .send_frame()
    rkey = other.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    expr = "(= !{} (merge %{} %{} %{} %{}))".format(tmp_key,lkey,rkey,
                                                    "TRUE" if allLeft else "FALSE",
                                                    "TRUE" if allRite else "FALSE")
    # Remove h2o temp frame after merge
    expr2 = "(, "+expr+" (del %"+lkey+" #0) (del %"+rkey+" #0) )"

    h2o.rapids(expr2)       # merge in h2o
    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)  # Fetch the frame as JSON
    fr = j['frames'][0]     # Just the first (only) frame
    rows = fr['rows']       # Row count
    veckeys = fr['vec_ids']# List of h2o vec keys
    cols = fr['columns']    # List of columns
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))

Example #35

0

Show file

File: frame.py Project: vikasgoel2000/h2o-dev

  def head(self, rows=10, cols=200, **kwargs):
    nrows = min(self.nrow(), rows)
    ncols = min(self.ncol(), cols)
    colnames = self.names()[0:ncols]

    fr = H2OFrame.py_tmp_key()
    cbind = "(= !" + fr + " (cbind %"
    cbind += " %".join([vec._expr.eager() for vec in self]) + "))"
    res = h2o.rapids(cbind)
    h2o.remove(fr)
    head_rows = [range(1, nrows + 1, 1)]
    head_rows += [rows[0:nrows] for rows in res["head"][0:ncols]]
    head = zip(*head_rows)
    print "First", str(nrows), "rows and first", str(ncols), "columns: "
    print tabulate.tabulate(head, headers=["Row ID"] + colnames)
    print

Example #36

0

Show file

    def eager(self):
        """
        Force a top-level execution, as needed, and produce a top-level result
        locally. Frames are returned and truncated to the standard preview
        response provided by rapids - 100 rows X 200 cols.

        :return: self._data -- returned unchanged when the expression is already
            computed or local; otherwise whatever the rapids reply populated
            (preview rows, a boolean, a string or a scalar).
        """
        if self.is_computed(): return self._data
        # Gather the computation path for remote work, or doit locally for local work
        global __CMD__, __TMPS__
        assert not __CMD__ and not __TMPS__
        __CMD__ = ""
        __TMPS__ = ""  # Begin gathering rapids commands
        dummy = self  # Force extra refcnt so we get a top-level assignment in do_it
        self._do_it()  # Symbolically execute the command
        cmd = __CMD__
        tmps = __TMPS__  # Stop  gathering rapids commands
        __CMD__ = None
        __TMPS__ = None
        if self.is_local(): return self._data  # Local computation, all done

        # Remote computation - ship Rapids over wire, assigning key to result
        if tmps:
            cmd = "(, " + cmd + tmps + ")"
        j = h2o.rapids(cmd)
        if j['result_type'] == 0:
            pass  # Big Data Key is the result
        # Small data result pulled locally
        elif j['num_rows']:  # basically checks if num_rows is nonzero... sketchy.
            self._data = j['head']
        elif j['result'] in [u'TRUE', u'FALSE']:
            self._data = (j['result'] == u'TRUE')
        elif j['result_type'] in [1, 2, 3, 4]:
            # One chain, so a string result is never overwritten by the scalar
            # fallback (the scalar is only used when there is no string).
            if isinstance(j['string'], str):
                self._data = j['string']
            elif isinstance(j['string'], unicode):
                self._data = j['string'].encode('utf-8')
            elif not hasattr(j['scalar'], '__len__'):
                self._data = j['scalar']

        # Result types 3 and 4 come with vec ids; remove those vecs once read
        if j['result_type'] in [3, 4]:
            for key in j['vec_ids']:
                h2o.remove(key['name'])

        return self._data

Example #37

0

Show file

File: pyunit_coxph_force_mojo.py Project: timgates42/h2o-3

def coxph_force_mojo_categorical_interaction():
    """
    Exercise MOJO export for a CoxPH model trained with interactions.

    Three steps: export is rejected by default; it is still rejected when the
    force-enable property is switched on after training; it succeeds for a model
    trained while the property is on. The property is always set back to "false"
    afterwards so a failing assertion cannot leave it enabled.
    """
    heart = h2o.import_file(pyunit_utils.locate("smalldata/coxph_test/heart.csv"))
    heart[["surgery"]] = heart[["surgery"]].asfactor()

    coxph = H2OCoxProportionalHazardsEstimator(
        start_column="start",
        stop_column="stop",
        interactions=["age", "surgery"]
    )
    coxph.train(x=["age", "surgery"], y="event", training_frame=heart)
    print(coxph)

    # MOJO will not be enabled if interactions were used
    try:
        coxph.download_mojo()
        assert False, "Expected an error to be thrown"
    except H2OValueError as ex:
        assert "Export to MOJO not supported" == str(ex.args[0])

    # Show that just force-enabling MOJO won't help without model retraining
    try:
        h2o.rapids("(setproperty \"{}\" \"{}\")".format("sys.ai.h2o.coxph.mojo.forceEnable", "true"))
        coxph.download_mojo()
        assert False, "Expected an error to be thrown"
    except H2OValueError as ex:
        assert "Export to MOJO not supported" == str(ex.args[0])
    finally:
        # Reset on every exit path, not only when the expected error was raised
        h2o.rapids("(setproperty \"{}\" \"{}\")".format("sys.ai.h2o.coxph.mojo.forceEnable", "false"))

    # The happy path: train model with mojo force-enabled and successfully download the MOJO
    try:
        h2o.rapids("(setproperty \"{}\" \"{}\")".format("sys.ai.h2o.coxph.mojo.forceEnable", "true"))
        coxph2 = H2OCoxProportionalHazardsEstimator(
            start_column="start",
            stop_column="stop",
            interactions=["age", "surgery"]
        )
        coxph2.train(x=["age", "surgery"], y="event", training_frame=heart)
        mojo = coxph2.download_mojo()
        assert mojo is not None
    finally:
        h2o.rapids("(setproperty \"{}\" \"{}\")".format("sys.ai.h2o.coxph.mojo.forceEnable", "false"))

Example #38

0

Show file

File: frame.py Project: vikasgoel2000/h2o-dev

  def tail(self, rows=10, cols=200, **kwargs):
    nrows = min(self.nrow(), rows)
    ncols = min(self.ncol(), cols)
    colnames = self.names()[0:ncols]

    exprs = [self[c][(self.nrow()-nrows):(self.nrow())] for c in range(ncols)]
    print "Last", str(nrows), "rows and first", str(ncols), "columns: "
    if nrows != 1:
      fr = H2OFrame.py_tmp_key()
      cbind = "(= !" + fr + " (cbind %"
      cbind += " %".join([expr.eager() for expr in exprs]) + "))"
      res = h2o.rapids(cbind)
      h2o.remove(fr)
      tail_rows = [range(self.nrow()-nrows+1, self.nrow() + 1, 1)]
      tail_rows += [rows[0:nrows] for rows in res["head"][0:ncols]]
      tail = zip(*tail_rows)
      print tabulate.tabulate(tail, headers=["Row ID"] + colnames)
    else:
      print tabulate.tabulate([[self.nrow()] + [expr.eager() for expr in exprs]], headers=["Row ID"] + colnames)
    print

Example #39

0

Show file

File: model_base.py Project: Ansonparkour/h2o-3

 def model_id(self, newid):
     """
     Store `newid` as this object's id and send a matching `rename` to H2O.

     :param newid: The new id; the previous value of self._id is the rename source.
     """
     previous = self._id
     self._id = newid
     rename_expr = '(rename "%s" "%s")' % (previous, newid)
     h2o.rapids(rename_expr)

Example #40

0

Show file

File: frame.py Project: OspreyX/h2o-dev

  def impute(self,column,method,combine_method,by,inplace):
    """
    Impute a column in this H2OFrame.

    :param column: The column to impute: a column name, an integer index, or an H2OVec (matched by name). If a list is given, only its first element is used.
    :param method: How to compute the imputation value. A list with more than one entry selects the default ("mode" for factor columns, "mean" otherwise); a one-element list is unwrapped.
    :param combine_method: For even samples and method="median", how to combine quantiles. A list with more than one entry selects "interpolate"; "lo"/"hi" are expanded to "low"/"high".
    :param by: Columns to group-by for computing imputation value per groups of columns (names, integer indices or H2OVecs), or None.
    :param inplace: Impute inplace?
    :return: self when inplace is true, otherwise a new H2OFrame built from the imputed vecs.
    :raises ValueError: If an H2OVec column is not found in this frame, if `by` is combined with method="median", if a factor column is given a method other than "ffill"/"bfill"/"mode", or if `by` holds an unsupported type.
    """
    # sanity check columns, get the column index
    col_id = -1

    if isinstance(column, list): column = column[0]  # only take the first one ever...

    if isinstance(column, (unicode,str)):
      col_id = self._find_idx(column)
    elif isinstance(column, int):
      col_id = column
    elif isinstance(column, H2OVec):
      # Match against the name of `column` itself; list.index raises ValueError
      # when no vec in this frame carries that name.
      try:
        col_id = [a._name==column._name for a in self].index(True)
      except ValueError:
        raise ValueError("No column found to impute.")

    # setup the defaults, "mean" for numeric, "mode" for enum
    if isinstance(method, list) and len(method) > 1:
      if self[col_id].isfactor(): method="mode"
      else:                       method="mean"
    elif isinstance(method, list):method=method[0]

    # choose "interpolate" by default for combine_method
    if isinstance(combine_method, list) and len(combine_method) > 1: combine_method="interpolate"
    if combine_method == "lo":                                       combine_method = "low"
    if combine_method == "hi":                                       combine_method = "high"

    # sanity check method
    if method=="median":
      # no by and median!
      if by is not None:
        raise ValueError("Unimplemented: No `by` and `median`. Please select a different method (e.g. `mean`).")

    # method cannot be median or mean for factor columns
    if self[col_id].isfactor() and method not in ["ffill", "bfill", "mode"]:
      raise ValueError("Column is categorical, method must not be mean or median.")


    # setup the group by columns
    gb_cols = "()"
    if by is not None:
      if not isinstance(by, list):          by = [by]  # just make it into a list...
      if isinstance(by[0], (unicode,str)):  by = [self._find_idx(name) for name in by]
      elif isinstance(by[0], int):          by = by
      elif isinstance(by[0], H2OVec):       by = [[a._name==v._name for a in self].index(True) for v in by]  # one index per group-by vec
      else:                                 raise ValueError("`by` is not a supported type")

    if by is not None:                      gb_cols = "(llist #"+" #".join([str(b) for b in by])+")"

    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()

    if inplace:
      # frame, column, method, combine_method, gb_cols, inplace
      # NOTE(review): this expression has no closing paren, unlike the
      # non-inplace one below; left as-is -- confirm the server tolerates it.
      expr = "(h2o.impute %{} #{} \"{}\" \"{}\" {} %TRUE".format(key, col_id, method, combine_method, gb_cols)
      h2o.rapids(expr)  # exec the thing
      h2o.delete(key)  # "soft" delete of the frame key, keeps vecs live
      return self
    else:
      expr = "(= !{} (h2o.impute %{} #{} \"{}\" \"{}\" {} %FALSE))".format(tmp_key,key,col_id,method,combine_method,gb_cols)
      h2o.rapids(expr)  # exec the thing
      h2o.delete(key)
      # Make backing H2OVecs for the remote h2o vecs
      j = h2o.frame(tmp_key)
      fr = j['frames'][0]       # Just the first (only) frame
      rows = fr['rows']         # Row count
      veckeys = fr['vec_ids']   # List of h2o vec keys
      cols = fr['columns']      # List of columns
      colnames = [col['label'] for col in cols]
      vecs = H2OVec.new_vecs(zip(colnames, veckeys), rows) # Peel the Vecs out of the returned Frame
      h2o.delete(tmp_key)       # soft delete the new Frame, keep the imputed Vecs alive
      return H2OFrame(vecs=vecs)

Example #41

0

Show file

File: grid_search.py Project: madmax983/h2o-3

 def grid_id(self, value):
   """
   Store `value` as this grid's id and send a matching `rename` to H2O.

   :param value: The new id; the current self.grid_id is the rename source.
   """
   previous = self.grid_id
   self._id = value
   rename_expr = "(rename \"{}\" \"{}\")".format(previous, value)
   h2o.rapids(rename_expr)

Example #42

0

Show file

 def timezone(self, tz):
     """
     Send a `setTimeZone` rapids expression for `tz`.

     :param tz: Timezone name; checked with assert_is_type to be a str.
     """
     assert_is_type(tz, str)
     set_tz_expr = '(setTimeZone "%s")' % tz
     h2o.rapids(set_tz_expr)

Example #43

0

Show file

File: cluster.py Project: Ansonparkour/h2o-3

 def timezone(self):
     """Current timezone of the H2O cluster, read from the `getTimeZone` rapids reply."""
     reply = h2o.rapids("(getTimeZone)")
     return reply["string"]

Example #44

0

Show file

File: cluster.py Project: Ansonparkour/h2o-3

 def timezone(self, tz):
     """
     Send a `setTimeZone` rapids expression for `tz`.

     :param tz: Timezone name; checked with assert_is_type to be a str.
     """
     assert_is_type(tz, str)
     set_tz_expr = '(setTimeZone "%s")' % tz
     h2o.rapids(set_tz_expr)

Example #45

0

Show file

File: expr.py Project: ncatedra/h2o-3

 def __del__(self):
   """
   Send an `rm` rapids expression for the cached id when this object is finalized.

   Nothing is sent when the cache has no id or when self._children is None.
   """
   cache_id = self._cache._id
   if cache_id is None or self._children is None:
     return
   h2o.rapids("(rm {})".format(cache_id))

Example #46

0

Show file

File: grid_search.py Project: tempbottle/h2o-3

 def grid_id(self, value):
     """
     Store `value` as this grid's id and send a matching `rename` to H2O.

     :param value: The new id; the current self.grid_id is the rename source.
     """
     previous = self.grid_id
     self._id = value
     rename_expr = '(rename "{}" "{}")'.format(previous, value)
     h2o.rapids(rename_expr)

Example #47

0

Show file

File: pyunit_h2orapids.py Project: StevenLOL/h2o-3

def h2orapids():
    """
    Python API test: h2o.rapids(expr)
    """
    rapidTime = h2o.rapids("(getTimeZone)")["string"]
    print(str(rapidTime))