def cbind(self, data):
    """
    Column-bind ``data`` to the end of this object and return the result.

    :param data: H2OFrame or H2OVec
    :return: new H2OFrame with data cbinded to the end
    :raises ValueError: if ``data`` is neither an H2OFrame nor an H2OVec
    """
    # Collect every column to bind, with this object first.
    if isinstance(data, H2OFrame):
        vecs = [self]
        vecs.extend(data._vecs)
    elif isinstance(data, H2OVec):
        vecs = [self, data]
    else:
        raise ValueError("data parameter must be H2OVec or H2OFrame")
    names = [v.name() for v in vecs]

    # Evaluate each column and cbind them into a fresh temp key.
    tmp_key = H2OFrame.py_tmp_key()
    operands = " %".join([v._expr.eager() for v in vecs])
    h2o.rapids("(= !" + tmp_key + " (cbind %" + operands + "))")

    # Wrap the frame that rapids created in local H2OVec objects.
    frame_json = h2o.frame(tmp_key)['frames'][0]
    rows = frame_json['rows']
    veckeys = frame_json['vec_ids']
    colnames = [col['label'] for col in frame_json['columns']]
    result = H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
    # Restore the caller-visible names rather than the server-side labels.
    result.setNames(names)
    return result
def gbm_reweight_tree():
    """
    Check that GBM contributions react as expected to tree reweighting.

    Uniformly scaling the weights must leave the predicted contributions
    unchanged; reweighting on a 10-row subset is expected to change the
    bias term.
    """
    update_weights = '(sharedtree.update.weights {} {} "{}")'

    prostate_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    for column in ("RACE", "CAPSULE"):
        prostate_frame[column] = prostate_frame[column].asfactor()

    predictors = ["AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]
    response = 'CAPSULE'

    gbm_model = H2OGradientBoostingEstimator()
    gbm_model.train(x=predictors, y=response, training_frame=prostate_frame)

    # 1. Get original contributions
    contribs_original = gbm_model.predict_contributions(prostate_frame)
    expected_columns = [
        u'AGE', u'RACE', u'DPROS', u'DCAPS', u'PSA', u'VOL', u'GLEASON',
        u'BiasTerm'
    ]
    assert contribs_original.col_names == expected_columns

    # 2. Scale weights => contributions should stay the same
    prostate_frame["weights"] = 2
    h2o.rapids(update_weights.format(
        gbm_model.model_id, prostate_frame.frame_id, "weights"))
    contribs_reweighted = gbm_model.predict_contributions(prostate_frame)
    reweighted_df = contribs_reweighted.as_data_frame()
    original_df = contribs_original.as_data_frame()
    assert_frame_equal(reweighted_df, original_df)

    # 3. Reweight based on small subset of the data => contributions are
    #    expected to change
    prostate_subset = prostate_frame.head(10)
    h2o.rapids(update_weights.format(
        gbm_model.model_id, prostate_subset.frame_id, "weights"))
    contribs_subset = gbm_model.predict_contributions(prostate_subset)
    subset_bias = contribs_subset["BiasTerm"].min()
    original_bias = contribs_original["BiasTerm"].min()
    assert subset_bias != original_bias
def ddply(self, cols, fun):
    """
    Apply ``fun`` to each group of rows defined by ``cols``.

    :param cols: Column names used to control grouping
    :param fun: Function to execute on each group. Right now limited to
        textual Rapids expression
    :return: New frame with 1 row per-group, of results from 'fun'
    :raises ValueError: if this frame no longer has any vecs
    """
    vecs = self._vecs
    if vecs is None or vecs == []:
        raise ValueError("Frame Removed")

    # Confirm all names present in dataset; collect column indices
    group_idxs = [str(self._find_idx(name)) for name in cols]
    rapids_series = "(llist #" + " #".join(group_idxs) + ")"

    # Eagerly eval and send the cbind'd frame over
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    h2o.rapids("(= !{} (h2o.ddply %{} {} {}))".format(
        tmp_key, key, rapids_series, fun))
    # The uploaded input frame is only needed for the ddply call itself.
    h2o.remove(key)

    # Make backing H2OVecs for the remote h2o vecs
    frame_json = h2o.frame(tmp_key)['frames'][0]  # the first (only) frame
    rows = frame_json['rows']
    veckeys = frame_json['vec_ids']
    colnames = [col['label'] for col in frame_json['columns']]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
def test_workaround_for_distribution():
    """
    Train AutoML with ``distribution``/``family`` passed via ``algo_parameters``
    and print the distribution each resulting model ended up with.

    The system property that enables all algo parameters is switched on for
    the duration of the test and is always switched back off afterwards.
    Any failure inside the test body propagates to the caller.
    """
    flag = "sys.ai.h2o.automl.algo_parameters.all.enabled"
    try:
        h2o.rapids("(setproperty \"{}\" \"{}\")".format(flag, "true"))
        ds = import_dataset('regression')
        aml = H2OAutoML(project_name="py_test",
                        algo_parameters=dict(
                            distribution='poisson',
                            family='poisson',
                        ),
                        exclude_algos=['StackedEnsemble'],
                        max_runtime_secs=60,
                        seed=1)
        aml.train(y=ds.target, training_frame=ds.train)
        model_names = [
            aml.leaderboard[i, 0] for i in range(0, (aml.leaderboard.nrows))
        ]
        for mn in model_names:
            m = h2o.get_model(mn)
            # Prefer 'distribution', fall back to 'family', else None.
            if 'distribution' in m.params:
                dist = m.params['distribution']
            elif 'family' in m.params:
                dist = m.params['family']
            else:
                dist = None
            print("{}: distribution = {}".format(mn, dist))
    finally:
        # Reset the property on success as well as on failure; a bare
        # ``except`` here would also hide every test failure.
        h2o.rapids("(setproperty \"{}\" \"{}\")".format(flag, "false"))
def quantile(self, prob=None, combine_method="interpolate"):
    """
    Compute quantiles over a given H2OFrame.

    :param prob: A list of probabilities, default is
        [0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99]. You may provide a list
        of any length.
    :param combine_method: For even samples, how to combine quantiles. Must be
        one of ["interpolate", "average", "low", "high"]
    :return: an H2OFrame containing the quantiles and probabilities; ``self``
        is returned unchanged when ``len(self)`` is 0.
    :raises ValueError: if the frame has no vecs, ``prob`` is not a list, or
        ``combine_method`` is not one of the accepted values
    """
    vecs = self._vecs
    if vecs is None or vecs == []:
        raise ValueError("Frame Removed")
    if len(self) == 0:
        return self

    if not prob:
        prob = [0.01, 0.1, 0.25, 0.333, 0.5, 0.667, 0.75, 0.9, 0.99]
    if not isinstance(prob, list):
        raise ValueError("prob must be a list")
    probs = "(dlist #" + " #".join(map(str, prob)) + ")"

    allowed_methods = ["interpolate", "average", "low", "high"]
    if combine_method not in allowed_methods:
        raise ValueError("combine_method must be one of: [" +
                         ",".join(allowed_methods) + "]")

    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    # NOTE(review): the expression is sent without its closing parentheses,
    # exactly as before - confirm the server tolerates this.
    h2o.rapids("(= !{} (quantile '{}' {} '{}'".format(
        tmp_key, key, probs, combine_method))
    # Remove h2o temp frame after quantile
    h2o.remove(key)

    # Make backing H2OVecs for the remote h2o vecs
    frame_json = h2o.frame(tmp_key)['frames'][0]  # the first (only) frame
    rows = frame_json['rows']
    veckeys = frame_json['vec_ids']
    colnames = [col['label'] for col in frame_json['columns']]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
def test_isolation_forrest_effective_parameters():
    """
    Compare the effective (actual) parameter values of an Isolation Forest
    trained with AUTO settings against one trained with explicit settings.

    The comparison is run twice: once as-is, and once with the
    ``evaluate_auto_model_parameters`` system property set to "false", in
    which case the actual values are expected to stay 'AUTO'. The property is
    set back to "true" afterwards.
    """
    property_cmd = '(setproperty "{}" "{}")'
    property_name = "sys.ai.h2o.algos.evaluate_auto_model_parameters"

    train2 = h2o.import_file(
        pyunit_utils.locate("smalldata/anomaly/ecg_discord_train.csv"))

    def train_pair():
        # Train the AUTO model first, then the explicitly configured one.
        auto_model = H2OIsolationForestEstimator(ntrees=7,
                                                 seed=12,
                                                 sample_size=5,
                                                 stopping_rounds=3,
                                                 score_each_iteration=True)
        auto_model.train(training_frame=train2)
        explicit_model = H2OIsolationForestEstimator(
            ntrees=7,
            seed=12,
            sample_size=5,
            stopping_rounds=3,
            stopping_metric='anomaly_score',
            categorical_encoding="Enum",
            score_each_iteration=True)
        explicit_model.train(training_frame=train2)
        return auto_model, explicit_model

    def mean_score(model):
        metrics = model._model_json['output']['training_metrics']
        return metrics._metric_json['mean_score']

    if1, if2 = train_pair()
    stopping_auto = if1.parms['stopping_metric']
    assert stopping_auto['input_value'] == 'AUTO'
    assert stopping_auto['actual_value'] == \
        if2.parms['stopping_metric']['actual_value']
    assert mean_score(if1) == mean_score(if2)
    encoding_auto = if1.parms['categorical_encoding']
    assert encoding_auto['input_value'] == 'AUTO'
    assert encoding_auto['actual_value'] == \
        if2.parms['categorical_encoding']['actual_value']

    try:
        h2o.rapids(property_cmd.format(property_name, "false"))
        if1, if2 = train_pair()
        stopping_auto = if1.parms['stopping_metric']
        assert stopping_auto['input_value'] == 'AUTO'
        assert stopping_auto['actual_value'] == 'AUTO'
        assert mean_score(if1) == mean_score(if2)
        encoding_auto = if1.parms['categorical_encoding']
        assert encoding_auto['input_value'] == 'AUTO'
        assert encoding_auto['actual_value'] == 'AUTO'
    finally:
        h2o.rapids(property_cmd.format(property_name, "true"))
def frame_id(self, value):
    """
    Give this frame the id ``value``.

    When the frame has no pending ``_ast`` the change is delegated to
    ``h2o.assign``; otherwise the local ``_id`` is updated and a rapids
    ``rename`` from the old id to the new one is issued.

    :param value: the new frame id
    """
    previous_id = self.frame_id
    if self._ast is None:
        h2o.assign(self, value)
        return
    self._id = value
    h2o.rapids("(rename \"{}\" \"{}\")".format(previous_id, value))
def pubdev_4863():
    """
    Check that a rapids expression referencing an identifier that starts with
    digits is rejected with a name-lookup error.
    """
    expected = 'Error: Name lookup of \'123STARTSWITHDIGITS\' failed'
    try:
        h2o.rapids("(tmp= digi_temp (cols_py 123STARTSWITHDIGITS 'a'))")
    except H2OResponseError as error:
        print(error)
        assert expected in str(error)
    else:
        # The call must not succeed.
        assert False
def xgboost_reweight_tree():
    """
    Check that XGBoost contributions and the exported MOJO react as expected
    to tree reweighting.

    Uniformly scaling the weights must leave the contributions (approximately)
    unchanged; reweighting on a 10-row subset is expected to change the bias
    term and the root weight stored in the MOJO.
    """
    update_weights = '(tree.update.weights {} {} "{}")'

    prostate_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    for column in ("RACE", "CAPSULE"):
        prostate_frame[column] = prostate_frame[column].asfactor()

    predictors = ["AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]
    response = 'CAPSULE'

    xgb_model = H2OXGBoostEstimator()
    xgb_model.train(x=predictors, y=response, training_frame=prostate_frame)

    # 0. Save original MOJO
    orig_mojo_path = xgb_model.download_mojo()
    orig_mojo_str = h2o.print_mojo(orig_mojo_path)

    # 1. Get original contributions
    contribs_original = xgb_model.predict_contributions(prostate_frame)
    expected_columns = [
        u'RACE.0', u'RACE.1', u'RACE.2', u'RACE.missing(NA)', u'AGE',
        u'DPROS', u'DCAPS', u'PSA', u'VOL', u'GLEASON', u'BiasTerm'
    ]
    assert contribs_original.col_names == expected_columns

    # 2. Scale weights => contributions should stay the same
    weights_scale = 2
    prostate_frame["weights"] = weights_scale
    h2o.rapids(update_weights.format(
        xgb_model.model_id, prostate_frame.frame_id, "weights"))
    contribs_reweighted = xgb_model.predict_contributions(prostate_frame)
    reweighted_df = contribs_reweighted.as_data_frame()
    original_df = contribs_original.as_data_frame()
    assert_frame_equal(reweighted_df, original_df, check_less_precise=3)

    # 3. Reweight based on small subset of the data => contributions are
    #    expected to change
    prostate_subset = prostate_frame.head(10)
    h2o.rapids(update_weights.format(
        xgb_model.model_id, prostate_subset.frame_id, "weights"))
    contribs_subset = xgb_model.predict_contributions(prostate_subset)
    subset_bias = contribs_subset["BiasTerm"].min()
    original_bias = contribs_original["BiasTerm"].min()
    assert subset_bias != original_bias

    # 4. Save modified mojo
    reweighted_mojo_path = xgb_model.download_mojo()
    reweighted_mojo_str = h2o.print_mojo(reweighted_mojo_path)

    # Sanity check
    assert orig_mojo_str != reweighted_mojo_str

    # Check first tree weight
    init_f = 1 / (1 + math.exp(0))
    hess_coef = init_f * (1 - init_f)
    orig_root = json.loads(orig_mojo_str)["trees"][0]["root"]
    assert orig_root["weight"] == prostate_frame.nrow * hess_coef
    reweighted_root = json.loads(reweighted_mojo_str)["trees"][0]["root"]
    assert reweighted_root["weight"] == \
        prostate_subset.nrow * hess_coef * weights_scale
def group_by(self, cols, a):
    """
    GroupBy

    :param cols: The columns to group on.
    :param a: A dictionary of aggregates having the following shape:
        {"colname":[aggregate, column, naMethod]}
        e.g.: {"bikes":["count", 0, "all"]}

        The naMethod is one of "all", "ignore", or "rm", which specifies how
        to handle NAs that appear in columns that are being aggregated.

        "all"    - include NAs
        "rm"     - exclude NAs
        "ignore" - ignore NAs in aggregates, but count them (e.g. in
                   denominators for mean, var, sd, etc.)
    :return: The group by frame.
    :raises ValueError: if this frame no longer has any vecs
    """
    vecs = self._vecs
    if vecs is None or vecs == []:
        raise ValueError("Frame Removed")

    group_idxs = [str(self._find_idx(name)) for name in cols]
    rapids_series = "(llist #" + " #".join(group_idxs) + ")"
    # Work on a copy so the caller's aggregate spec is not modified.
    aggregates = copy.deepcopy(a)
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()

    # transform cols in aggregates to their indices...
    agg_specs = []
    for out_name in aggregates:
        spec = aggregates[out_name]
        column = spec[1]
        if isinstance(column, str):
            column = self._find_idx(column)
        spec[1] = '#' + str(column)
        agg_specs.append(
            "\"{1}\" {2} \"{3}\" \"{0}\"".format(str(out_name), *spec))
    aggs = "(agg {})".format(" ".join(agg_specs))

    h2o.rapids("(= !{} (GB %{} {} {}))".format(
        tmp_key, key, rapids_series, aggs))  # group by
    # Remove h2o temp frame after groupby
    h2o.delete(key)

    # Make backing H2OVecs for the remote h2o vecs
    frame_json = h2o.frame(tmp_key)['frames'][0]  # the first (only) frame
    rows = frame_json['rows']
    veckeys = frame_json['vec_ids']
    colnames = [col['label'] for col in frame_json['columns']]
    # Peel the Vecs out of the returned Frame
    result_vecs = H2OVec.new_vecs(zip(colnames, veckeys), rows)
    h2o.delete(tmp_key)
    return H2OFrame(vecs=result_vecs)
def quantile(self, prob=None):
    """
    Compute quantiles over this frame.

    :param prob: A list of probabilities; defaults to
        [0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99] when empty or None.
    :return: an H2OFrame built from the result of the rapids ``quantile``
        call; ``self`` is returned unchanged when ``len(self)`` is 0.
    :raises ValueError: if ``prob`` is not a list
    """
    if len(self) == 0:
        return self
    if not prob:
        prob = [0.01, 0.1, 0.25, 0.333, 0.5, 0.667, 0.75, 0.9, 0.99]
    if not isinstance(prob, list):
        raise ValueError("prob must be a list")
    probs = "(dlist #" + " #".join([str(p) for p in prob]) + ")"

    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    # NOTE(review): the expression is sent without its closing parentheses,
    # exactly as before - confirm the server tolerates this.
    expr = "(= !{} (quantile '{}' {}".format(tmp_key, key, probs)
    h2o.rapids(expr)
    # Remove the uploaded temp frame once the quantiles are computed so it
    # does not linger on the cluster.
    h2o.remove(key)

    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]        # Just the first (only) frame
    rows = fr['rows']          # Row count
    veckeys = fr['vec_keys']   # List of h2o vec keys
    columns = fr['columns']    # List of columns
    colnames = [col['label'] for col in columns]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
def head(self, rows=10, cols=200, **kwargs):
    """
    Analogous to R's `head` call on a data.frame. Display a digestible chunk
    of the H2OFrame starting from the beginning.

    :param rows: Number of rows to display.
    :param cols: Number of columns to display.
    :param kwargs: Extra arguments passed from other methods.
    :return: None
    :raises ValueError: if this frame no longer has any vecs
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    nrows = min(self.nrow(), rows)
    ncols = min(self.ncol(), cols)
    colnames = self.names()[0:ncols]

    # cbind all columns into a temp frame; the rapids reply carries the
    # preview data, so the temp frame can be dropped right away.
    fr = H2OFrame.py_tmp_key()
    cbind = "(= !" + fr + " (cbind %"
    cbind += " %".join([vec._expr.eager() for vec in self]) + "))"
    res = h2o.rapids(cbind)
    h2o.remove(fr)

    # First column is a 1-based row id, followed by the data columns.
    # The loop variable is deliberately not called `rows` so the parameter
    # is not shadowed (and, on Python 2, not rebound).
    head_rows = [list(range(1, nrows + 1))]
    head_rows += [column[0:nrows] for column in res["head"][0:ncols]]
    table = list(zip(*head_rows))

    # Single-argument print(...) calls produce the same output under both
    # the Python 2 print statement and the Python 3 print function.
    print("First " + str(nrows) + " rows and first " + str(ncols) + " columns: ")
    print(tabulate.tabulate(table, headers=["Row ID"] + colnames))
    print("")
def send_frame(self):
    """
    Send a frame description to H2O, returns a key.

    Two rapids calls are issued: one cbinds the evaluated vecs into a new
    temp key, the other assigns the column names.

    :return: A key
    """
    key = H2OFrame.py_tmp_key()

    # Send over the frame
    quoted_vecs = "' '".join([v._expr.eager() for v in self._vecs])
    h2o.rapids("(= !" + key + " (cbind '" + quoted_vecs + "'))")

    # And frame columns
    last_col = str(len(self) - 1)
    names = ';'.join([v._name for v in self._vecs])
    h2o.rapids("(colnames= %" + key + " {(: #0 #" + last_col + ")} {" +
               names + "})")
    return key
def eager(self):
    """
    Force a top-level execution, as needed, and produce a top-level result
    locally. Frames are returned and truncated to the standard preview
    response provided by rapids - 100 rows X 200 cols.

    :return: ``self._data`` - a key pointing to the big data object, or the
        small result pulled back from the rapids reply
    """
    if self.is_computed():
        return self._data

    # Gather the computation path for remote work, or doit locally for local
    # work. The module-level buffers must be idle before we start.
    global __CMD__, __TMPS__
    assert not __CMD__ and not __TMPS__
    __CMD__, __TMPS__ = "", ""         # Begin gathering rapids commands
    self._do_it()                      # Symbolically execute the command
    command, temps = __CMD__, __TMPS__
    __CMD__ = __TMPS__ = None          # Stop gathering rapids commands

    if self.is_local():
        return self._data              # Local computation, all done

    # Remote computation - ship Rapids over wire, assigning key to result
    if temps:
        command = "(, " + command + temps + ")"
    reply = h2o.rapids(command)

    # A unicode _data means the Big Data Key is the result; anything else is
    # a small result that is pulled locally from the reply.
    if not isinstance(self._data, unicode):
        if reply['num_rows']:  # basically checks if num_rows is nonzero... sketchy.
            self._data = reply['head']
        elif reply['result'] in [u'TRUE', u'FALSE']:
            self._data = (reply['result'] == u'TRUE')
        else:
            self._data = reply['scalar']
    return self._data
def tail(self, rows=10, cols=200, **kwargs):
    """
    Analogous to R's `tail` call on a data.frame. Display a digestible chunk
    of the H2OFrame starting from the end.

    :param rows: Number of rows to display.
    :param cols: Number of columns to display.
    :param kwargs: Extra arguments passed from other methods.
    :return: None
    :raises ValueError: if this frame no longer has any vecs
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    total_rows = self.nrow()
    nrows = min(total_rows, rows)
    ncols = min(self.ncol(), cols)
    colnames = self.names()[0:ncols]
    first_row = total_rows - nrows
    # One sliced expression per displayed column, covering the last rows.
    exprs = [self[c][first_row:total_rows] for c in range(ncols)]

    # Single-argument print(...) calls produce the same output under both
    # the Python 2 print statement and the Python 3 print function.
    print("Last " + str(nrows) + " rows and first " + str(ncols) + " columns: ")
    if nrows != 1:
        # cbind the slices into a temp frame; the rapids reply carries the
        # preview data, so the temp frame can be dropped right away.
        fr = H2OFrame.py_tmp_key()
        cbind = "(= !" + fr + " (cbind %"
        cbind += " %".join([expr.eager() for expr in exprs]) + "))"
        res = h2o.rapids(cbind)
        h2o.remove(fr)
        # The loop variable is deliberately not called `rows` so the
        # parameter is not shadowed (and, on Python 2, not rebound).
        tail_rows = [list(range(first_row + 1, total_rows + 1))]
        tail_rows += [column[0:nrows] for column in res["head"][0:ncols]]
        table = list(zip(*tail_rows))
        print(tabulate.tabulate(table, headers=["Row ID"] + colnames))
    else:
        # A single row: each sliced expression is evaluated directly.
        single = [[total_rows] + [expr.eager() for expr in exprs]]
        print(tabulate.tabulate(single, headers=["Row ID"] + colnames))
    print("")
def eager(self):
    """
    Force a top-level execution, as needed, and produce a top-level result
    locally. Frames are returned and truncated to the standard preview
    response provided by rapids - 100 rows X 200 cols.

    :return: ``self._data`` - a key pointing to the big data object, or the
        small result pulled back from the rapids reply
    """
    if self.is_computed():
        return self._data

    # Gather the computation path for remote work, or doit locally for local
    # work. The module-level buffers must be idle before we start.
    global __CMD__, __TMPS__
    assert not __CMD__ and not __TMPS__
    __CMD__, __TMPS__ = "", ""         # Begin gathering rapids commands
    self._do_it()                      # Symbolically execute the command
    gathered_cmd = __CMD__
    gathered_tmps = __TMPS__
    __CMD__ = __TMPS__ = None          # Stop gathering rapids commands

    if self.is_local():
        return self._data              # Local computation, all done

    # Remote computation - ship Rapids over wire, assigning key to result
    if gathered_tmps:
        gathered_cmd = "(, " + gathered_cmd + gathered_tmps + ")"
    reply = h2o.rapids(gathered_cmd)

    if isinstance(self._data, unicode):
        return self._data              # Big Data Key is the result

    # Small data result pulled locally
    if reply['num_rows']:
        self._data = reply['head']
    elif reply['result'] in [u'TRUE', u'FALSE']:
        self._data = (reply['result'] == u'TRUE')
    else:
        self._data = reply['scalar']
    return self._data
def var(self):
    """
    Run the rapids ``var`` operation on this frame.

    :return: The covariance matrix of the columns in this H2OFrame.
    """
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    h2o.rapids("(= !{} (var %{} \"null\" %FALSE \"everything\"))".format(
        tmp_key, key))
    # Remove h2o temp frame after var
    h2o.remove(key)

    # Wrap the result frame in local H2OVec objects.
    frame_json = h2o.frame(tmp_key)['frames'][0]
    rows = frame_json['rows']
    veckeys = frame_json['vec_keys']
    colnames = [col['label'] for col in frame_json['columns']]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
def send_frame(dataset):
    """
    Send a frame description to H2O, returns a key.

    Two rapids calls are issued: one cbinds the evaluated vecs into a new
    temp key, the other assigns the column names.

    :param dataset: An H2OFrame object
    :return: A key
    """
    key = H2OFrame.py_tmp_key()

    # Send over the frame
    operands = " %".join([v.get_expr().eager() for v in dataset.vecs()])
    h2o.rapids("(= !" + key + " (cbind %" + operands + "))")

    # And frame columns
    last_col = str(len(dataset) - 1)
    names = ';'.join([v.name() for v in dataset.vecs()])
    h2o.rapids("(colnames= %" + key + " {(: #0 #" + last_col + ")} {" +
               names + "})")
    return key
def group_by(self, cols, a):
    """
    GroupBy

    :param cols: The columns to group on.
    :param a: A dictionary of aggregates having the following shape:
        {"colname":[aggregate, column, naMethod]}
        e.g.: {"bikes":["count", 0, "all"]}

        The naMethod is one of "all", "ignore", or "rm", which specifies how
        to handle NAs that appear in columns that are being aggregated.

        "all"    - include NAs
        "rm"     - exclude NAs
        "ignore" - ignore NAs in aggregates, but count them (e.g. in
                   denominators for mean, var, sd, etc.)
    :return: The group by frame.
    :raises ValueError: if this frame no longer has any vecs
    """
    vecs = self._vecs
    if vecs is None or vecs == []:
        raise ValueError("Frame Removed")

    group_idxs = [str(self._find_idx(name)) for name in cols]
    rapids_series = "(llist #" + " #".join(group_idxs) + ")"
    # Work on a copy so the caller's aggregate spec is not modified.
    aggregates = copy.deepcopy(a)
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()

    # transform cols in aggregates to their indices...
    agg_specs = []
    for out_name in aggregates:
        spec = aggregates[out_name]
        column = spec[1]
        if isinstance(column, str):
            column = self._find_idx(column)
        spec[1] = '#' + str(column)
        agg_specs.append(
            "\"{1}\" {2} \"{3}\" \"{0}\"".format(str(out_name), *spec))
    aggs = "(agg {})".format(" ".join(agg_specs))

    h2o.rapids("(= !{} (GB %{} {} {}))".format(
        tmp_key, key, rapids_series, aggs))  # group by
    # Remove h2o temp frame after groupby
    h2o.remove(key)

    # Make backing H2OVecs for the remote h2o vecs
    frame_json = h2o.frame(tmp_key)['frames'][0]  # the first (only) frame
    rows = frame_json['rows']
    veckeys = frame_json['vec_ids']
    colnames = [col['label'] for col in frame_json['columns']]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
def h2orapids():
    """
    Python API test: h2o.rapids(expr)

    Runs the ``(getTimeZone)`` rapids expression and prints the returned
    string.

    :raises AssertionError: if the call or the result lookup fails for any
        reason; the message includes the underlying error.
    """
    try:
        rapidTime = h2o.rapids("(getTimeZone)")["string"]
        print(str(rapidTime))
    except Exception as e:
        # Raise explicitly (an `assert` would be stripped under -O) and keep
        # the root cause visible in the failure message.
        raise AssertionError(
            "h2o.rapids() command is not working: {0!r}".format(e))
def groupby(self, cols, a):
    """
    GroupBy

    :param cols: The columns to group on.
    :param a: A dictionary of aggregates having the following shape:
        {"colname":[aggregate, column, naMethod]}
        e.g.: {"bikes":["count", 0, "all"]}

        The naMethod is one of "all", "ignore", or "rm", which specifies how
        to handle NAs that appear in columns that are being aggregated.

        "all"    - include NAs
        "rm"     - exclude NAs
        "ignore" - ignore NAs in aggregates, but count them (e.g. in
                   denominators for mean, var, sd, etc.)
    :return: The group by frame.
    """
    colnums = [str(self._find_idx(name)) for name in cols]
    rapids_series = "{" + ";".join(colnums) + "}"
    # Work on a copy so the caller's aggregate spec is not modified.
    aggregates = copy.deepcopy(a)
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    nAggs = len(aggregates)

    # transform cols in aggregates to their indices...
    aggs = []
    for k in aggregates:
        spec = aggregates[k]
        if isinstance(spec[1], str):
            spec[1] = '#' + str(self._find_idx(spec[1]))
        else:
            spec[1] = '#' + str(spec[1])
        aggs.append("\"{1}\" {2} \"{3}\" \"{0}\"".format(str(k), *spec))
    aggs = "(agg #{} {})".format(nAggs, " ".join(aggs))

    expr = "(= !{} (GB %{} {} {}))".format(tmp_key, key, rapids_series, aggs)
    h2o.rapids(expr)  # group by
    # Remove the uploaded temp frame once the group-by result exists so it
    # does not linger on the cluster.
    h2o.remove(key)

    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]        # Just the first (only) frame
    rows = fr['rows']          # Row count
    veckeys = fr['vec_keys']   # List of h2o vec keys
    columns = fr['columns']    # List of columns
    colnames = [col['label'] for col in columns]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
def test_gam_effective_parameters():
    """
    Check the effective (actual) values of AUTO parameters on a GAM model.

    The model is trained twice: once as-is, and once with the
    ``evaluate_auto_model_parameters`` system property set to "false", in
    which case the actual values are expected to stay 'AUTO'. The property is
    set back to "true" afterwards.
    """
    property_cmd = '(setproperty "{}" "{}")'
    property_name = "sys.ai.h2o.algos.evaluate_auto_model_parameters"

    h2o_data = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    for column in ("C1", "C2", "C21"):
        h2o_data[column] = h2o_data[column].asfactor()

    def train_gam():
        model = H2OGeneralizedAdditiveEstimator(
            family='binomial',
            gam_columns=["C11", "C12", "C13"],
            scale=[1, 1, 1],
            num_knots=[5, 6, 7],
            standardize=True,
            Lambda=[0],
            alpha=[0],
            max_iterations=3)
        model.train(x=["C1", "C2"], y="C21", training_frame=h2o_data)
        return model

    gam = train_gam()
    solver = gam.parms['solver']
    assert solver['input_value'] == 'AUTO'
    assert solver['actual_value'] == "IRLSM"
    fold_assignment = gam.parms['fold_assignment']
    assert fold_assignment['input_value'] == 'AUTO'
    assert fold_assignment['actual_value'] is None

    try:
        h2o.rapids(property_cmd.format(property_name, "false"))
        gam = train_gam()
        solver = gam.parms['solver']
        assert solver['input_value'] == 'AUTO'
        assert solver['actual_value'] == 'AUTO'
        fold_assignment = gam.parms['fold_assignment']
        assert fold_assignment['input_value'] == 'AUTO'
        assert fold_assignment['actual_value'] == 'AUTO'
    finally:
        h2o.rapids(property_cmd.format(property_name, "true"))
def _eval_driver(self, top):
    """
    Build the rapids text for this node, execute it and cache the reply.

    :param top: forwarded unchanged to ``self._do_it``
    :return: ``self``
    :raises NotImplementedError: if the reply contains a 'funstr' entry
    """
    reply = h2o.rapids(self._do_it(top))

    # A 'string' entry, when present, takes precedence over 'scalar'.
    for field in ('scalar', 'string'):
        if field in reply:
            self._cache._data = reply[field]
    if 'funstr' in reply:
        raise NotImplementedError
    if 'key' in reply:
        # Frame result: only its dimensions are cached.
        self._cache.nrows = reply['num_rows']
        self._cache.ncols = reply['num_cols']
    return self
def send_frame(self):
    """
    Send a frame description to H2O, returns a key.

    The cbind of the evaluated vecs and the column naming are folded into a
    single rapids call.

    :return: A key
    """
    key = H2OFrame.py_tmp_key()

    # Send over the frame
    quoted_vecs = "' '".join([v._expr.eager() for v in self._vecs])
    bind_cmd = "(= !" + key + " (cbind '" + quoted_vecs + "')) "

    # And frame columns
    last_col = str(len(self) - 1)
    names = '" "'.join([v._name for v in self._vecs])
    rename_cmd = ("(colnames= %" + key + " (: #0 #" + last_col + ") " +
                  "(slist \"" + names + "\")")

    # NOTE(review): the assembled text leaves the "(colnames=" and "(," forms
    # unclosed, exactly as before - confirm the server tolerates this.
    h2o.rapids("(, " + bind_cmd + rename_cmd)
    return key
def _simple_frames_bin_op(self, data, op, r=False): if len(self) == 0: return self if isinstance(data, (H2OVec, H2OFrame)): self._len_check(data) # Construct rapids expression tmp_key = H2OFrame.py_tmp_key() key1 = self.send_frame() key2 = None if isinstance(data, H2OFrame): key2 = data.send_frame() arg2 = "%" + str(key2) elif isinstance(data, H2OVec): tmp_frame = H2OFrame(vecs=[data]) key2 = tmp_frame.send_frame() arg2 = "%" + str(key2) elif isinstance(data, Expr): raise NotImplementedError elif isinstance(data, (int, float)): arg2 = "#" + str(data) elif isinstance(data, str): arg2 = "\"" + data + "\"" else: raise NotImplementedError expr = "(= !{} (".format(tmp_key) + op + " %{} {}))".format(key1,arg2) if not r else \ "(= !{} (".format(tmp_key) + op + " {} %{}))".format(arg2,key1) h2o.rapids(expr) # Remove h2o temp frames h2o.remove(key1) if key2: h2o.remove(key2) # Construct H2OFrame result j = h2o.frame(tmp_key) fr = j['frames'][0] rows = fr['rows'] veckeys = fr['vec_keys'] cols = fr['columns'] colnames = [col['label'] for col in cols] return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
def _eval_driver(self, top):
    """
    Build the rapids text for this node, execute it and cache the reply.

    Scalar results are converted to ``float`` (element-wise for a list). A
    null scalar is cached as ``None`` instead of failing in ``float()``.

    :param top: forwarded unchanged to ``self._do_it``
    :return: ``self``
    :raises NotImplementedError: if the reply contains a 'funstr' entry
    """
    exec_str = self._do_it(top)
    res = h2o.rapids(exec_str)
    if 'scalar' in res:
        scalar = res['scalar']
        if isinstance(scalar, list):
            self._cache._data = [float(x) for x in scalar]
        elif scalar is None:
            # float(None) raises TypeError; keep the missing value as None.
            self._cache._data = None
        else:
            self._cache._data = float(scalar)
    if 'string' in res:
        self._cache._data = res['string']
    if 'funstr' in res:
        raise NotImplementedError
    if 'key' in res:
        # Frame result: only its dimensions are cached.
        self._cache.nrows = res['num_rows']
        self._cache.ncols = res['num_cols']
    return self
def ddply(self, cols, fun):
    """
    Apply ``fun`` to each group of rows defined by ``cols``.

    :param cols: column names used to control grouping
    :param fun: textual Rapids expression executed on each group
    :return: a new H2OFrame built from the result of the rapids call
    """
    # Confirm all names present in dataset; collect column indices
    group_idxs = [str(self._find_idx(name)) for name in cols]
    rapids_series = "{" + ";".join(group_idxs) + "}"

    # Eagerly eval and send the cbind'd frame over
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    h2o.rapids("(= !{} (h2o.ddply %{} {} {}))".format(
        tmp_key, key, rapids_series, fun))  # ddply in h2o
    # Remove h2o temp frame after ddply
    h2o.remove(key)

    # Make backing H2OVecs for the remote h2o vecs
    frame_json = h2o.frame(tmp_key)['frames'][0]  # the first (only) frame
    rows = frame_json['rows']
    veckeys = frame_json['veckeys']
    colnames = [col['label'] for col in frame_json['columns']]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
def var(self):
    """
    Run the rapids ``var`` operation on this frame.

    :return: The covariance matrix of the columns in this H2OFrame.
    :raises ValueError: if this frame no longer has any vecs
    """
    vecs = self._vecs
    if vecs is None or vecs == []:
        raise ValueError("Frame Removed")

    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    h2o.rapids("(= !{} (var %{} () %FALSE \"everything\"))".format(
        tmp_key, key))
    # Remove h2o temp frame after var
    h2o.delete(key)

    frame_json = h2o.frame(tmp_key)['frames'][0]
    rows = frame_json['rows']
    veckeys = frame_json['vec_ids']
    colnames = [col['label'] for col in frame_json['columns']]
    # Peel the Vecs out of the returned Frame
    result_vecs = H2OVec.new_vecs(zip(colnames, veckeys), rows)
    h2o.delete(tmp_key)
    return H2OFrame(vecs=result_vecs)
def test_glm_effective_parameters():
    """
    Check the effective (actual) value of ``fold_assignment`` on a GLM model.

    The model is trained twice: once as-is, and once with the
    ``evaluate_auto_model_parameters`` system property set to "false", in
    which case the actual value is expected to stay 'AUTO'. The property is
    set back to "true" afterwards.
    """
    property_cmd = '(setproperty "{}" "{}")'
    property_name = "sys.ai.h2o.algos.evaluate_auto_model_parameters"

    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy_20mpg"
    family = "binomial"
    cars[response_col] = cars[response_col].asfactor()
    nfolds = random.randint(3, 10)

    def train_glm():
        model = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family)
        model.train(x=predictors, y=response_col, training_frame=cars)
        return model

    glm = train_glm()
    fold_assignment = glm.parms['fold_assignment']
    assert fold_assignment['input_value'] == 'AUTO'
    assert fold_assignment['actual_value'] == 'Random'

    try:
        h2o.rapids(property_cmd.format(property_name, "false"))
        glm = train_glm()
        fold_assignment = glm.parms['fold_assignment']
        assert fold_assignment['input_value'] == 'AUTO'
        assert fold_assignment['actual_value'] == 'AUTO'
    finally:
        h2o.rapids(property_cmd.format(property_name, "true"))
def send_frame(self):
    """
    Send a frame description to H2O, returns a key.

    The cbind of the evaluated vecs and the column naming are folded into a
    single rapids call.

    :return: A key
    :raises ValueError: if this frame no longer has any vecs
    """
    vecs = self._vecs
    if vecs is None or vecs == []:
        raise ValueError("Frame Removed")

    key = H2OFrame.py_tmp_key()

    # Send over the frame; the %FALSE flag means no deep copy!
    quoted_vecs = "' '".join([v._expr.eager() for v in self._vecs])
    bind_cmd = "(gput " + key + " (cbind %FALSE '" + quoted_vecs + "')) "

    # And frame columns
    last_col = str(len(self) - 1)
    names = '" "'.join([v._name for v in self._vecs])
    rename_cmd = ("(colnames= %" + key + " (: #0 #" + last_col + ") " +
                  "(slist \"" + names + "\")")

    # NOTE(review): the assembled text leaves the "(colnames=" and "(," forms
    # unclosed, exactly as before - confirm the server tolerates this.
    h2o.rapids("(, " + bind_cmd + rename_cmd)
    return key
def merge(self, other, allLeft=False, allRite=False):
    """
    Merge two datasets based on common column names

    :param other: Other dataset to merge. Must have at least one column in
        common with self, and all columns in common are used as the merge key.
        If you want to use only a subset of the columns in common, rename the
        other columns so the columns are unique in the merged result.
    :param allLeft: If true, include all rows from the left/self frame
    :param allRite: If true, include all rows from the right/other frame
    :return: Original self frame enhanced with merged columns and rows
    :raises ValueError: if either frame has no vecs, or the two frames share
        no column name
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    # Guard the other side as well: the nested-loop check used previously read
    # its inner loop variable after the loop, which fails with a NameError
    # when `other` has no vecs.
    if other._vecs is None or other._vecs == []:
        raise ValueError("Frame Removed")
    own_names = set(v._name for v in self._vecs)
    if not any(v._name in own_names for v in other._vecs):
        raise ValueError("frames must have some columns in common to merge on")

    # Eagerly eval and send the cbind'd frame over
    lkey = self.send_frame()
    rkey = other.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    expr = "(= !{} (merge %{} %{} %{} %{}))".format(
        tmp_key, lkey, rkey,
        "TRUE" if allLeft else "FALSE",
        "TRUE" if allRite else "FALSE")
    # Remove h2o temp frame after merge
    expr2 = "(, " + expr + " (del %" + lkey + " #0) (del %" + rkey + " #0) )"
    h2o.rapids(expr2)  # merge in h2o

    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)     # Fetch the frame as JSON
    fr = j['frames'][0]        # Just the first (only) frame
    rows = fr['rows']          # Row count
    veckeys = fr['vec_ids']    # List of h2o vec keys
    cols = fr['columns']       # List of columns
    colnames = [col['label'] for col in cols]
    # Peel the Vecs out of the returned Frame
    vecs = H2OVec.new_vecs(zip(colnames, veckeys), rows)
    h2o.delete(tmp_key)
    return H2OFrame(vecs=vecs)
def _eval_driver(self, top):
    """
    Build the rapids text for this node, execute it and cache the reply.

    Scalar results are converted to ``float`` (element-wise for a list); a
    ``None`` scalar is cached as ``None``.

    :param top: forwarded unchanged to ``self._do_it``
    :return: ``self``
    :raises NotImplementedError: if the reply contains a 'funstr' entry
    """
    reply = h2o.rapids(self._do_it(top))

    if 'scalar' in reply:
        scalar = reply['scalar']
        if isinstance(scalar, list):
            self._cache._data = [float(item) for item in scalar]
        elif scalar is None:
            self._cache._data = None
        else:
            self._cache._data = float(scalar)
    # A 'string' entry, when present, takes precedence over 'scalar'.
    if 'string' in reply:
        self._cache._data = reply['string']
    if 'funstr' in reply:
        raise NotImplementedError
    if 'key' in reply:
        # Frame result: only its dimensions are cached.
        self._cache.nrows = reply['num_rows']
        self._cache.ncols = reply['num_cols']
    return self
def _eval_driver(self, top):
    """
    Build the rapids text for this node, execute it and store the reply.

    :param top: forwarded unchanged to ``self._do_it``
    :return: ``self``
    :raises NotImplementedError: if the reply contains a 'funstr' entry
    """
    reply = h2o.rapids(self._do_it(top))

    # A 'string' entry, when present, takes precedence over 'scalar'.
    for field in ('scalar', 'string'):
        if field in reply:
            self._data = reply[field]
    if 'funstr' in reply:
        raise NotImplementedError
    if 'key' in reply:
        # Frame result: only its dimensions are recorded.
        self._set_rows(reply['num_rows'])
        self._set_cols(reply['num_cols'])

    # Now clear all internal DAG nodes, allowing GC to reclaim them
    self._clear_impl()
    # Enable this GC to trigger rapid R GC cycles, and rapid R clearing of
    # temps... to help debug GC issues.
    # gc.collect()
    return self
def merge(self, other, allLeft=False, allRite=False):
    """
    Merge two datasets based on common column names

    :param other: Other dataset to merge. Must have at least one column in
        common with self, and all columns in common are used as the merge key.
        If you want to use only a subset of the columns in common, rename the
        other columns so the columns are unique in the merged result.
    :param allLeft: If true, include all rows from the left/self frame
    :param allRite: If true, include all rows from the right/other frame
    :return: Original self frame enhanced with merged columns and rows
    :raises ValueError: if either frame has no vecs, or the two frames share
        no column name
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    # Guard the other side as well: the nested-loop check used previously read
    # its inner loop variable after the loop, which fails with a NameError
    # when `other` has no vecs.
    if other._vecs is None or other._vecs == []:
        raise ValueError("Frame Removed")
    own_names = set(v._name for v in self._vecs)
    if not any(v._name in own_names for v in other._vecs):
        raise ValueError("frames must have some columns in common to merge on")

    # Eagerly eval and send the cbind'd frame over
    lkey = self.send_frame()
    rkey = other.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    expr = "(= !{} (merge %{} %{} %{} %{}))".format(
        tmp_key, lkey, rkey,
        "TRUE" if allLeft else "FALSE",
        "TRUE" if allRite else "FALSE")
    # Remove h2o temp frame after merge
    expr2 = "(, " + expr + " (del %" + lkey + " #0) (del %" + rkey + " #0) )"
    h2o.rapids(expr2)  # merge in h2o

    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)     # Fetch the frame as JSON
    fr = j['frames'][0]        # Just the first (only) frame
    rows = fr['rows']          # Row count
    veckeys = fr['vec_ids']    # List of h2o vec keys
    cols = fr['columns']       # List of columns
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
def head(self, rows=10, cols=200, **kwargs):
    """
    Print the first rows of this frame as a table.

    :param rows: maximum number of rows to show (capped at ``self.nrow()``)
    :param cols: maximum number of columns to show (capped at ``self.ncol()``)
    :param kwargs: accepted but not used
    :return: None; the table is written to stdout
    """
    nrows = min(self.nrow(), rows)
    ncols = min(self.ncol(), cols)
    colnames = self.names()[0:ncols]

    # cbind every vec into a temporary frame; only the rapids response is
    # needed, so the temporary frame is removed right away.
    fr = H2OFrame.py_tmp_key()
    cbind = "(= !" + fr + " (cbind %"
    cbind += " %".join([vec._expr.eager() for vec in self]) + "))"
    res = h2o.rapids(cbind)
    h2o.remove(fr)

    # res["head"] is treated as one list per column: keep the first `ncols`
    # of them, trim each to `nrows`, prepend 1-based row ids, then transpose
    # into table rows.
    head_rows = [list(range(1, nrows + 1))]
    head_rows += [col[0:nrows] for col in res["head"][0:ncols]]
    table = list(zip(*head_rows))

    # Single-argument print(...) calls produce the same output whether print
    # is the Python 2 statement or the Python 3 function.
    print("First " + str(nrows) + " rows and first " + str(ncols) + " columns: ")
    print(tabulate.tabulate(table, headers=["Row ID"] + colnames))
    print("")
def eager(self):
    """
    Force a top-level execution, as needed, and produce a top-level result locally.

    Frames are returned and truncated to the standard preview response provided
    by rapids - 100 rows X 200 cols.

    An already-computed node returns ``self._data`` straight away.  Otherwise the
    Rapids command text is gathered into the module globals ``__CMD__`` and
    ``__TMPS__`` while ``self._do_it()`` runs; for non-local work that text is
    sent with ``h2o.rapids`` and small results are copied into ``self._data``.

    :return: ``self._data`` (per the original note: a key pointing to the big
      data object)
    """
    if self.is_computed(): return self._data
    # Gather the computation path for remote work, or doit locally for local work
    global __CMD__, __TMPS__
    # Both gather buffers must be empty/None on entry.
    # NOTE(review): presumably this guards against nested gathering - confirm.
    assert not __CMD__ and not __TMPS__
    __CMD__ = ""
    __TMPS__ = ""  # Begin gathering rapids commands
    dummy = self  # Force extra refcnt so we get a top-level assignment in do_it
    self._do_it()  # Symbolically execute the command
    cmd = __CMD__
    tmps = __TMPS__  # Stop gathering rapids commands
    __CMD__ = None
    __TMPS__ = None
    if self.is_local(): return self._data  # Local computation, all done
    # Remote computation - ship Rapids over wire, assigning key to result
    # When temps were gathered, wrap the command and the temps in one "(, ...)" form.
    if tmps: cmd = "(, " + cmd + tmps + ")"
    j = h2o.rapids(cmd)
    if j['result_type'] == 0:
        pass  # Big Data Key is the result
    # Small data result pulled locally
    elif j['num_rows']:  # basically checks if num_rows is nonzero... sketchy.
        self._data = j['head']
    elif j['result'] in [u'TRUE', u'FALSE']:
        self._data = (j['result'] == u'TRUE')
    elif j['result_type'] in [1, 2, 3, 4]:
        # NOTE(review): the `else` below pairs with the `unicode` test only, so
        # a plain `str` result also reaches the scalar branch and may be
        # overwritten by j['scalar'] - confirm whether `elif` was intended.
        if isinstance(j['string'], str): self._data = j['string']
        if isinstance(j['string'], unicode): self._data = j['string'].encode('utf-8')
        else:
            # Only take the scalar when it has no length.
            if not hasattr(j['scalar'], '__len__'): self._data = j['scalar']
    # For result types 3 and 4, remove every vec listed in the response.
    # NOTE(review): this cleanup is laid out at function level - confirm it was
    # not meant to sit inside the branch above.
    if j['result_type'] in [3, 4]:
        for key in j['vec_ids']:
            h2o.remove(key['name'])
    return self._data
def coxph_force_mojo_categorical_interaction():
    """
    Check MOJO export for a CoxPH model trained with interactions.

    Three scenarios are exercised on the heart dataset:

    1. a model trained with interactions refuses to export a MOJO;
    2. force-enabling MOJO afterwards does not help the already-trained model;
    3. a model trained while MOJO is force-enabled exports successfully.

    The force-enable property is always switched back to "false" in a
    ``finally`` clause, so a failing assertion cannot leave it enabled.
    """
    def set_force_mojo(flag):
        # Set the force-enable property through Rapids; `flag` is "true" or "false".
        h2o.rapids("(setproperty \"{}\" \"{}\")".format("sys.ai.h2o.coxph.mojo.forceEnable", flag))

    heart = h2o.import_file(pyunit_utils.locate("smalldata/coxph_test/heart.csv"))
    heart[["surgery"]] = heart[["surgery"]].asfactor()

    coxph = H2OCoxProportionalHazardsEstimator(
        start_column="start",
        stop_column="stop",
        interactions=["age", "surgery"]
    )
    coxph.train(x=["age", "surgery"], y="event", training_frame=heart)
    print(coxph)

    # MOJO will not be enabled if interactions were used
    try:
        coxph.download_mojo()
        assert False, "Expected an error to be thrown"
    except H2OValueError as ex:
        assert "Export to MOJO not supported" == str(ex.args[0])

    # Show that just force-enabling MOJO won't help without model retraining
    try:
        set_force_mojo("true")
        coxph.download_mojo()
        assert False, "Expected an error to be thrown"
    except H2OValueError as ex:
        assert "Export to MOJO not supported" == str(ex.args[0])
    finally:
        # Reset on every exit path, including an unexpected successful download.
        set_force_mojo("false")

    # The happy path: train model with mojo force-enabled and successfully download the MOJO
    try:
        set_force_mojo("true")
        coxph2 = H2OCoxProportionalHazardsEstimator(
            start_column="start",
            stop_column="stop",
            interactions=["age", "surgery"]
        )
        coxph2.train(x=["age", "surgery"], y="event", training_frame=heart)
        mojo = coxph2.download_mojo()
        assert mojo is not None
    finally:
        set_force_mojo("false")
def tail(self, rows=10, cols=200, **kwargs):
    """
    Print the last rows of this frame as a table.

    :param rows: maximum number of rows to show (capped at ``self.nrow()``)
    :param cols: maximum number of columns to show (capped at ``self.ncol()``)
    :param kwargs: accepted but not used
    :return: None; the table is written to stdout
    """
    total_rows = self.nrow()
    nrows = min(total_rows, rows)
    ncols = min(self.ncol(), cols)
    colnames = self.names()[0:ncols]
    # One sliced expression per displayed column, covering the last `nrows` rows.
    exprs = [self[c][(total_rows - nrows):total_rows] for c in range(ncols)]

    # Single-argument print(...) calls produce the same output whether print
    # is the Python 2 statement or the Python 3 function.
    print("Last " + str(nrows) + " rows and first " + str(ncols) + " columns: ")
    if nrows != 1:
        # cbind the slices into a temporary frame; only the rapids response is
        # needed, so the temporary frame is removed right away.
        fr = H2OFrame.py_tmp_key()
        cbind = "(= !" + fr + " (cbind %"
        cbind += " %".join([expr.eager() for expr in exprs]) + "))"
        res = h2o.rapids(cbind)
        h2o.remove(fr)
        # res["head"] is treated as one list per column; prepend 1-based row
        # ids and transpose into table rows.
        tail_rows = [list(range(total_rows - nrows + 1, total_rows + 1))]
        tail_rows += [col[0:nrows] for col in res["head"][0:ncols]]
        table = list(zip(*tail_rows))
        print(tabulate.tabulate(table, headers=["Row ID"] + colnames))
    else:
        # Single row: each eager() result is used directly as a cell value
        # rather than as a key to cbind.
        print(tabulate.tabulate([[total_rows] + [expr.eager() for expr in exprs]],
                                headers=["Row ID"] + colnames))
    print("")
def model_id(self, newid):
    """
    Give this model a new id.

    Stores ``newid`` in ``self._id`` and then issues a Rapids ``rename`` from
    the previous id to the new one.

    :param newid: the new model id
    """
    previous, self._id = self._id, newid
    rename_expr = '(rename "%s" "%s")' % (previous, newid)
    h2o.rapids(rename_expr)
def impute(self, column, method, combine_method, by, inplace):
    """
    Impute a column in this H2OFrame.

    :param column: The column to impute: a name, an integer index, an H2OVec, or a list of
      those (only the first list element is used).
    :param method: How to compute the imputation value.  A list with more than one entry
      selects the default: "mode" for factor columns, "mean" otherwise.
    :param combine_method: For even samples and method="median", how to combine quantiles.
      A list with more than one entry selects "interpolate"; "lo"/"hi" are accepted as
      aliases for "low"/"high".
    :param by: Columns to group-by for computing imputation value per groups of columns
      (names, integer indices or H2OVecs), or None.
    :param inplace: Impute inplace?
    :return: self when ``inplace`` is true, otherwise a new H2OFrame holding the imputed vecs.
    :raises ValueError: if an H2OVec column is not found in this frame, if ``by`` is combined
      with method "median", if a categorical column is given a method other than
      "ffill"/"bfill"/"mode", or if ``by`` holds an unsupported type.
    """
    def vec_index(vec):
        # Position of the column of this frame whose name matches `vec`;
        # list.index raises ValueError when there is no match.
        return [a._name == vec._name for a in self].index(True)

    # sanity check columns, get the column index
    # NOTE(review): an unsupported `column` type leaves col_id at -1 - confirm intended.
    col_id = -1
    if isinstance(column, list):
        column = column[0]  # only take the first one ever...
    if isinstance(column, (unicode, str)):
        col_id = self._find_idx(column)
    elif isinstance(column, int):
        col_id = column
    elif isinstance(column, H2OVec):
        # Match against `column` itself, and only translate the "not found"
        # ValueError so unrelated errors are not masked.
        try:
            col_id = vec_index(column)
        except ValueError:
            raise ValueError("No column found to impute.")

    # setup the defaults, "mean" for numeric, "mode" for enum
    if isinstance(method, list) and len(method) > 1:
        method = "mode" if self[col_id].isfactor() else "mean"
    elif isinstance(method, list):
        method = method[0]

    # choose "interpolate" by default for combine_method
    if isinstance(combine_method, list) and len(combine_method) > 1:
        combine_method = "interpolate"
    if combine_method == "lo":
        combine_method = "low"
    if combine_method == "hi":
        combine_method = "high"

    # sanity check method: no `by` together with median
    if method == "median" and by is not None:
        raise ValueError("Unimplemented: No `by` and `median`. Please select a different method (e.g. `mean`).")

    # method cannot be median or mean for factor columns
    if self[col_id].isfactor() and method not in ["ffill", "bfill", "mode"]:
        raise ValueError("Column is categorical, method must not be mean or median.")

    # setup the group by columns
    gb_cols = "()"
    if by is not None:
        if not isinstance(by, list):
            by = [by]  # just make it into a list...
        # The type of the first entry decides how the whole list is resolved.
        if isinstance(by[0], (unicode, str)):
            by = [self._find_idx(name) for name in by]
        elif isinstance(by[0], int):
            pass  # already column indices
        elif isinstance(by[0], H2OVec):
            by = [vec_index(v) for v in by]
        else:
            raise ValueError("`by` is not a supported type")
        gb_cols = "(llist #" + " #".join([str(b) for b in by]) + ")"

    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()

    if inplace:
        # frame, column, method, combine_method, gb_cols, inplace
        # The closing parenthesis keeps the expression balanced, like the
        # non-inplace form below.
        expr = "(h2o.impute %{} #{} \"{}\" \"{}\" {} %TRUE)".format(key, col_id, method, combine_method, gb_cols)
        h2o.rapids(expr)  # exec the thing
        h2o.delete(key)  # "soft" delete of the frame key, keeps vecs live
        return self

    expr = "(= !{} (h2o.impute %{} #{} \"{}\" \"{}\" {} %FALSE))".format(tmp_key, key, col_id, method, combine_method, gb_cols)
    h2o.rapids(expr)  # exec the thing
    h2o.delete(key)
    # Make backing H2OVecs for the remote h2o vecs
    fr = h2o.frame(tmp_key)['frames'][0]  # Just the first (only) frame
    rows = fr['rows']  # Row count
    veckeys = fr['vec_ids']  # List of h2o vec keys
    colnames = [col['label'] for col in fr['columns']]
    vecs = H2OVec.new_vecs(zip(colnames, veckeys), rows)  # Peel the Vecs out of the returned Frame
    h2o.delete(tmp_key)  # soft delete the new Frame, keep the imputed Vecs alive
    return H2OFrame(vecs=vecs)
def grid_id(self, value):
    """
    Give this grid a new id.

    Reads the current ``grid_id``, stores ``value`` in ``self._id`` and then
    issues a Rapids ``rename`` from the previous id to the new one.

    :param value: the new grid id
    """
    previous = self.grid_id  # captured before ``_id`` is reassigned
    self._id = value
    rename_expr = "(rename \"{}\" \"{}\")".format(previous, value)
    h2o.rapids(rename_expr)
def timezone(self, tz):
    """
    Send a Rapids ``setTimeZone`` command for the given zone.

    :param tz: the time zone; checked with ``assert_is_type(tz, str)`` first
    """
    assert_is_type(tz, str)
    command = '(setTimeZone "%s")' % tz
    h2o.rapids(command)
def timezone(self):
    """
    Current timezone of the H2O cluster.

    :return: the ``string`` field of the Rapids ``(getTimeZone)`` response
    """
    response = h2o.rapids("(getTimeZone)")
    return response["string"]
def __del__(self):
    """
    Issue a Rapids ``rm`` for the cached id when this object is finalized.

    Nothing is sent when the cache holds no id or when ``_children`` is None.
    """
    if self._cache._id is None:
        return
    if self._children is None:
        return
    h2o.rapids("(rm {})".format(self._cache._id))
def grid_id(self, value):
    """
    Rename this grid.

    The current ``grid_id`` is read first, ``self._id`` is then set to
    ``value``, and finally a Rapids ``rename`` from the old id to the new one
    is issued.

    :param value: the new grid id
    """
    old_id = self.grid_id  # must be read before ``_id`` changes
    self._id = value
    command = '(rename "{}" "{}")'.format(old_id, value)
    h2o.rapids(command)
def h2orapids():
    """
    Python API test: h2o.rapids(expr)

    Sends ``(getTimeZone)`` and prints the ``string`` field of the response.
    """
    response = h2o.rapids("(getTimeZone)")
    zone = response["string"]
    print(str(zone))