def ddply(self, cols, fun):
    """
    Run a function over each group of rows and gather the per-group results.

    :param cols: Column names used to control grouping
    :param fun: Function to execute on each group.  Right now limited to a textual Rapids expression
    :return: New frame with 1 row per-group, of results from 'fun'
    :raises ValueError: if this frame has no backing vecs ("Frame Removed")
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    # Confirm all names are present in the dataset; collect their column indices
    group_idxs = [str(self._find_idx(name)) for name in cols]
    rapids_series = "(llist #" + " #".join(group_idxs) + ")"
    # Eagerly evaluate and send the cbind'd frame over
    src_key = self.send_frame()
    dest_key = H2OFrame.py_tmp_key()
    h2o.rapids("(= !{} (h2o.ddply %{} {} {}))".format(dest_key, src_key, rapids_series, fun))
    # The shipped copy was only needed as the ddply input
    h2o.remove(src_key)
    # Make backing H2OVecs for the remote result vecs
    meta = h2o.frame(dest_key)['frames'][0]  # just the first (only) frame
    nrows = meta['rows']
    vec_ids = meta['vec_ids']
    labels = [column['label'] for column in meta['columns']]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(labels, vec_ids), nrows))
def predict(self, test_data):
    """
    Score a dataset with this model.

    :param test_data: Data to be predicted on.
    :return: A new H2OFrame filled with predictions.
    :raises ValueError: if test_data is falsy
    """
    if not test_data:
        raise ValueError("Must specify test data")
    # cbind the test_data vecs together under a temporary key
    tmp_frame_key = H2OFrame.send_frame(test_data)
    # Request the predictions; this job call is blocking
    endpoint = "Predictions/models/" + self._key + "/frames/" + tmp_frame_key
    response = H2OConnection.post_json(endpoint)
    pred_key = response["model_metrics"][0]["predictions"]["frame_id"]["name"]
    # Pull the prediction frame's metadata and wrap its vecs locally
    meta = h2o.frame(pred_key)["frames"][0]
    vec_ids = meta["vec_ids"]
    nrows = meta["rows"]
    labels = [column["label"] for column in meta["columns"]]
    vecs = H2OVec.new_vecs(zip(labels, vec_ids), nrows)
    # The cbound input frame is no longer needed
    h2o.remove(tmp_frame_key)
    return H2OFrame(vecs=vecs)
def tail(self, rows=10, cols=200, **kwargs):
    """
    Analogous to R's `tail` call on a data.frame. Display a digestible chunk of the H2OFrame
    starting from the end.

    :param rows: Number of rows to display.
    :param cols: Number of columns to display.
    :param kwargs: Extra arguments passed from other methods.
    :return: None
    :raises ValueError: if this frame has no backing vecs ("Frame Removed")
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    total = self.nrow()
    nrows = min(total, rows)
    ncols = min(self.ncol(), cols)
    colnames = self.names()[0:ncols]
    headers = ["Row ID"] + colnames
    # One lazy slice expression per displayed column, covering the last nrows rows
    exprs = [self[c][(total - nrows):total] for c in range(ncols)]
    print(" ".join(["Last", str(nrows), "rows and first", str(ncols), "columns: "]))
    if nrows == 1:
        # A single row: print the evaluated expressions directly
        print(tabulate.tabulate([[total] + [expr.eager() for expr in exprs]], headers=headers))
    else:
        tmp_key = H2OFrame.py_tmp_key()
        evaluated = [expr.eager() for expr in exprs]
        res = h2o.rapids("(= !" + tmp_key + " (cbind %" + " %".join(evaluated) + "))")
        h2o.remove(tmp_key)
        # First table column is the 1-based row id, then one column per vec
        table_cols = [range(total - nrows + 1, total + 1, 1)]
        table_cols.extend(col[0:nrows] for col in res["head"][0:ncols])
        print(tabulate.tabulate(zip(*table_cols), headers=headers))
    print("")
def glrm_iris():
    """Fit GLRM on iris under each transform with random rank/regularization, then impute the data."""
    print("Importing iris_wheader.csv data...")
    iris = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    iris.describe()

    for trans in ("NONE", "DEMEAN", "DESCALE", "STANDARDIZE"):
        # Draw order (rank, gamma_x, gamma_y) is kept so seeded runs stay reproducible
        rank = random.randint(1, 7)
        gx = random.uniform(0, 1)
        gy = random.uniform(0, 1)

        banner = "H2O GLRM with rank k = " + str(rank) + ", gamma_x = " + str(gx)
        banner = banner + ", gamma_y = " + str(gy) + ", transform = " + trans
        print(banner)
        model = h2o.glrm(x=iris, k=rank, loss="Quadratic", gamma_x=gx, gamma_y=gy, transform=trans)
        model.show()

        print("Impute original data from XY decomposition")
        imputed = model.predict(iris)
        imputed.describe()
        # Drop the X representation frame the model left on the cluster
        h2o.remove(model._model_json["output"]["representation_name"])
def test_load_glrm():
    """
    Save a GLRM model, remove it, reload it, and check that the reloaded model reproduces
    the original representation frame, predictions and archetypes.
    """
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()

    g_model = H2OGeneralizedLowRankEstimator(k=3)
    g_model.train(x=irisH2O.names, training_frame=irisH2O)
    yarch_old = g_model.archetypes()
    x_old = h2o.get_frame(g_model._model_json["output"]["representation_name"])
    predOld = g_model.predict(irisH2O)

    # NOTE(review): '__file__' is a string literal here, so the fallback path is resolved
    # against the current working directory rather than this file's location -- confirm intent.
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "../..", "results"))
    try:
        TMPDIR = pyunit_utils.locate("results")  # find directory path to results folder
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit are not swallowed.
        if not os.path.isdir(TMPDIR):
            os.makedirs(TMPDIR)
    h2o.save_model(g_model, path=TMPDIR, force=True)  # save model
    full_path_filename = os.path.join(TMPDIR, g_model._id)

    h2o.remove(g_model)
    model_reloaded = h2o.load_model(full_path_filename)
    pred = model_reloaded.predict(irisH2O)
    yarch = model_reloaded.archetypes()
    x = h2o.get_frame(model_reloaded._model_json["output"]["representation_name"])

    # assert difference between old and new are close, archetypes should be the same
    pyunit_utils.compare_frames_local(x, x_old, tol=1e-6)
    pyunit_utils.compare_frames_local(pred[0], predOld[0], tol=1)
    for k in range(3):
        pyunit_utils.equal_two_arrays(yarch_old[k], yarch[k], eps=1e-4, tolerance=1e-10)

    print("glrm model successfully loaded...")
def head(self, rows=10, cols=200, **kwargs):
    """
    Analogous to R's `head` call on a data.frame. Display a digestible chunk of the H2OFrame
    starting from the beginning.

    :param rows: Number of rows to display.
    :param cols: Number of columns to display.
    :param kwargs: Extra arguments passed from other methods.
    :return: None
    :raises ValueError: if this frame has no backing vecs ("Frame Removed")
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    nrows = min(self.nrow(), rows)
    ncols = min(self.ncol(), cols)
    colnames = self.names()[0:ncols]
    tmp_key = H2OFrame.py_tmp_key()
    # Every vec of the frame is evaluated and cbind'ed; only the preview is trimmed below
    vec_keys = [vec._expr.eager() for vec in self]
    res = h2o.rapids("(= !" + tmp_key + " (cbind %" + " %".join(vec_keys) + "))")
    h2o.remove(tmp_key)
    # First table column is the 1-based row id, then one column per displayed vec
    table_cols = [range(1, nrows + 1, 1)]
    table_cols.extend(col[0:nrows] for col in res["head"][0:ncols])
    table = zip(*table_cols)
    print(" ".join(["First", str(nrows), "rows and first", str(ncols), "columns: "]))
    print(tabulate.tabulate(table, headers=["Row ID"] + colnames))
    print("")
def describe(self):
    """
    Print an in-depth description of this H2OFrame.

    The description is a table of type, mins, mean, maxs, sigma, zero count and missing
    count for each H2OVec in this H2OFrame, followed by the frame's chunk summary.

    :return: None (print to stdout)
    :raises ValueError: if this frame has no backing vecs ("Frame Removed")
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    print(" ".join(["Rows:", str(len(self._vecs[0])), "Cols:", str(len(self))]))
    headers = [vec._name for vec in self._vecs]
    # (field, index) pairs handed to self._row, in display order
    row_specs = [
        ('type', None),
        ('mins', 0),
        ('mean', None),
        ('maxs', 0),
        ('sigma', None),
        ('zero_count', None),
        ('missing_count', None),
    ]
    table = [self._row(field, idx) for field, idx in row_specs]
    # The chunk summary is read from a temporary copy of the frame on the cluster
    tmp_key = H2OFrame.send_frame(self)
    chunk_summary = h2o.frame(tmp_key)["frames"][0]["chunk_summary"]
    h2o.remove(tmp_key)
    print(tabulate.tabulate(table, headers))
    print("")
    print(chunk_summary)
    print("")
def quantile(self, prob=None, combine_method="interpolate"):
    """
    Compute quantiles over a given H2OFrame.

    :param prob: A list of probabilities, default is [0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99].
                 You may provide a list of any length.
    :param combine_method: For even samples, how to combine quantiles.
                           Should be one of ["interpolate", "average", "low", "high"]
    :return: an H2OFrame containing the quantiles and probabilities
             (this frame itself when it has no columns).
    :raises ValueError: if the frame was removed, `prob` is not a list, or
                        `combine_method` is not one of the accepted values.
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    if len(self) == 0:
        return self
    if not prob:
        prob = [0.01, 0.1, 0.25, 0.333, 0.5, 0.667, 0.75, 0.9, 0.99]
    if not isinstance(prob, list):
        raise ValueError("prob must be a list")
    probs = "(dlist #" + " #".join([str(p) for p in prob]) + ")"
    methods = ["interpolate", "average", "low", "high"]
    if combine_method not in methods:
        raise ValueError("combine_method must be one of: [" + ",".join(methods) + "]")

    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    # Both opened s-expressions are closed here, matching the sibling Rapids calls
    # (ddply / group_by / var); the original left them unbalanced.
    expr = "(= !{} (quantile '{}' {} '{}'))".format(tmp_key, key, probs, combine_method)
    h2o.rapids(expr)
    # Remove h2o temp frame after the quantile call
    h2o.remove(key)
    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]        # Just the first (only) frame
    rows = fr['rows']          # Row count
    veckeys = fr['vec_ids']    # List of h2o vec keys
    cols = fr['columns']       # List of columns
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
def cumsumminprodmax():
    """
    Check cumsum/cummin/cumprod/cummax on a two-column frame whose columns hold
    0..9 and 9..0 respectively.
    """
    # TODO PUBDEV-1748
    foo = h2o.H2OFrame(python_obj=[[x, y] for x, y in zip(range(10), range(9, -1, -1))])
    foo.show()

    cumsum1 = foo[0].cumsum()
    cummin1 = foo[0].cummin()
    cumprod1 = foo[1:10, 0].cumprod()  # skip the 0 in row 0 so the product is 9!
    cummax1 = foo[0].cummax()

    cumsum2 = foo[1].cumsum()
    cummin2 = foo[1].cummin()
    cumprod2 = foo[0:9, 1].cumprod()   # skip the trailing 0 so the product is 9!
    cummax2 = foo[1].cummax()

    assert cumsum1[9, 0] == cumsum2[9, 0] == 45, "expected cumsums to be 45, but got {0} and {1}".format(
        cumsum1[9, 0], cumsum2[9, 0]
    )

    assert cummin1[9, 0] == cummin2[9, 0] == 0, "expected cummin to be 0, but got {0} and {1}".format(
        cummin1[9, 0], cummin2[9, 0]
    )

    # The failure message now names and reports the cummax values (it used to report cummin).
    assert cummax1[9, 0] == cummax2[9, 0] == 9, "expected cummax to be 9, but got {0} and {1}".format(
        cummax1[9, 0], cummax2[9, 0]
    )

    cumprod1.show()
    print(cumprod1.dim)
    assert cumprod1[8, 0] == cumprod2[8, 0] == 362880, "expected cumprod to be 362880, but got {0} and " "{1}".format(
        cumprod1[8, 0], cumprod2[8, 0]
    )

    h2o.remove(foo)
def model_performance(self, test_data=None, train=False, valid=False):
    """
    Generate model metrics for this model on test_data.

    :param test_data: Data set for which model metrics shall be computed against. Both train and
                      valid arguments are ignored if test_data is not None.
    :param train: Report the training metrics for the model. This is the default when neither
                  train nor valid is set.
    :param valid: Report the validation metrics for the model. If both train and valid are True,
                  the training metrics are returned.
    :return: The stored training/validation metrics when test_data is None, otherwise an object
             built by self._metrics_class from the metrics computed on test_data.
    :raises ValueError: if test_data is given but is not an H2OFrame.
    """
    if test_data is None:
        if not train and not valid:
            train = True  # default to train
        if train:
            return self._model_json["output"]["training_metrics"]
        if valid:
            return self._model_json["output"]["validation_metrics"]
    else:  # cases dealing with test_data not None
        if not isinstance(test_data, H2OFrame):
            # str() is required: concatenating a type object to a str raises TypeError
            raise ValueError("`test_data` must be of type H2OFrame.  Got: " + str(type(test_data)))
        fr_key = H2OFrame.send_frame(test_data)
        res = H2OConnection.post_json("ModelMetrics/models/" + self._key + "/frames/" + fr_key)
        h2o.remove(fr_key)

        # FIXME need to do the client-side filtering...  PUBDEV-874:   https://0xdata.atlassian.net/browse/PUBDEV-874
        raw_metrics = None
        for mm in res["model_metrics"]:
            if mm["frame"]["name"] == fr_key:
                raw_metrics = mm
                break
        # raw_metrics stays None when the response holds no entry for fr_key
        return self._metrics_class(raw_metrics, algo=self._model_json["algo"])
def hdfs_orc_parser():
    """
    Import the same airline data from HDFS as CSV and as ORC (with and without forced
    column types), print the parse times and compare the frame summaries.

    :raises EnvironmentError: if the hadoop namenode is not accessible.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError

    hdfs_name_node = pyunit_utils.hadoop_namenode()
    if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
        print("Your hive-exec version is too old. Orc parser test {0} is "
              "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_airline_05_orc.py"))
        return

    url_orc = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/orc_parser/air05_orc")
    url_csv = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/orc_parser/air05_csv")

    startcsv = time.time()
    multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
    endcsv = time.time()

    csv_type_dict = multi_file_csv.types
    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

    # Column names look like "C<n>"; map the zero-based position n-1 back to its name so the
    # CSV column types can be listed in column order.
    name_by_index = {int(str(name.split('C')[1])) - 1: name for name in list(csv_type_dict)}
    col_types = [csv_type_dict[name_by_index[ind]] for ind in range(len(name_by_index))]

    # First ORC import lets the parser pick the column types; only its timing is used
    startorc1 = time.time()
    multi_file_orc1 = h2o.import_file(url_orc)
    endorc1 = time.time()
    h2o.remove(multi_file_orc1)

    # Second ORC import forces the column types seen in the CSV frame
    startorc = time.time()
    multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
    endorc = time.time()

    multi_file_orc.summary()
    orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

    print("************** CSV parse time is {0}".format(endcsv-startcsv))
    print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1))
    print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc))
    # compare the CSV frame with the ORC frame read with forced column types
    pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
def test_in():
    """Check the `in` operator on a whole frame and on single columns of iris."""
    iris = h2o.import_file(tests.locate("smalldata/iris/iris.csv"))

    first_col_has_value = 5.1 in iris[0]
    assert first_col_has_value, "expected 5.1 to be in the first column, but it wasn't"

    frame_has_value = 1.7 in iris
    assert frame_has_value, "expected 1.7 to be in the dataset, but it wasn't"

    assert 99 not in iris, "didn't expect 99 to be in the dataset, but it was"

    species_found = "Iris-setosa" in iris[4]
    assert species_found, "expected Iris-setosa to be in the dataset, but it wasn't"

    h2o.remove(iris)
def _model_build(x, y, validation_x, validation_y, algo_url, kwargs):
    """
    Build a model on the cluster and wrap the result in the matching client-side model class.

    :param x: Training features; must be truthy.
    :param y: Response (its `_name` is sent as the response column), or a falsy value for none.
    :param validation_x: Optional validation features.
    :param validation_y: Optional validation response.
    :param algo_url: Algorithm name appended to the "ModelBuilders/" endpoint.
    :param kwargs: Extra builder parameters; entries whose value is None are not sent.
                   The frame keys and response column are added to this dict in place.
    :return: A model object chosen from the model category reported by the cluster.
    :raises ValueError: if x is missing.
    :raises NotImplementedError: if the model category has no client-side class.
    """
    # Basic sanity checking
    if not x:
        raise ValueError("Missing features")
    x = _check_frame(x, y, y)
    if validation_x:
        validation_x = _check_frame(validation_x, validation_y, y)

    # Send frame descriptions to H2O cluster
    train_key = x.send_frame()
    valid_key = None
    try:
        kwargs['training_frame'] = train_key
        if validation_x:
            valid_key = validation_x.send_frame()
            kwargs['validation_frame'] = valid_key

        if y:
            kwargs['response_column'] = y._name

        kwargs = dict([(k, kwargs[k]) for k in kwargs if kwargs[k] is not None])

        # launch the job and poll
        job = H2OJob(H2OConnection.post_json("ModelBuilders/" + algo_url, **kwargs),
                     job_type=(algo_url + " Model Build")).poll()
        model_json = H2OConnection.get_json("Models/" + job.dest_key)["models"][0]
        model_type = model_json["output"]["model_category"]
        if model_type == "Binomial":
            from model.binomial import H2OBinomialModel
            model = H2OBinomialModel(job.dest_key, model_json)
        elif model_type == "Clustering":
            from model.clustering import H2OClusteringModel
            model = H2OClusteringModel(job.dest_key, model_json)
        elif model_type == "Regression":
            from model.regression import H2ORegressionModel
            model = H2ORegressionModel(job.dest_key, model_json)
        elif model_type == "Multinomial":
            from model.multinomial import H2OMultinomialModel
            model = H2OMultinomialModel(job.dest_key, model_json)
        elif model_type == "AutoEncoder":
            from model.autoencoder import H2OAutoEncoderModel
            model = H2OAutoEncoderModel(job.dest_key, model_json)
        else:
            print(model_type)
            raise NotImplementedError
        return model
    finally:
        # Cleanup runs on failure too, so a failed or unsupported build no longer leaves
        # the temporary frames behind on the cluster.
        h2o.remove(train_key)
        if valid_key is not None:
            h2o.remove(valid_key)
def test_prod():
    """Check that H2OFrame.prod() agrees with numpy.prod on ten random values in [1, 10]."""
    data = [[random.uniform(1, 10)] for _ in range(10)]
    h2o_data = h2o.H2OFrame(list(zip(*data)))
    np_data = np.array(data)

    h2o_prod = h2o_data.prod()
    np_prod = np.prod(np_data)

    # The message used to say "quantile", a leftover from the test this was copied from.
    assert abs(h2o_prod - np_prod) < 1e-06, "check unsuccessful! h2o computed {0} and numpy computed {1}. expected " \
                                            "equal product values between h2o and numpy".format(h2o_prod, np_prod)
    h2o.remove(h2o_data)
def h2oremove():
    """
    Python API test: h2o.remove(x)

    Imports a frame, removes it, and checks that touching the removed frame raises an
    AttributeError.
    """
    # Setup stays outside the try block so an import failure is reported as such instead of
    # being funnelled into the AttributeError check below.
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    assert_is_type(training_data, H2OFrame)
    h2o.remove(training_data)
    try:
        training_data.nrow  # should raise since training_data has been deleted
    except Exception as e:
        assert_is_type(e, AttributeError)
        assert "object has no attribute" in e.args[0], "h2o.remove() command is not working."
    else:
        # Without this branch the test passed silently when the frame was still usable.
        raise AssertionError("h2o.remove() command is not working.")
def models_stress_test():
    """Train and remove many tiny XGBoost models in a row and print the total time."""
    data = h2o.import_file(pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    # The ulimit for stress tests should be set low enough (e.g. 100) that this test fails
    # if file descriptors leak.
    num_models = 1000
    start = timer()
    for _ in range(num_models):
        model = H2OXGBoostEstimator(ntrees=1, max_depth=2)
        model.train(x=["Origin", "Distance"], y="IsDepDelayed", training_frame=data)
        h2o.remove(model)
    elapsed = timer() - start
    print('Trained {} models in {} seconds.'.format(num_models, elapsed))
def model_performance(self, test_data):
    """
    Generate model metrics for this model on test_data.

    :param test_data: Data set for which model metrics shall be computed against.
    :return: An object built by self._metrics_class from the first metrics entry returned.
    :raises ValueError: if test_data is falsy or is not an H2OFrame.
    """
    if not test_data:
        raise ValueError("Missing`test_data`.")
    if not isinstance(test_data, H2OFrame):
        # str() is required: concatenating a type object to a str raises TypeError
        raise ValueError("`test_data` must be of type H2OFrame.  Got: " + str(type(test_data)))
    fr_key = H2OFrame.send_frame(test_data)
    res = H2OConnection.post_json("ModelMetrics/models/" + self._key + "/frames/" + fr_key)
    # The temporary copy of test_data is only needed for the metrics request
    h2o.remove(fr_key)
    raw_metrics = res["model_metrics"][0]
    return self._metrics_class(raw_metrics)
def pca_car():
    """Train PCA on car.arff.txt several times and assert every run stays under 60 seconds."""
    num_runs = 10
    run_time_c = []

    car = h2o.import_file(path=pyunit_utils.locate("smalldata/pca_test/car.arff.txt"))  # Nidhi: import may not work

    # multiple runs to get an idea of run time info
    for run_index in range(num_runs):
        carPCA = H2OPCA(k=car.ncols, transform="STANDARDIZE")
        carPCA.train(x=list(range(0, car.ncols)), training_frame=car)
        output = carPCA._model_json['output']
        run_time_c.append(output['end_time'] - output['start_time'])
        print("PCA model training time with car.arff.txt data in ms is {0}".format(run_time_c[run_index]))
        h2o.remove(carPCA)

    slowest = max(run_time_c)
    assert slowest < 60000, "PCA runs for car.arff.txt take too much time!"
def __del__(self):
    """
    Delete the remote data behind this object, either immediately or by appending a
    `del` to the Rapids temporaries currently being gathered.
    """
    global __CMD__, __TMPS__
    # Dead pending op or local data; nothing to delete
    if self.is_pending() or self.is_local():
        return
    assert self.is_remote(), "Data wasn't remote. Hrm..."
    if __CMD__ is None:
        # No Rapids command is being gathered: remove right away
        h2o.remove(self._data)
        return
    delete_cmd = " (del '" + self._data + "' #0)"
    if __TMPS__ is None:
        print(" ".join(["Lost deletes: ", delete_cmd]))
    else:
        __TMPS__ += delete_cmd
def glm_solvers():
    """Build a GLM on the cars data for every solver/family combination."""
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    # Response column per family; every family not listed here uses "cylinders"
    response_by_family = {"binomial": "economy_20mpg", "gaussian": "economy"}

    for solver in ["AUTO", "IRLSM", "L_BFGS", "COORDINATE_DESCENT_NAIVE", "COORDINATE_DESCENT"]:
        print("Solver = {0}".format(solver))
        for family in ["binomial", "gaussian", "poisson", "tweedie", "gamma"]:
            response_col = response_by_family.get(family, "cylinders")
            print("Family = {0}".format(family))
            # A fresh copy is imported per combination because the response column is converted
            training_data = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
            response = training_data[response_col]
            if family == 'binomial':
                training_data[response_col] = response.asfactor()
            else:
                training_data[response_col] = response.asnumeric()

            model = h2o.glm(x=training_data[predictors], y=training_data[response_col], family=family,
                            alpha=[0], Lambda=[1e-5], solver=solver)
            h2o.remove(training_data)
def glrm_prostate_miss():
    """
    Train GLRM on prostate_cat.csv with 10%..90% of the entries blanked out, validating
    against the full data, and check the non-missing entry counts reported in the metrics.
    """
    missing_ratios = np.arange(0.1, 1, 0.1).tolist()

    print("Importing prostate_cat.csv data and saving for validation...")
    prostate_full = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"), na_strings=["NA"]*8)
    prostate_full.describe()
    # Count the NAs already present so totobs is the number of observed entries
    totnas = 0
    for col_idx in range(prostate_full.ncol):
        totnas = totnas + prostate_full[col_idx].isna().sum()
    totobs = prostate_full.nrow * prostate_full.ncol - totnas

    n_ratios = len(missing_ratios)
    train_numerr = [0]*n_ratios
    valid_numerr = [0]*n_ratios
    train_caterr = [0]*n_ratios
    valid_caterr = [0]*n_ratios

    for i, ratio in enumerate(missing_ratios):
        print("Importing prostate_cat.csv and inserting {0}% missing entries".format(100*ratio))
        prostate_miss = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
        prostate_miss = prostate_miss.insert_missing_values(fraction=ratio)
        prostate_miss.describe()

        print("H2O GLRM with {0}% missing entries".format(100*ratio))
        prostate_glrm = h2o.glrm(x=prostate_miss, validation_frame=prostate_full, k=8, ignore_const_cols=False,
                                 loss="Quadratic", gamma_x=0.5, gamma_y=0.5, regularization_x="L1",
                                 regularization_y="L1", init="SVD", max_iterations=2000, min_step_size=1e-6)
        prostate_glrm.show()

        # Check imputed data and error metrics
        output = prostate_glrm._model_json['output']
        train_metrics = output['training_metrics']._metric_json
        valid_metrics = output['validation_metrics']._metric_json
        train_numcnt = train_metrics['numcnt']
        valid_numcnt = valid_metrics['numcnt']
        train_catcnt = train_metrics['catcnt']
        valid_catcnt = valid_metrics['catcnt']
        assert valid_numcnt >= train_numcnt, "Number of non-missing numeric entries in training data should be less than or equal to validation data"
        assert valid_catcnt >= train_catcnt, "Number of non-missing categorical entries in training data should be less than or equal to validation data"
        assert (train_numcnt + valid_numcnt) < totobs, "Total non-missing numeric entries in training and validation data was {0}, but should be less than {1}".format(train_numcnt + valid_numcnt, totobs)
        assert (valid_numcnt + valid_catcnt) == totobs, "Number of non-missing entries in validation data was {0}, but should be {1}".format(valid_numcnt + valid_catcnt, totobs)

        train_numerr[i] = train_metrics['numerr']
        valid_numerr[i] = valid_metrics['numerr']
        train_caterr[i] = train_metrics['caterr']
        valid_caterr[i] = valid_metrics['caterr']
        # Drop the X representation frame the model left on the cluster
        h2o.remove(output['representation_name'])

    for i, ratio in enumerate(missing_ratios):
        print("Missing ratio: {0}% --> Training numeric error: {1}\tValidation numeric error: {2}".format(ratio*100, train_numerr[i], valid_numerr[i])) 

    for i, ratio in enumerate(missing_ratios):
        print("Missing ratio: {0}% --> Training categorical error: {1}\tValidation categorical error: {2}".format(ratio*100, train_caterr[i], valid_caterr[i]))
def __del__(self):
    """
    Delete the remote data behind this object, either immediately or by appending a
    `del` to the Rapids temporaries currently being gathered.
    """
    global __CMD__, __TMPS__
    # Dead pending op or local data; nothing to delete
    if self.is_pending() or self.is_local():
        return
    assert self.is_remote(), "Data wasn't remote. Hrm..."
    if __CMD__ is None:
        # NOTE(review): the `h2o is not None` guard presumably covers interpreter shutdown,
        # when module globals may already be cleared -- confirm.
        if h2o is not None:
            h2o.remove(self._data)
        return
    # Hard/deep remove of a Vec, built into a rapids expression
    delete_cmd = " (del '" + self._data + "')"
    if __TMPS__ is None:
        print(" ".join(["Lost deletes: ", delete_cmd]))
    else:
        __TMPS__ += delete_cmd
def head(self, rows=10, cols=200, **kwargs):
    """
    Print the first rows and columns of this frame as a table.

    :param rows: Maximum number of rows to display.
    :param cols: Maximum number of columns to display.
    :param kwargs: Accepted but not used by this method.
    :return: None
    """
    nrows = min(self.nrow(), rows)
    ncols = min(self.ncol(), cols)
    colnames = self.names()[0:ncols]
    tmp_key = H2OFrame.py_tmp_key()
    # Every vec of the frame is evaluated and cbind'ed; only the preview is trimmed below
    vec_keys = [vec._expr.eager() for vec in self]
    res = h2o.rapids("(= !" + tmp_key + " (cbind %" + " %".join(vec_keys) + "))")
    h2o.remove(tmp_key)
    # First table column is the 1-based row id, then one column per displayed vec
    table_cols = [range(1, nrows + 1, 1)]
    table_cols.extend(col[0:nrows] for col in res["head"][0:ncols])
    table = zip(*table_cols)
    print(" ".join(["First", str(nrows), "rows and first", str(ncols), "columns: "]))
    print(tabulate.tabulate(table, headers=["Row ID"] + colnames))
    print("")
def import_folder():
    """
    Build one H2O frame from the CSV files under bigdata/laptop/parser/orc/pubdev_3200/air05_csv
    and another from the multi-file ORC parser over bigdata/laptop/parser/orc/pubdev_3200/air05_orc,
    then compare the two frame summaries to make sure they are equal.

    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    csv_path = pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_csv")
    startcsv = time.time()
    multi_file_csv = h2o.import_file(path=csv_path, na_strings=['\\N'])
    endcsv = time.time()

    csv_type_dict = multi_file_csv.types
    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

    # Column names look like "C<n>"; map the zero-based position n-1 back to its name so the
    # CSV column types can be listed in column order.
    name_by_index = {int(str(name.split('C')[1])) - 1: name for name in list(csv_type_dict)}
    col_types = [csv_type_dict[name_by_index[ind]] for ind in range(len(name_by_index))]

    # First ORC import lets the parser pick the column types; only its timing is used
    startorc1 = time.time()
    multi_file_orc1 = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc"))
    endorc1 = time.time()
    h2o.remove(multi_file_orc1)

    # Second ORC import forces the column types seen in the CSV frame
    startorc = time.time()
    multi_file_orc = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc"),
                                     col_types=col_types)
    endorc = time.time()

    multi_file_orc.summary()
    orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

    print("************** CSV parse time is {0}".format(endcsv-startcsv))
    print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1))
    print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc))
    # compare the CSV frame with the ORC frame read with forced column types
    pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
def var(self):
    """
    Compute the covariance matrix of this frame's columns on the cluster.

    :return: The covariance matrix of the columns in this H2OFrame.
    """
    src_key = self.send_frame()
    dest_key = H2OFrame.py_tmp_key()
    h2o.rapids("(= !{} (var %{} \"null\" %FALSE \"everything\"))".format(dest_key, src_key))
    # Remove h2o temp frame after var
    h2o.remove(src_key)
    meta = h2o.frame(dest_key)['frames'][0]
    nrows = meta['rows']
    # NOTE(review): this reads 'vec_keys' while the sibling methods (ddply, quantile, group_by)
    # read 'vec_ids' from the same kind of response -- confirm which key the server sends.
    vec_keys = meta['vec_keys']
    labels = [column['label'] for column in meta['columns']]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(labels, vec_keys), nrows))
def eager(self):
    """
    This forces a top-level execution, as needed, and produces a top-level result locally.
    Frames are returned and truncated to the standard preview response provided by rapids - 100 rows X 200 cols.

    The method gathers a Rapids command through the module globals __CMD__ / __TMPS__
    while self._do_it() runs, resets both globals to None, and ships the command with
    h2o.rapids() unless the result turned out to be local.

    :return: self._data -- a key pointing to the big data object, or the small result
             (preview rows, boolean, string or scalar) that was pulled back locally.
    """
    # Already evaluated: nothing to do.
    if self.is_computed(): return self._data
    # Gather the computation path for remote work, or doit locally for local work
    global __CMD__, __TMPS__
    # A gather must not already be in progress.
    assert not __CMD__ and not __TMPS__
    __CMD__ = ""
    __TMPS__ = ""  # Begin gathering rapids commands
    dummy = self  # Force extra refcnt so we get a top-level assignment in do_it
    self._do_it()  # Symbolically execute the command
    cmd = __CMD__
    tmps = __TMPS__
    # Stop gathering rapids commands
    __CMD__ = None
    __TMPS__ = None
    if self.is_local(): return self._data  # Local computation, all done

    # Remote computation - ship Rapids over wire, assigning key to result
    # Pending temp deletions are appended to the command inside a "(, ...)" expression.
    if tmps: cmd = "(, " + cmd + tmps + ")"
    j = h2o.rapids(cmd)
    if j['result_type'] == 0:
        pass  # Big Data Key is the result
    # Small data result pulled locally
    elif j['num_rows']:  # basically checks if num_rows is nonzero... sketchy.
        self._data = j['head']
    elif j['result'] in [u'TRUE', u'FALSE']:
        self._data = (j['result'] == u'TRUE')
    elif j['result_type'] in [1,2,3,4]:
        # NOTE(review): the second test is `if`, not `elif`, so when j['string'] is a plain
        # str the `else` below still runs and may overwrite _data with j['scalar'] -- confirm
        # whether `elif` was intended.
        if isinstance(j['string'], str): self._data = j['string']
        if isinstance(j['string'], unicode): self._data = j['string'].encode('utf-8')
        else:
            # Only non-sized scalars are taken as the result.
            if not hasattr(j['scalar'], '__len__'): self._data = j['scalar']

    # For result types 3 and 4 the vecs listed in the response are removed from the cluster.
    if j['result_type'] in [3,4]:
        for key in j['vec_ids']:
            h2o.remove(key['name'])

    return self._data
def h2o_H2OFrame_prod():
    """
    Python API test: h2o.frame.H2OFrame.prod(na_rm=False)

    Copied from pyunit_prod.py
    """
    data = [[random.uniform(1, 10)] for _ in range(10)]
    h2o_data = h2o.H2OFrame(data)
    np_data = np.array(data)
    h2o_prod = h2o_data.prod(na_rm=True)
    np_prod = np.prod(np_data)
    # The message used to say "quantile", a leftover from the test this was copied from.
    assert abs(h2o_prod - np_prod) < 1e-06, "check unsuccessful! h2o computed {0} and numpy computed {1}. expected " \
                                            "equal product values between h2o and numpy".format(h2o_prod, np_prod)
    h2o.remove(h2o_data)
def group_by(self, cols, a):
    """
    GroupBy

    :param cols: The columns to group on.
    :param a: A dictionary of aggregates having the following shape:
              {"colname":[aggregate, column, naMethod]}
              e.g.: {"bikes":["count", 0, "all"]}

              The naMethod is one of "all", "ignore", or "rm", which specifies how to handle
              NAs that appear in columns that are being aggregated.

              "all" - include NAs
              "rm"  - exclude NAs
              "ignore" - ignore NAs in aggregates, but count them (e.g. in denominators for mean, var, sd, etc.)
    :return: The group by frame.
    :raises ValueError: if this frame has no backing vecs ("Frame Removed")
    """
    if self._vecs is None or self._vecs == []:
        raise ValueError("Frame Removed")
    group_idxs = [str(self._find_idx(name)) for name in cols]
    rapids_series = "(llist #" + " #".join(group_idxs) + ")"
    # Work on a deep copy so the caller's aggregate specs are not rewritten
    aggregates = copy.deepcopy(a)
    src_key = self.send_frame()
    dest_key = H2OFrame.py_tmp_key()

    # transform cols in aggregates to their indices...
    agg_parts = []
    for name, spec in aggregates.items():
        column = spec[1]
        column_idx = self._find_idx(column) if isinstance(column, str) else column
        spec[1] = '#' + str(column_idx)
        agg_parts.append("\"{1}\" {2} \"{3}\" \"{0}\"".format(str(name), *spec))
    aggs = "(agg {})".format(" ".join(agg_parts))

    h2o.rapids("(= !{} (GB %{} {} {}))".format(dest_key, src_key, rapids_series, aggs))  # group by
    # Remove h2o temp frame after groupby
    h2o.remove(src_key)
    # Make backing H2OVecs for the remote h2o vecs
    meta = h2o.frame(dest_key)['frames'][0]  # just the first (only) frame
    nrows = meta['rows']
    vec_ids = meta['vec_ids']
    labels = [column['label'] for column in meta['columns']]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(labels, vec_ids), nrows))
def remove_obj_client():
    """
    Train a GLM on benign.csv, then remove the model and the training frame, printing
    both objects before and after removal.
    """
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))

    Y = 3
    # Every column index 0..10 except the response; built with list() because
    # `range + range` is a TypeError on Python 3.
    X = list(range(3)) + list(range(4, 11))

    from h2o.estimators.glm import H2OGeneralizedLinearEstimator
    model = H2OGeneralizedLinearEstimator(family="binomial", alpha=0, Lambda=1e-5)
    print(model.model_id)
    print(model)
    model.train(x=X, y=Y, training_frame=training_data)
    print(model)
    h2o.remove(model)
    print(model)

    h2o.remove(training_data)
    print(training_data)
def h2o_H2OFrame_all():
    """
    Python API test: h2o.frame.H2OFrame.all(), h2o.frame.H2OFrame.any()
    """
    # Frame containing both True and False values, plus one NA
    python_lists = [[True, False], [False, True], [True, True], [True, 'NA']]
    h2oframe = h2o.H2OFrame(python_obj=python_lists, na_strings=['NA'])
    assert not(h2oframe.all()), "h2o.H2OFrame.all() command is not working."  # some elements are False
    assert h2oframe.any(), "h2o.H2OFrame.any() command is not working."  # some elements are True

    h2o.remove(h2oframe)
    # check with one boolean level only: every element is True or NA
    python_lists = [[True, True], [True, True], [True, True], [True, 'NA']]
    h2oframe = h2o.H2OFrame(python_obj=python_lists, na_strings=['NA'])
    assert h2oframe.all(), "h2o.H2OFrame.all() command is not working."
    assert h2oframe.any(), "h2o.H2OFrame.any() command is not working."

    h2o.remove(h2oframe)
    # check with one boolean level only: every element is False or NA
    python_lists = [[False, False], [False, False], [False, False], [False, 'NA']]
    h2oframe = h2o.H2OFrame(python_obj=python_lists, na_strings=['NA'])
    assert not(h2oframe.all()), "h2o.H2OFrame.all() command is not working."
    # NOTE(review): any() is expected to hold with only False/NA values, so the NA presumably
    # counts as true here -- confirm against the any() documentation.
    assert h2oframe.any(), "h2o.H2OFrame.any() command is not working."

    # The last frame is now removed as well, like the first two, so nothing is left behind.
    h2o.remove(h2oframe)
def pubdev_6603():
    """Removing a frame together with both of its splits must leave no keys behind."""
    records = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
    hf = h2o.H2OFrame(pd.DataFrame(records))
    splits = hf.split_frame(ratios=[0.5], seed=1)
    s1, s2 = splits  # exactly two parts are expected for a single ratio
    h2o.remove([hf, s1, s2])
    remaining = h2o.ls()
    assert len(remaining) == 0
def run(dataset, config):
    """
    Run H2O AutoML on one benchmark task: start a cluster, load the train/test data, train,
    predict, save artifacts and return the benchmark result.

    The H2O connection is closed (and a locally started server shut down) on every exit path.

    :param dataset: benchmark dataset; this function reads its train/test paths, target index
                    and domain cardinalities.
    :param config: benchmark task configuration (metric, cores, memory, framework_params, ...).
    :return: whatever `result(...)` builds from the predictions and timings.
    :raises FrameworkError: if AutoML produced no leader model.
    """
    log.info(f"\n**** H2O AutoML [v{h2o.__version__}] ****\n")
    save_metadata(config, version=h2o.__version__)

    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = {
        'acc': 'mean_per_class_error',
        'auc': 'AUC',
        'logloss': 'logloss',
        'mae': 'mae',
        'mse': 'mse',
        'r2': 'r2',
        'rmse': 'rmse',
        'rmsle': 'rmsle',
    }
    sort_metric = metrics_mapping.get(config.metric)
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)

    try:
        params = config.framework_params
        # Keys starting with '_' configure this runner; everything else goes to H2OAutoML
        training_params = {k: v for k, v in params.items() if not k.startswith('_')}
        nthreads = params.get('_nthreads', config.cores)
        # leaving 1/3rd of available memory for XGBoost
        jvm_memory = str(round(config.max_mem_size_mb * 2 / 3)) + "M"

        log.info("Starting H2O cluster with %s cores, %s memory.", nthreads, jvm_memory)
        # Default port is derived from the process id, folded into [1024, 49151)
        max_port_range = 49151
        min_port_range = 1024
        port_span = max_port_range - min_port_range
        rnd_port = os.getpid() % port_span + min_port_range
        port = params.get('_port', rnd_port)

        init_params = params.get('_init', {})
        if "logs" in params.get('_save_artifacts', []):
            init_params['ice_root'] = output_subdir("logs", config)

        h2o.init(nthreads=nthreads,
                 port=port,
                 min_mem_size=jvm_memory,
                 max_mem_size=jvm_memory,
                 **init_params)

        import_kwargs = {}
        # NOTE(review): the original comment said test is loaded as a Pandas DataFrame, but
        # both train and test are imported with h2o.import_file below.
        log.debug("Loading train data from %s.", dataset.train.path)
        train = None
        # previous versions may fail to parse correctly some rare arff files using single quotes
        # as enum/string delimiters (pandas also fails on same datasets)
        if version.parse(h2o.__version__) >= version.parse("3.32.0.3"):
            import_kwargs['quotechar'] = '"'
            train = h2o.import_file(dataset.train.path,
                                    destination_frame=frame_name('train', config),
                                    **import_kwargs)
            if train.nlevels() != dataset.domains.cardinalities:
                # Level counts disagree with the dataset: retry below with the single quote
                h2o.remove(train)
                train = None
                import_kwargs['quotechar'] = "'"

        if not train:
            train = h2o.import_file(dataset.train.path,
                                    destination_frame=frame_name('train', config),
                                    **import_kwargs)
            # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path,
                               destination_frame=frame_name('test', config),
                               **import_kwargs)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name, config.fold)
        log.debug(
            "Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
            config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        if params.get('_monitor_backend', False):
            monitor = BackendMemoryMonitoring(
                frequency_seconds=config.ext.monitoring.frequency_seconds,
                check_on_exit=True,
                verbosity=config.ext.monitoring.verbosity)
        else:
            # Do-nothing context manager; contextlib.nullcontext is Py 3.7+ only
            monitor = contextlib.contextmanager(lambda: (_ for _ in (0, )))()

        with utils.Timer() as training:
            with monitor:
                aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise FrameworkError(
                "H2O could not produce any model in the requested time.")

        with utils.Timer() as predict:
            preds = aml.predict(test)

        preds = extract_preds(preds, test, dataset=dataset)
        save_artifacts(aml, dataset=dataset, config=config)

        return result(output_file=config.output_predictions_file,
                      predictions=preds.predictions,
                      truth=preds.truth,
                      probabilities=preds.probabilities,
                      probabilities_labels=preds.probabilities_labels,
                      models_count=len(aml.leaderboard),
                      training_duration=training.duration,
                      predict_duration=predict.duration)

    finally:
        con = h2o.connection()
        if con:
            # h2o.remove_all()
            con.close()
            if con.local_server:
                con.local_server.shutdown()