def test_load_glrm():
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()

    g_model = H2OGeneralizedLowRankEstimator(k=3)
    g_model.train(x=irisH2O.names, training_frame=irisH2O)
    yarch_old = g_model.archetypes()
    x_old = h2o.get_frame(g_model._model_json["output"]["representation_name"])
    predOld = g_model.predict(irisH2O)

    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../..", "results"))
    try:
        TMPDIR = pyunit_utils.locate("results")  # find directory path to results folder
    except Exception:
        os.makedirs(TMPDIR)
    h2o.save_model(g_model, path=TMPDIR, force=True)  # save model
    full_path_filename = os.path.join(TMPDIR, g_model._id)

    h2o.remove(g_model)
    model_reloaded = h2o.load_model(full_path_filename)
    pred = model_reloaded.predict(irisH2O)
    yarch = model_reloaded.archetypes()
    x = h2o.get_frame(model_reloaded._model_json["output"]["representation_name"])

    # assert differences between old and new are close; archetypes should be the same
    pyunit_utils.compare_frames_local(x, x_old, tol=1e-6)
    pyunit_utils.compare_frames_local(pred[0], predOld[0], tol=1)
    for k in range(3):
        pyunit_utils.equal_two_arrays(yarch_old[k], yarch[k], eps=1e-4, tolerance=1e-10)
    print("glrm model successfully loaded...")
def svd_1_golden():
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    print("Compare with SVD")
    from h2o.transforms.decomposition import H2OSVD
    fitH2O = H2OSVD(nv=4, transform="NONE", max_iterations=2000)
    fitH2O.train(x=list(range(4)), training_frame=arrestsH2O)

    print("Compare singular values (D)")
    h2o_d = fitH2O._model_json["output"]["d"]
    r_d = [1419.06139509772, 194.825846110138, 45.6613376308754, 18.0695566224677]
    print("R Singular Values: {0}".format(r_d))
    print("H2O Singular Values: {0}".format(h2o_d))
    for r, h in zip(r_d, h2o_d):
        assert abs(r - h) < 1e-6, "H2O got {0}, but R got {1}".format(h, r)

    print("Compare right singular vectors (V)")
    h2o_v = h2o.as_list(h2o.get_frame(fitH2O._model_json["output"]["v_key"]["name"]), use_pandas=False)
    h2o_v.pop(0)
    r_v = [[-0.04239181, 0.01616262, -0.06588426, 0.99679535],
           [-0.94395706, 0.32068580, 0.06655170, -0.04094568],
           [-0.30842767, -0.93845891, 0.15496743, 0.01234261],
           [-0.10963744, -0.12725666, -0.98347101, -0.06760284]]
    print("R Right Singular Vectors: {0}".format(r_v))
    print("H2O Right Singular Vectors: {0}".format(h2o_v))
    for rl, hl in zip(r_v, h2o_v):
        for r, h in zip(rl, hl):
            assert abs(abs(r) - abs(float(h))) < 1e-5, "H2O got {0}, but R got {1}".format(h, r)

    print("Compare left singular vectors (U)")
    h2o_u = h2o.as_list(h2o.get_frame(fitH2O._model_json["output"]["u_key"]["name"]), use_pandas=False)
    h2o_u.pop(0)
    r_u = [[-0.1716251, 0.096325710, 0.06515480, 0.15369551],
           [-0.1891166, 0.173452566, -0.42665785, -0.17801438],
           [-0.2155930, 0.078998111, 0.02063740, -0.28070784],
           [-0.1390244, 0.059889811, 0.01392269, 0.01610418],
           [-0.2067788, -0.009812026, -0.17633244, -0.21867425],
           [-0.1558794, -0.064555293, -0.28288280, -0.11797419]]
    print("R Left Singular Vectors: {0}".format(r_u))
    print("H2O Left Singular Vectors: {0}".format(h2o_u))
    for rl, hl in zip(r_u, h2o_u):
        for r, h in zip(rl, hl):
            assert abs(abs(r) - abs(float(h))) < 1e-5, "H2O got {0}, but R got {1}".format(h, r)
def h2oget_frame():
    """
    Python API test: h2o.get_frame(frame_id)
    """
    frame1 = h2o.import_file(pyunit_utils.locate("smalldata/jira/hexdev_29.csv"))
    frame2 = h2o.get_frame(frame1.frame_id)
    assert_is_type(frame2, H2OFrame)
def glrm_subset():
    acs_orig = h2o.upload_file(path=pyunit_utils.locate("bigdata/laptop/census/ACS_13_5YR_DP02_cleaned.zip"),
                               col_types=(['enum'] + ['numeric'] * 149))

    acs_full = acs_orig.drop("ZCTA5")
    acs_model = H2OGeneralizedLowRankEstimator(k=10,
                                               transform='STANDARDIZE',
                                               loss='Quadratic',
                                               regularization_x='Quadratic',
                                               regularization_y='L1',
                                               gamma_x=0.25,
                                               gamma_y=0.5,
                                               max_iterations=1)
    acs_model.train(x=acs_full.names, training_frame=acs_full)
    zcta_arch_x = h2o.get_frame(acs_model._model_json['output']['representation_name'])
    print(zcta_arch_x)

    acs_zcta_col = acs_orig["ZCTA5"].asfactor()
    idx = ((acs_zcta_col == '10065') |   # Manhattan, NY (Upper East Side)
           (acs_zcta_col == '11219') |   # Manhattan, NY (East Harlem)
           (acs_zcta_col == '66753') |   # McCune, KS
           (acs_zcta_col == '84104') |   # Salt Lake City, UT
           (acs_zcta_col == '94086') |   # Sunnyvale, CA
           (acs_zcta_col == '95014'))    # Cupertino, CA
    print(zcta_arch_x[idx, [0, 1]])
def glrm_set_loss_by_col():
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsPy = np.array(h2o.as_list(arrestsH2O))
    arrestsH2O.describe()

    print("H2O GLRM with loss by column = Absolute, Quadratic, Quadratic, Huber")
    glrm_h2o = h2o.glrm(x=arrestsH2O, k=3, loss="Quadratic", loss_by_col=["Absolute", "Huber"],
                        loss_by_col_idx=[0, 3], regularization_x="None", regularization_y="None")
    glrm_h2o.show()

    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    fit_diff = arrestsPy - fit_xy
    obj_val = np.absolute(fit_diff[:, 0]) + np.square(fit_diff[:, 1]) + np.square(fit_diff[:, 2])

    def huber(a):
        return a * a / 2 if abs(a) <= 1 else abs(a) - 0.5
    huber = np.vectorize(huber)
    obj_val = obj_val + huber(fit_diff[:, 3])
    obj_val = np.sum(obj_val)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    assert abs(glrm_obj - obj_val) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(obj_val)
def pca_prostate(ip, port):
    h2o.init(ip, port)

    print("Importing prostate.csv data...\n")
    prostate = h2o.upload_file(h2o.locate("smalldata/logreg/prostate.csv"))

    print("Converting CAPSULE, RACE, DPROS and DCAPS columns to factors")
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate["RACE"] = prostate["RACE"].asfactor()
    prostate["DPROS"] = prostate["DPROS"].asfactor()
    prostate["DCAPS"] = prostate["DCAPS"].asfactor()
    prostate.describe()

    print("PCA on columns 3 to 9 with k = 3, transform = 'NONE', pca_method = 'Power'")
    fitPCA = h2o.prcomp(x=prostate[2:9], k=3, transform="NONE", pca_method="Power")
    pred1 = fitPCA.predict(prostate)
    pred2 = h2o.get_frame(fitPCA._model_json['output']['loading_key']['name'])

    print("Compare dimensions of projection and loading matrix")
    print("Projection matrix:\n")
    print(pred1.head())
    print("Loading matrix:\n")
    print(pred2.head())
    assert pred1.nrow() == pred2.nrow(), "Expected same number of rows, but got {0} and {1}".format(pred1.nrow(), pred2.nrow())
    assert pred1.ncol() == pred2.ncol(), "Expected same number of columns, but got {0} and {1}".format(pred1.ncol(), pred2.ncol())
def cross_validation_fold_assignment(self):
    """
    Obtain the cross-validation fold assignment for all rows in the training data.

    :return: H2OFrame
    """
    fid = self._model_json["output"]["cross_validation_fold_assignment_frame_id"]
    if fid is None:
        return None
    return h2o.get_frame(fid["name"])
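A minimal usage sketch (added here for context, not from the original source), assuming a running H2O cluster and an already-imported training frame `train` with a response column "y" (both hypothetical names). The fold assignment frame only exists when the model was trained with cross-validation and `keep_cross_validation_fold_assignment=True`:

from h2o.estimators import H2OGradientBoostingEstimator

# Keep the fold assignment frame so it can be fetched after training.
gbm = H2OGradientBoostingEstimator(nfolds=5,
                                   keep_cross_validation_fold_assignment=True,
                                   seed=42)
gbm.train(y="y", training_frame=train)            # hypothetical frame and response
folds = gbm.cross_validation_fold_assignment()    # one fold index per training row
print(folds.head())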
def fit(self, fr, **fit_params):
    res = []
    for step in self.steps:
        res.append(step[1].to_rest(step[0]))
    res = "[" + ",".join([_quoted(r.replace('"', "'")) for r in res]) + "]"
    j = H2OConnection.post_json(url_suffix="Assembly", steps=res, frame=fr.frame_id, _rest_version=99)
    self.id = j["assembly"]["name"]
    return get_frame(j["result"]["name"])
def cross_validation_holdout_predictions(self):
    """
    Obtain the (out-of-sample) holdout predictions of all cross-validation models on the training data.

    This is equivalent to summing up all H2OFrames returned by cross_validation_predictions.

    :return: H2OFrame
    """
    preds = self._model_json["output"]["cross_validation_holdout_predictions_frame_id"]
    if preds is None:
        return None
    return h2o.get_frame(preds["name"])
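Similarly, the holdout predictions frame is only kept when the model is trained with `keep_cross_validation_predictions=True`; a hedged sketch with hypothetical frame and column names:

from h2o.estimators import H2OGradientBoostingEstimator

gbm = H2OGradientBoostingEstimator(nfolds=5,
                                   keep_cross_validation_predictions=True,
                                   seed=42)
gbm.train(y="y", training_frame=train)             # hypothetical frame and response
holdout_preds = gbm.cross_validation_holdout_predictions()
print(holdout_preds.head())                        # one out-of-fold prediction per training row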
def deepfeatures(self, test_data, layer):
    """
    Return hidden layer details.

    :param test_data: Data to create a feature space on.
    :param layer: 0-based index of the hidden layer.
    """
    if test_data is None:
        raise ValueError("Must specify test data")
    j = H2OConnection.post_json("Predictions/models/" + self._id + "/frames/" + test_data._id,
                                deep_features_hidden_layer=layer)
    return h2o.get_frame(j["predictions_frame"]["name"])
def predict(self, test_data):
    """
    Predict on a dataset.

    :param test_data: Data to be predicted on.
    :return: A new H2OFrame filled with predictions.
    """
    if not test_data:
        raise ValueError("Must specify test data")
    j = H2OConnection.post_json("Predictions/models/" + self._id + "/frames/" + test_data._id)
    prediction_frame_id = j["model_metrics"][0]["predictions"]["frame_id"]["name"]
    return h2o.get_frame(prediction_frame_id)
def deepfeatures(self, test_data, layer):
    """
    Return hidden layer details.

    :param test_data: Data to create a feature space on.
    :param layer: 0-based index of the hidden layer.
    """
    if test_data is None:
        raise ValueError("Must specify test data")
    j = H2OJob(h2o.H2OConnection.post_json("Predictions/models/" + self._id + "/frames/" + test_data.frame_id,
                                           deep_features_hidden_layer=layer, _rest_version=4),
               "deepfeatures")
    j.poll()
    return h2o.get_frame(j.dest_key)
def weights(self, matrix_id=0):
    """
    Return the frame for the respective weight matrix.

    :param matrix_id: an integer, ranging from 0 to the number of layers, that specifies the weight matrix to return.
    :return: an H2OFrame which represents the weight matrix identified by matrix_id
    """
    num_weight_matrices = len(self._model_json['output']['weights'])
    if matrix_id not in list(range(num_weight_matrices)):
        raise ValueError("Weight matrix does not exist. Model has {0} weight matrices (0-based indexing), "
                         "but matrix {1} was requested.".format(num_weight_matrices, matrix_id))
    return h2o.get_frame(self._model_json['output']['weights'][matrix_id]['URL'].split('/')[3])
def biases(self, vector_id=0):
    """
    Return the frame for the respective bias vector.

    :param vector_id: an integer, ranging from 0 to the number of layers, that specifies the bias vector to return.
    :return: an H2OFrame which represents the bias vector identified by vector_id
    """
    num_bias_vectors = len(self._model_json['output']['biases'])
    if vector_id not in list(range(num_bias_vectors)):
        raise ValueError("Bias vector does not exist. Model has {0} bias vectors (0-based indexing), "
                         "but vector {1} was requested.".format(num_bias_vectors, vector_id))
    return h2o.get_frame(self._model_json['output']['biases'][vector_id]['URL'].split('/')[3])
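A minimal sketch of how these two accessors are used (not from the original source). The weight/bias frames are only exported when the deep learning model is trained with `export_weights_and_biases=True`; frame and column names below are hypothetical:

from h2o.estimators import H2ODeepLearningEstimator

dl = H2ODeepLearningEstimator(hidden=[10, 10],
                              epochs=1,
                              export_weights_and_biases=True)  # required for weights()/biases()
dl.train(y="y", training_frame=train)   # hypothetical frame and response
w0 = dl.weights(0)   # H2OFrame: weights between the input and the first hidden layer
b0 = dl.biases(0)    # H2OFrame: biases of the first hidden layer
print(w0.dim, b0.dim)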
def cross_validation_predictions(self):
    """
    Obtain the (out-of-sample) holdout predictions of all cross-validation models on their holdout data.

    Note that the predictions are expanded to the full number of rows of the training data, with 0 fill-in.

    :return: list of H2OFrame objects
    """
    preds = self._model_json["output"]["cross_validation_predictions"]
    if preds is None:
        return None
    m = []
    for p in preds:
        m.append(h2o.get_frame(p["name"]))
    return m
def predict(self, test_data):
    """
    Predict on a dataset.

    :param test_data: Data to be predicted on.
    :return: A new H2OFrame filled with predictions.
    """
    if not isinstance(test_data, H2OFrame):
        raise ValueError("test_data must be an instance of H2OFrame")
    j = H2OConnection.post_json("Predictions/models/" + self.model_id + "/frames/" + test_data.frame_id)
    return h2o.get_frame(j["predictions_frame"]["name"])
def predict_leaf_node_assignment(self, test_data):
    """
    Predict on a dataset and return the leaf node assignment (only for tree-based models).

    :param H2OFrame test_data: Data on which to make predictions.
    :returns: A new H2OFrame of predictions.
    """
    if not isinstance(test_data, h2o.H2OFrame):
        raise ValueError("test_data must be an instance of H2OFrame")
    j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id),
                data={"leaf_node_assignment": True})
    return h2o.get_frame(j["predictions_frame"]["name"])
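A short usage sketch for leaf-node assignment (not from the original source; frame and column names hypothetical). Each returned column holds, per row, the path to the leaf that row fell into for one tree:

from h2o.estimators import H2OGradientBoostingEstimator

gbm = H2OGradientBoostingEstimator(ntrees=3, seed=42)
gbm.train(y="y", training_frame=train)           # hypothetical frame and response
leaves = gbm.predict_leaf_node_assignment(test)  # hypothetical test frame
print(leaves.head())                             # one column of leaf paths per tree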
def glrm_nnmf():
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame.fromPython(list(zip(*train.tolist())))

    print("Run GLRM with non-negative regularization")
    initial_y = np.random.rand(n, k)
    initial_y_h2o = h2o.H2OFrame.fromPython(initial_y.tolist())
    glrm_h2o = H2OGeneralizedLowRankEstimator(
        k=k,
        init="User",
        user_y=initial_y_h2o,
        loss="Quadratic",
        regularization_x="NonNegative",
        regularization_y="NonNegative",
        gamma_x=1,
        gamma_y=1,
    )
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X and Y matrices are non-negative")
    fit_y = glrm_h2o._model_json["output"]["archetypes"].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(glrm_h2o._model_json["output"]["representation_name"])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json["output"]["objective"]
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json["output"]["training_metrics"]._metric_json["numerr"]
    glrm_caterr = glrm_h2o._model_json["output"]["training_metrics"]._metric_json["caterr"]
    assert abs(glrm_numerr - glrm_obj) < 1e-3, (
        "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    )
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def deepfeatures(self, test_data, layer):
    """
    Return hidden layer details.

    :param test_data: Data to create a feature space on.
    :param layer: 0-based index of the hidden layer.
    """
    if test_data is None:
        raise ValueError("Must specify test data")
    j = H2OJob(h2o.api("POST /4/Predictions/models/%s/frames/%s" % (self._id, test_data.frame_id),
                       data={"deep_features_hidden_layer": layer}),
               "deepfeatures")
    j.poll()
    return h2o.get_frame(j.dest_key)
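A hedged usage sketch for deepfeatures (not from the original source; assumes a trained deep learning model, hypothetical frame and column names):

from h2o.estimators import H2ODeepLearningEstimator

dl = H2ODeepLearningEstimator(hidden=[20, 20], epochs=1)
dl.train(y="y", training_frame=train)   # hypothetical frame and response
layer0 = dl.deepfeatures(test, 0)       # activations of the first hidden layer on `test`
print(layer0.ncols)                     # 20 columns, one per hidden unit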
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200    # number of test dataset rows
    df = pyunit_utils.random_dataset("regression", seed=1234)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]

    # build a GLRM model with the random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file; h2o predict and mojo predict use the same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True)  # save mojo predict

    h2o.save_model(glrmModel, TMPDIR)  # save GLRM model
    glrmModel2 = h2o.load_model(os.path.join(TMPDIR, MOJONAME))
    predict_model = glrmModel2.predict(test)
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
            predict_model[col] = predict_model[col].asnumeric()

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    print("Comparing mojo predict and h2o predict from saved model...")
    pyunit_utils.compare_frames_local(pred_mojo, predict_model, 1, tol=1e-10)

    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False)  # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_" + frameID)  # store the X factor for the new test dataset
    print("Comparing mojo X factor and model X factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
def glrm_unitonesparse():
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)

    def ind_list(k):
        tmp = [0] * k
        tmp[np.random.randint(0, k)] = 1
        return tmp
    X = [ind_list(k) for x in range(m)]
    X = np.array(X)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(list(zip(*train.tolist())))

    print("Run GLRM with unit one-sparse regularization on X")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(list(zip(*initial_y.tolist())))
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=k,
                                              init="User",
                                              user_y=initial_y_h2o,
                                              loss="Quadratic",
                                              regularization_x="UnitOneSparse",
                                              regularization_y="None",
                                              gamma_x=1,
                                              gamma_y=0)
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X matrix consists of rows of basis vectors")
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    def is_basis(a):
        zeros = np.where(a == 0)[0].size
        ones = np.where(a == 1)[0].size
        basis = ones == 1 and (zeros + ones) == k
        assert basis, "Got " + str(ones) + " ones and " + str(zeros) + " zeros, but expected all zeros except a single 1"
        return basis
    np.apply_along_axis(is_basis, 1, fit_x_np)

    print("Check final objective function value")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def test_levelone_frame_has_expected_dimensions():
    ds = prepare_data(blending)
    models = train_base_models(ds)
    se = train_stacked_ensemble(ds, models, keep_levelone_frame=True)

    level_one_frame = h2o.get_frame(se.levelone_frame_id()["name"])

    se_training_frame = ds.blend if blending else ds.train

    num_col_level_one_frame = (se_training_frame[ds.y].unique().nrow) * len(models) + 1  # count_classes(probabilities) * count_models + 1 (target)
    assert level_one_frame.ncols == num_col_level_one_frame, \
        "The number of columns in a level one frame should be numClasses * numBaseModels + 1."
    assert level_one_frame.nrows == se_training_frame.nrows, \
        "The number of rows in the level one frame should match train number of rows."
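The helpers above (prepare_data, train_base_models, train_stacked_ensemble) live elsewhere in that test suite. A self-contained hedged sketch of the same idea, with hypothetical frame and column names:

from h2o.estimators import H2OGradientBoostingEstimator, H2ORandomForestEstimator, H2OStackedEnsembleEstimator

# Base models must share identical folds and keep their CV predictions.
common = dict(nfolds=5, fold_assignment="Modulo", keep_cross_validation_predictions=True, seed=1)
gbm = H2OGradientBoostingEstimator(**common)
gbm.train(y="y", training_frame=train)   # hypothetical frame and response
drf = H2ORandomForestEstimator(**common)
drf.train(y="y", training_frame=train)

se = H2OStackedEnsembleEstimator(base_models=[gbm, drf], keep_levelone_frame=True)
se.train(y="y", training_frame=train)
level_one = h2o.get_frame(se.levelone_frame_id()["name"])  # base-model predictions + target
print(level_one.dim)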
def predict(self, test_data):
    """
    Predict on a dataset.

    :param H2OFrame test_data: Data on which to make predictions.
    :returns: A new H2OFrame of predictions.
    """
    if not isinstance(test_data, h2o.H2OFrame):
        raise ValueError("test_data must be an instance of H2OFrame")
    j = H2OJob(h2o.api("POST /4/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id)),
               self._model_json['algo'] + " prediction")
    j.poll()
    return h2o.get_frame(j.dest_key)
def glrm_simplex():
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)

    def ind_list(k):
        tmp = [0] * k
        tmp[np.random.randint(0, k)] = 1
        return tmp
    X = [ind_list(k) for x in range(m)]
    X = np.array(X)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    print("Run GLRM with quadratic mixtures (simplex) regularization on X")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=k,
                                              init="User",
                                              user_y=initial_y_h2o,
                                              loss="Quadratic",
                                              regularization_x="Simplex",
                                              regularization_y="None",
                                              gamma_x=1,
                                              gamma_y=0)
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X matrix consists of rows within standard probability simplex")
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    def is_simplex(a):
        row_sum = sum(a)
        simplex = abs(row_sum - 1) < 1e-6
        assert simplex, "Got sum over row = " + str(row_sum) + ", but expected 1"
        return simplex
    np.apply_along_axis(is_simplex, 1, fit_x_np)

    print("Check final objective function value")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def proj_archetypes(self, test_data, reverse_transform=False):
    """
    Convert archetypes of the model into original feature space.

    :param H2OFrame test_data: The dataset upon which the model was trained.
    :param bool reverse_transform: Whether the transformation of the training data during model-building
        should be reversed on the projected archetypes.
    :returns: model archetypes projected back into the original training data's feature space.
    """
    if test_data is None or test_data.nrow == 0:
        raise ValueError("Must specify test data")
    j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id),
                data={"project_archetypes": True, "reverse_transform": reverse_transform})
    return h2o.get_frame(j["model_metrics"][0]["predictions"]["frame_id"]["name"])
def anomaly(self, test_data, per_feature=False):
    """
    Obtain the reconstruction error for the input test_data.

    :param H2OFrame test_data: The dataset upon which the reconstruction error is computed.
    :param bool per_feature: Whether to return the square reconstruction error per feature.
        Otherwise, return the mean square error.
    :returns: the reconstruction error.
    """
    if test_data is None or test_data.nrow == 0:
        raise ValueError("Must specify test data")
    j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id),
                data={"reconstruction_error": True, "reconstruction_error_per_feature": per_feature})
    return h2o.get_frame(j["model_metrics"][0]["predictions"]["frame_id"]["name"])
def reconstruct(self, test_data, reverse_transform=False):
    """
    Reconstruct the training data from the model and impute all missing values.

    :param H2OFrame test_data: The dataset upon which the model was trained.
    :param bool reverse_transform: Whether the transformation of the training data during model-building
        should be reversed on the reconstructed frame.
    :returns: the approximate reconstruction of the training data.
    """
    if test_data is None or test_data.nrow == 0:
        raise ValueError("Must specify test data")
    j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id),
                data={"reconstruct_train": True, "reverse_transform": reverse_transform})
    return h2o.get_frame(j["model_metrics"][0]["predictions"]["frame_id"]["name"])
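A minimal GLRM usage sketch for reconstruct (not from the original source; assumes an imported numeric frame `train`, a hypothetical name):

from h2o.estimators import H2OGeneralizedLowRankEstimator

glrm = H2OGeneralizedLowRankEstimator(k=2, transform="STANDARDIZE", max_iterations=100)
glrm.train(training_frame=train)                          # hypothetical frame
recon = glrm.reconstruct(train, reverse_transform=True)   # back in the original feature space
print(recon.head())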
def predict_leaf_node_assignment(self, test_data):
    """
    Predict on a dataset and return the leaf node assignment (only for tree-based models).

    Parameters
    ----------
    test_data: H2OFrame
        Data on which to make predictions.

    Returns
    -------
    A new H2OFrame of predictions.
    """
    if not isinstance(test_data, h2o.H2OFrame):
        raise ValueError("test_data must be an instance of H2OFrame")
    j = h2o.H2OConnection.post_json("Predictions/models/" + self.model_id + "/frames/" + test_data.frame_id,
                                    leaf_node_assignment=True)
    return h2o.get_frame(j["predictions_frame"]["name"])
def predict(self, test_data):
    """
    Predict on a dataset.

    Parameters
    ----------
    test_data: H2OFrame
        Data on which to make predictions.

    Returns
    -------
    A new H2OFrame of predictions.
    """
    if not isinstance(test_data, h2o.H2OFrame):
        raise ValueError("test_data must be an instance of H2OFrame")
    j = H2OJob(h2o.H2OConnection.post_json("Predictions/models/" + self.model_id + "/frames/" + test_data.frame_id,
                                           _rest_version=4),
               self._model_json['algo'] + " prediction")
    j.poll()
    return h2o.get_frame(j.dest_key)
def predict(self, test_data):
    """
    Predict on a dataset.

    Parameters
    ----------
    test_data: H2OFrame
        Data on which to make predictions.

    Returns
    -------
    A new H2OFrame of predictions.
    """
    if not isinstance(test_data, H2OFrame):
        raise ValueError("test_data must be an instance of H2OFrame")
    j = H2OConnection.post_json("Predictions/models/" + self.model_id + "/frames/" + test_data.frame_id)
    return h2o.get_frame(j["predictions_frame"]["name"])
def predict(self, test_data):
    """
    Predict on a dataset.

    Parameters
    ----------
    test_data: H2OFrame
        Data on which to make predictions.

    Returns
    -------
    A new H2OFrame of predictions.
    """
    if not isinstance(test_data, h2o.H2OFrame):
        raise ValueError("test_data must be an instance of H2OFrame")
    j = h2o.H2OConnection.post_json("Predictions/models/" + self.model_id + "/frames/" + test_data.frame_id)
    return h2o.get_frame(j["predictions_frame"]["name"])
def glrm_set_loss_by_col():
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsPy = np.array(h2o.as_list(arrestsH2O))
    arrestsH2O.describe()

    print("H2O GLRM with loss by column = Absolute, Quadratic, Quadratic, Huber")
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=3,
                                              loss="Quadratic",
                                              loss_by_col=["Absolute", "Huber"],
                                              loss_by_col_idx=[0, 3],
                                              regularization_x="None",
                                              regularization_y="None")
    glrm_h2o.train(x=arrestsH2O.names, training_frame=arrestsH2O)
    glrm_h2o.show()

    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    fit_diff = arrestsPy - fit_xy
    obj_val = np.absolute(fit_diff[:, 0]) + np.square(fit_diff[:, 1]) + np.square(fit_diff[:, 2])

    def huber(a):
        return a * a / 2 if abs(a) <= 1 else abs(a) - 0.5
    huber = np.vectorize(huber)
    obj_val = obj_val + huber(fit_diff[:, 3])
    obj_val = np.sum(obj_val)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    assert abs(glrm_obj - obj_val) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(obj_val)
def glrm_nnmf():
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(list(zip(*train.tolist())))

    print("Run GLRM with non-negative regularization")
    initial_y = np.random.rand(n, k)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=k, init="User", user_y=initial_y_h2o, loss="Quadratic",
                                              regularization_x="NonNegative", regularization_y="NonNegative",
                                              gamma_x=1, gamma_y=1)
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X and Y matrices are non-negative")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def reconstruct(self, test_data, reverse_transform=False):
    """
    Reconstruct the training data from the model and impute all missing values.

    :param H2OFrame test_data: The dataset upon which the model was trained.
    :param bool reverse_transform: Whether the transformation of the training data during model-building
        should be reversed on the reconstructed frame.
    :returns: the approximate reconstruction of the training data.
    """
    if test_data is None or test_data.nrow == 0:
        raise ValueError("Must specify test data")
    j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id),
                data={"reconstruct_train": True, "reverse_transform": reverse_transform})
    return h2o.get_frame(j["model_metrics"][0]["predictions"]["frame_id"]["name"])
def transform(self, words, aggregate_method):
    """
    Transform words (or sequences of words) to vectors using a word2vec model.

    :param words: An H2OFrame made of a single column containing source words.
    :param str aggregate_method: Specifies how to aggregate sequences of words. If the method is `NONE`,
        no aggregation is performed and each input word is mapped to a single word-vector. If the method
        is `AVERAGE`, the input is treated as sequences of words delimited by NA. Each word of a sequence
        is internally mapped to a vector, and the vectors belonging to the same sequence are averaged and
        returned in the result.
    :returns: an H2OFrame of word vectors.
    """
    j = h2o.api("GET /3/Word2VecTransform",
                data={'model': self.model_id,
                      'words_frame': words.frame_id,
                      'aggregate_method': aggregate_method})
    return h2o.get_frame(j["vectors_frame"]["name"])
def proj_archetypes(self, test_data, reverse_transform=False):
    """
    Convert archetypes of the model into original feature space.

    :param H2OFrame test_data: The dataset upon which the model was trained.
    :param bool reverse_transform: Whether the transformation of the training data during model-building
        should be reversed on the projected archetypes.
    :returns: model archetypes projected back into the original training data's feature space.
    """
    if test_data is None or test_data.nrow == 0:
        raise ValueError("Must specify test data")
    j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id),
                data={"project_archetypes": True, "reverse_transform": reverse_transform})
    return h2o.get_frame(j["model_metrics"][0]["predictions"]["frame_id"]["name"])
def anomaly(self, test_data, per_feature=False):
    """Obtain the reconstruction error for the input test_data.

    Parameters
    ----------
    test_data : H2OFrame
        The dataset upon which the reconstruction error is computed.
    per_feature : bool
        Whether to return the square reconstruction error per feature.
        Otherwise, return the mean square error.

    Returns
    -------
    Return the reconstruction error.
    """
    if test_data is None or test_data.nrow == 0:
        raise ValueError("Must specify test data")
    j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id),
                data={"reconstruction_error": True, "reconstruction_error_per_feature": per_feature})
    return h2o.get_frame(j["model_metrics"][0]["predictions"]["frame_id"]["name"])
def predict(self, test_data):
    """
    Predict on a dataset.

    Parameters
    ----------
    test_data: H2OFrame
        Data on which to make predictions.

    Returns
    -------
    A new H2OFrame of predictions.
    """
    if not isinstance(test_data, h2o.H2OFrame):
        raise ValueError("test_data must be an instance of H2OFrame")
    j = H2OJob(h2o.api("POST /4/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id)),
               self._model_json['algo'] + " prediction")
    j.poll()
    return h2o.get_frame(j.dest_key)
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)

    columns = params.get('columns')
    if columns is not None and len(columns) > 2:
        columns = json.loads(columns)
        df = df[columns]

    use_value = params.get('use')
    if use_value is not None and len(use_value) == 0:
        use_value = None

    df_cor = df.cor(na_rm=to_bool(params.get('na_rm')),
                    use=use_value,
                    method=params.get('method'))

    dest_frame_id = append_frame_id(frame_id, params.get('suffix'))
    h2o.assign(df_cor, dest_frame_id)
    return {'frame_id': dest_frame_id}
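Stripped of the parameter plumbing, the call above is the plain H2OFrame correlation method; a minimal sketch with a hypothetical frame name:

# Assumes `df` is an imported H2OFrame with numeric columns (hypothetical).
cor_matrix = df.cor(na_rm=True, method="Pearson")  # pairwise column correlations as an H2OFrame
print(cor_matrix)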
def mojo_model_test():
    # GBM
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    original_model_filename = tempfile.mkdtemp()
    original_model_filename = gbm.download_mojo(original_model_filename)

    key = h2o.lazy_import(original_model_filename)
    fr = h2o.get_frame(key[0])
    model = H2OGenericEstimator(model_key=fr)
    model.train()
    predictions = model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421
    assert model._model_json["output"]["variable_importances"] is not None
    assert len(model._model_json["output"]["variable_importances"]._cell_values) > 0
    assert model._model_json["output"]["model_summary"] is not None
    assert len(model._model_json["output"]["model_summary"]._cell_values) > 0

    # Test constructor generating the model from an existing MOJO file
    model = H2OGenericEstimator.from_file(original_model_filename)
    assert model is not None
    predictions = model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421
    assert model._model_json["output"]["variable_importances"] is not None
    assert len(model._model_json["output"]["variable_importances"]._cell_values) > 0
    assert model._model_json["output"]["model_summary"] is not None
    assert len(model._model_json["output"]["model_summary"]._cell_values) > 0

    generic_mojo_filename = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = model.download_mojo(path=generic_mojo_filename)
    assert os.path.getsize(generic_mojo_filename) == os.path.getsize(original_model_filename)
def anomaly(self, test_data, per_feature=False):
    """
    Obtain the reconstruction error for the input test_data.

    :param H2OFrame test_data: The dataset upon which the reconstruction error is computed.
    :param bool per_feature: Whether to return the square reconstruction error per feature.
        Otherwise, return the mean square error.
    :returns: the reconstruction error.

    :examples:

    >>> from h2o.estimators.deeplearning import H2OAutoEncoderEstimator
    >>> train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/train.csv.gz")
    >>> test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/mnist/test.csv.gz")
    >>> predictors = list(range(0,784))
    >>> resp = 784
    >>> train = train[predictors]
    >>> test = test[predictors]
    >>> ae_model = H2OAutoEncoderEstimator(activation="Tanh",
    ...                                    hidden=[2],
    ...                                    l1=1e-5,
    ...                                    ignore_const_cols=False,
    ...                                    epochs=1)
    >>> ae_model.train(x=predictors, training_frame=train)
    >>> test_rec_error = ae_model.anomaly(test)
    >>> test_rec_error
    >>> test_rec_error_features = ae_model.anomaly(test, per_feature=True)
    >>> test_rec_error_features
    """
    if test_data is None or test_data.nrow == 0:
        raise ValueError("Must specify test data")
    j = h2o.api("POST /3/Predictions/models/%s/frames/%s" % (self.model_id, test_data.frame_id),
                data={"reconstruction_error": True, "reconstruction_error_per_feature": per_feature})
    return h2o.get_frame(j["model_metrics"][0]["predictions"]["frame_id"]["name"])
def retain_keys_test():
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    h2o.remove_all([airlines.frame_id, gbm.model_id])

    assert h2o.get_frame(airlines.frame_id) is not None
    assert h2o.get_model(gbm.model_id) is not None

    ## Test key not being retained when unspecified
    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)
    h2o.remove_all([airlines.frame_id])

    h2o.ls()
    try:
        h2o.get_model(gbm.model_id)
        assert False
    except h2o.exceptions.H2OResponseError as e:
        assert e.args[0].dev_msg.find("not found for argument: key") != -1
def transform(self, frame, data_leakage_handling="None", noise=-1, seed=-1):
    """
    Apply transformation to `te_columns` based on the encoding maps generated during `train()` method call.

    :param H2OFrame frame: the frame to which target encoding transformations are applied.
    :param str data_leakage_handling: Supported options:

        1) "k_fold" - encodings for a fold are generated based on out-of-fold data.
        2) "leave_one_out" - the current row's response value is subtracted from the pre-calculated per-level frequencies.
        3) "none" - no holdout; the whole frame is used for training.

    :param float noise: the amount of random noise added to the target encoding.
        This helps prevent overfitting. Defaults to 0.01 * range of y.
    :param int seed: a random seed used to generate draws from the uniform distribution for random noise.
        Defaults to -1.

    :example:
    >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
    >>> predictors = ["home.dest", "cabin", "embarked"]
    >>> response = "survived"
    >>> titanic[response] = titanic[response].asfactor()
    >>> fold_col = "kfold_column"
    >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
    >>> titanic_te = H2OTargetEncoderEstimator(k=35,
    ...                                        f=25,
    ...                                        data_leakage_handling="leave_one_out",
    ...                                        blending=True)
    >>> titanic_te.train(x=predictors,
    ...                  y=response,
    ...                  training_frame=titanic)
    >>> transformed = titanic_te.transform(frame=titanic,
    ...                                    data_leakage_handling="leave_one_out",
    ...                                    seed=1234)
    """
    output = h2o.api("GET /3/TargetEncoderTransform", data={'model': self.model_id,
                                                            'frame': frame.key,
                                                            'data_leakage_handling': data_leakage_handling,
                                                            'noise': noise,
                                                            'seed': seed})
    return h2o.get_frame(output["name"])
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)

    input_columns = params.get("input_columns")
    if input_columns is None or len(input_columns) == 0:
        input_columns = df.col_names
    else:
        import json
        input_columns = json.loads(input_columns)

    from h2o.estimators import H2OKMeansEstimator
    kmeans_model = H2OKMeansEstimator(
        categorical_encoding=params.get("categorical_encoding"),
        estimate_k=to_bool(params.get("estimate_k")),
        fold_assignment=params.get("fold_assignment"),
        ignore_const_cols=to_bool(params.get("ignore_const_cols")),
        init=params.get("init"),
        k=int(params.get("k")),
        keep_cross_validation_fold_assignment=to_bool(params.get("keep_cross_validation_fold_assignment")),
        keep_cross_validation_models=to_bool(params.get("keep_cross_validation_models")),
        keep_cross_validation_predictions=to_bool(params.get("keep_cross_validation_predictions")),
        max_iterations=int(params.get("max_iterations")),
        max_runtime_secs=float(params.get("max_runtime_secs")),
        nfolds=int(params.get("nfolds")),
        score_each_iteration=to_bool(params.get("score_each_iteration")),
        seed=int(params.get("seed")),
        standardize=to_bool(params.get("standardize")))
    kmeans_model.train(x=input_columns, training_frame=df)
    kmeans_model.show()

    save_model(params, kmeans_model.model_id)
    return {'frame_id': frame_id, 'model_id': kmeans_model.model_id}
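Stripped of the parameter plumbing, the k-means call above reduces to a few lines; a hedged sketch with hypothetical names:

from h2o.estimators import H2OKMeansEstimator

km = H2OKMeansEstimator(k=3, estimate_k=False, standardize=True, seed=42)
km.train(x=df.col_names, training_frame=df)   # `df` is a hypothetical H2OFrame
clusters = km.predict(df)                     # one cluster label per row
print(km.centers())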
def save_artifacts(automl, dataset, config):
    artifacts = config.framework_params.get('_save_artifacts', ['leaderboard'])
    try:
        lb = automl.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())
        if 'leaderboard' in artifacts:
            models_dir = make_subdir("models", config)
            write_csv(lb, os.path.join(models_dir, "leaderboard.csv"))
        if 'models' in artifacts:
            models_dir = make_subdir("models", config)
            all_models_se = next((mid for mid in lb['model_id'] if mid.startswith("StackedEnsemble_AllModels")), None)
            mformat = 'mojo' if 'mojos' in artifacts else 'json'
            if all_models_se:
                save_model(all_models_se, dest_dir=models_dir, mformat=mformat)
            else:
                for mid in lb['model_id']:
                    save_model(mid, dest_dir=models_dir, mformat=mformat)
        if 'models_predictions' in artifacts:
            predictions_dir = make_subdir("predictions", config)
            test = h2o.get_frame(frame_name('test', config))
            for mid in lb['model_id']:
                model = h2o.get_model(mid)
                save_predictions(model, test,
                                 dataset=dataset,
                                 config=config,
                                 predictions_file=os.path.join(predictions_dir, mid, 'predictions.csv'))
        if 'logs' in artifacts:
            logs_dir = make_subdir("logs", config)
            h2o.download_all_logs(dirname=logs_dir)
    except Exception:
        log.debug("Error when saving artifacts.", exc_info=True)
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)

    column_header = params.get('column_header')
    if len(column_header) > 0:
        df = df[int(column_header):]

    from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator
    glrm_model = H2OGeneralizedLowRankEstimator(
        expand_user_y=to_bool(params.get('expand_user_y')),
        gamma_x=float(params.get('gamma_x')),
        gamma_y=float(params.get('gamma_y')),
        ignore_const_cols=to_bool(params.get('ignore_const_cols')),
        impute_original=to_bool(params.get('impute_original')),
        init=str(params.get('init')),
        init_step_size=float(params.get('init_step_size')),
        k=int(params.get('k')),
        loss=str(params.get('loss')),
        max_iterations=int(params.get('max_iterations')),
        max_runtime_secs=float(params.get('max_runtime_secs')),
        max_updates=int(params.get('max_updates')),
        min_step_size=float(params.get('min_step_size')),
        multi_loss=str(params.get('multi_loss')),
        period=int(params.get('period')),
        recover_svd=to_bool(params.get('recover_svd')),
        regularization_x=str(params.get('regularization_x')),
        regularization_y=str(params.get('regularization_y')),
        score_each_iteration=to_bool(params.get('score_each_iteration')),
        seed=int(params.get('seed')),
        svd_method=str(params.get('svd_method')))
    glrm_model.train(training_frame=df)
    glrm_model.show()

    save_model(params, glrm_model.model_id)
    return {'frame_id': frame_id, 'model_id': glrm_model.model_id}
def transform(self, words, aggregate_method):
    """
    Transform words (or sequences of words) to vectors using a word2vec model.

    :param words: An H2OFrame made of a single column containing source words.
    :param str aggregate_method: Specifies how to aggregate sequences of words. If the method is `NONE`,
        no aggregation is performed and each input word is mapped to a single word-vector. If the method
        is `AVERAGE`, the input is treated as sequences of words delimited by NA. Each word of a sequence
        is internally mapped to a vector, and the vectors belonging to the same sequence are averaged and
        returned in the result.
    :returns: an H2OFrame of word vectors.

    :examples:

    >>> job_titles = h2o.import_file(("https://s3.amazonaws.com/h2o-public-test-data/smalldata/craigslistJobTitles.csv"),
    ...                              col_names=["category", "jobtitle"],
    ...                              col_types=["string", "string"],
    ...                              header=1)
    >>> STOP_WORDS = ["ax","i","you","edu","s","t","m","subject","can","lines","re","what",
    ...               "there","all","we","one","the","a","an","of","or","in","for","by","on",
    ...               "but","is","in","a","not","with","as","was","if","they","are","this","and","it","have",
    ...               "from","at","my","be","by","not","that","to","from","com","org","like","likes","so"]
    >>> words = job_titles.tokenize(" ")
    >>> words = words[(words.isna()) | (~ words.isin(STOP_WORDS)), :]
    >>> w2v_model = H2OWord2vecEstimator(epochs=10)
    >>> w2v_model.train(training_frame=words)
    >>> job_title_vecs = w2v_model.transform(words, aggregate_method="AVERAGE")
    """
    j = h2o.api("GET /3/Word2VecTransform",
                data={'model': self.model_id,
                      'words_frame': words.frame_id,
                      'aggregate_method': aggregate_method})
    return h2o.get_frame(j["vectors_frame"]["name"])
def anomaly(self, test_data, per_feature=False):
    """Obtain the reconstruction error for the input test_data.

    Parameters
    ----------
    test_data : H2OFrame
        The dataset upon which the reconstruction error is computed.
    per_feature : bool
        Whether to return the square reconstruction error per feature.
        Otherwise, return the mean square error.

    Returns
    -------
    Return the reconstruction error.
    """
    if test_data is None or test_data.nrow == 0:
        raise ValueError("Must specify test data")
    j = H2OConnection.post_json("Predictions/models/" + self.model_id + "/frames/" + test_data.frame_id,
                                reconstruction_error=True,
                                reconstruction_error_per_feature=per_feature)
    return h2o.get_frame(j["model_metrics"][0]["predictions"]["frame_id"]["name"])
def transform(self, frame, data_leakage_handling="None", noise=-1, seed=-1):
    """
    Apply transformation to `te_columns` based on the encoding maps generated during `train()` method call.

    :param H2OFrame frame: the frame to which target encoding transformations are applied.
    :param str data_leakage_handling: Supported options:

        1) "KFold" - encodings for a fold are generated based on out-of-fold data.
        2) "LeaveOneOut" - the current row's response value is subtracted from the pre-calculated per-level frequencies.
        3) "None" - no holdout; the whole frame is used for training.

    :param float noise: the amount of random noise added to the target encoding.
        This helps prevent overfitting. Defaults to 0.01 * range of y.
    :param int seed: a random seed used to generate draws from the uniform distribution for random noise.
        Defaults to -1.

    :example:
    >>> targetEncoder = TargetEncoder(encoded_columns=te_columns, target_column=responseColumnName,
    ...                               blended_avg=True, inflection_point=10, smoothing=20)
    >>> encodedTrain = targetEncoder.transform(frame=trainFrame, data_leakage_handling="None",
    ...                                        seed=1234, is_train_or_valid=True)
    """
    output = h2o.api("GET /3/TargetEncoderTransform", data={'model': self.model_id,
                                                            'frame': frame.key,
                                                            'data_leakage_handling': data_leakage_handling,
                                                            'noise': noise,
                                                            'seed': seed})
    return h2o.get_frame(output["name"])
def save_artifacts(automl, dataset, config):
    artifacts = config.framework_params.get('_save_artifacts', ['leaderboard'])
    try:
        lb = automl.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())
        if 'leaderboard' in artifacts:
            models_dir = output_subdir("models", config)
            write_csv(lb, os.path.join(models_dir, "leaderboard.csv"))
        if 'models' in artifacts:
            models_dir = output_subdir("models", config)
            all_models_se = next((mid for mid in lb['model_id'] if mid.startswith("StackedEnsemble_AllModels")), None)
            mformat = 'mojo' if 'mojos' in artifacts else 'json'
            if all_models_se and mformat == 'mojo':
                save_model(all_models_se, dest_dir=models_dir, mformat=mformat)
            else:
                for mid in lb['model_id']:
                    save_model(mid, dest_dir=models_dir, mformat=mformat)
            models_archive = os.path.join(models_dir, "models.zip")
            utils.zip_path(models_dir, models_archive)

            def delete(path, isdir):
                if path != models_archive and os.path.splitext(path)[1] in ['.json', '.zip']:
                    os.remove(path)
            utils.walk_apply(models_dir, delete, max_depth=0)
        if 'models_predictions' in artifacts:
            predictions_dir = output_subdir("predictions", config)
            test = h2o.get_frame(frame_name('test', config))
            for mid in lb['model_id']:
                model = h2o.get_model(mid)
                h2o_preds = model.predict(test)
                preds = extract_preds(h2o_preds, test, dataset=dataset)
                if preds.probabilities_labels is None:
                    preds.probabilities_labels = preds.h2o_labels
                write_preds(preds, os.path.join(predictions_dir, mid, 'predictions.csv'))
            utils.zip_path(predictions_dir, os.path.join(predictions_dir, "models_predictions.zip"))

            def delete(path, isdir):
                if isdir:
                    shutil.rmtree(path, ignore_errors=True)
            utils.walk_apply(predictions_dir, delete, max_depth=0)
        if 'logs' in artifacts:
            logs_dir = output_subdir("logs", config)
            logs_zip = os.path.join(logs_dir, "h2o_logs.zip")
            utils.zip_path(logs_dir, logs_zip)
            # h2o.download_all_logs(dirname=logs_dir)

            def delete(path, isdir):
                if isdir:
                    shutil.rmtree(path, ignore_errors=True)
                elif path != logs_zip:
                    os.remove(path)
            utils.walk_apply(logs_dir, delete, max_depth=0)
    except Exception:
        log.debug("Error when saving artifacts.", exc_info=True)
def execute(h2o, params, config):
    frame_id = config.get('frame_id')
    df = h2o.get_frame(frame_id)

    train = int(params.get('train_ratio'))
    test = params.get('test_ratio')
    if test is None or len(test) == 0:
        test = 0
    else:
        test = int(test)
    valid = params.get('valid_ratio')
    if valid is None or len(valid) == 0:
        valid = 0
    else:
        valid = int(valid)
    seed = params.get('seed')
    if seed is None or len(seed) == 0:
        seed = None
    else:
        seed = int(seed)

    train_ratio = train / (train + test + valid)
    test_ratio = test / (train + test + valid)
    valid_ratio = valid / (train + test + valid)

    if valid == 0 and test == 0:
        return {'frame_id': frame_id}
    elif valid == 0:
        df_train, df_test = df.split_frame(ratios=[train_ratio], seed=seed)
        df_valid = None
    elif test == 0:
        df_train, df_valid = df.split_frame(ratios=[train_ratio], seed=seed)
        df_test = None
    else:
        df_train, df_test, df_valid = df.split_frame(ratios=[train_ratio, test_ratio], seed=seed)

    train_frame_id = append_frame_id(frame_id, params.get('train_suffix'))
    h2o.assign(df_train, train_frame_id)
    if df_test is None:
        test_frame_id = None
    else:
        test_frame_id = append_frame_id(frame_id, params.get('test_suffix'))
        h2o.assign(df_test, test_frame_id)
    if df_valid is None:
        valid_frame_id = None
    else:
        valid_frame_id = append_frame_id(frame_id, params.get('valid_suffix'))
        h2o.assign(df_valid, valid_frame_id)

    return {
        'frame_id': train_frame_id,
        'train_frame_id': train_frame_id,
        'test_frame_id': test_frame_id,
        'valid_frame_id': valid_frame_id,
    }
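The ratio bookkeeping above wraps H2OFrame.split_frame; the underlying call is just this (hypothetical frame name):

# Split roughly 70/15/15; split_frame takes the first n-1 ratios and infers the last.
df_train, df_test, df_valid = df.split_frame(ratios=[0.70, 0.15], seed=42)
print(df_train.nrows, df_test.nrows, df_valid.nrows)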
def glrm_orthonnmf():
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(list(zip(*train.tolist())))

    print("Run GLRM with orthogonal non-negative regularization on X, non-negative regularization on Y")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(list(zip(*initial_y.tolist())))
    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o, loss="Quadratic",
                        regularization_x="OneSparse", regularization_y="NonNegative", gamma_x=1, gamma_y=1)
    glrm_h2o.show()

    print("Check that X and Y matrices are non-negative")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print("Check that columns of X are orthogonal")
    xtx = np.dot(np.transpose(fit_x_np), fit_x_np)
    offdiag = np.extract(1 - np.eye(k), xtx)
    assert np.all(offdiag == 0), "All off diagonal elements of X'X must equal zero"

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"

    print("Run GLRM with orthogonal non-negative regularization on both X and Y")
    initial_y = np.random.rand(n, k)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o, loss="Quadratic",
                        regularization_x="OneSparse", regularization_y="OneSparse", gamma_x=1, gamma_y=1)
    glrm_h2o.show()

    print("Check that X and Y matrices are non-negative")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print("Check that columns of X are orthogonal")
    xtx = np.dot(np.transpose(fit_x_np), fit_x_np)
    offdiag_x = np.extract(1 - np.eye(k), xtx)
    assert np.all(offdiag_x == 0), "All off diagonal elements of X'X must equal zero"

    print("Check that rows of Y are orthogonal")
    yyt = np.dot(fit_y_np, np.transpose(fit_y_np))
    offdiag_y = np.extract(1 - np.eye(k), yyt)
    assert np.all(offdiag_y == 0), "All off diagonal elements of YY' must equal zero"

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
print(acs_model)

# Plot objective function value each iteration
acs_model_score = acs_model.score_history()
plt.xlabel("Iteration")
plt.ylabel("Objective")
plt.title("Objective Function Value per Iteration")
plt.plot(acs_model_score["iteration"], acs_model_score["objective"])
plt.show()

# Embedding of ZCTAs into archetypes (X)
zcta_arch_x = h2o.get_frame(acs_model._model_json["output"]["representation_name"])
zcta_arch_x.head()

# Plot a few ZCTAs on the first two archetypes
idx = ((acs_zcta_col == "10065") |   # Manhattan, NY (Upper East Side)
       (acs_zcta_col == "11219") |   # Manhattan, NY (East Harlem)
       (acs_zcta_col == "66753") |   # McCune, KS
       (acs_zcta_col == "84104") |   # Salt Lake City, UT
       (acs_zcta_col == "94086") |   # Sunnyvale, CA
       (acs_zcta_col == "95014"))    # Cupertino, CA
city_arch = np.array(h2o.as_list(zcta_arch_x[idx, [0, 1]]))
plt.xlabel("First Archetype")
plt.ylabel("Second Archetype")
def encoding_map_frames(self):
    # Look up each encoding-map frame on the cluster by its key name
    return [get_frame(x['key']['name']) for x in self._encodingMap.frames]
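# Hypothetical usage sketch (the `te_model` name is illustrative): given a
# trained target-encoder wrapper exposing this helper, each per-column encoding
# map is an ordinary H2OFrame and can be inspected like any other frame.
def inspect_encoding_maps(te_model):
    for enc_frame in te_model.encoding_map_frames():
        print(enc_frame.frame_id, enc_frame.dim)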
def glrm_simplex():
    m = 1000
    n = 100
    k = 10
    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)

    def ind_list(k):
        # Indicator row: all zeros except a single 1 in a random position
        tmp = [0] * k
        tmp[np.random.randint(0, k)] = 1
        return tmp

    X = np.array([ind_list(k) for _ in range(m)])
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    print("Run GLRM with quadratic mixtures (simplex) regularization on X")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=k, init="User", user_y=initial_y_h2o, loss="Quadratic",
                                              regularization_x="Simplex", regularization_y="None",
                                              gamma_x=1, gamma_y=0)
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X matrix consists of rows within standard probability simplex")
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    def is_simplex(a):
        row_sum = sum(a)
        simplex = abs(row_sum - 1) < 1e-6
        assert simplex, "Got sum over row = " + str(row_sum) + ", but expected 1"
        return simplex

    np.apply_along_axis(is_simplex, 1, fit_x_np)

    print("Check final objective function value")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = np.array([[float(s) for s in list(row)[1:]] for row in fit_y])
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
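# --- Illustration (not part of the test; assumes only NumPy) ---
# What is_simplex() accepts: non-negative rows summing to one. Normalizing
# positive rows by their row sums is one simple way to land on the simplex.
import numpy as np

rng = np.random.default_rng(0)
a = rng.random((5, 10))
a = a / a.sum(axis=1, keepdims=True)
assert np.allclose(a.sum(axis=1), 1.0)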
# OPTIONAL: plot the objective function score at each iteration
model_score = model.score_history()
plt.xlabel("Iteration")
plt.ylabel("Objective")
plt.title("Objective Function Value per Iteration")
print(model_score)
plt.plot(model_score["iterations"], model_score["objective"])
plt.savefig('modelScore.jpg')  # save before show(); show() clears the current figure
plt.show()

# STEP 5: Recover the X and Y factors and save them to CSV files.
# Recovering X and Y is indirect: Y comes from proj_archetypes(), while X must
# be fetched from the cluster via its frame key. The outputs are X, a
# numRows x rank array, and Y, a rank x numCols array.
Y = model.proj_archetypes(Data)
x_key = model._model_json["output"]["representation_name"]
X = h2o.get_frame(x_key)
Y = h2o.as_list(Y)
X = h2o.as_list(X)
Y.to_csv('outputY.csv', index=False)
X.to_csv('outputX.csv', index=False)

# Shut down the cluster after use
h2o.shutdown(prompt=False)

# In[ ]:
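# Once outputX.csv and outputY.csv are written, the low-rank approximation can
# be rebuilt offline. A minimal sketch with pandas/NumPy -- it assumes the CSVs
# hold the raw factor values under a single header row, as written above.
import numpy as np
import pandas as pd

X_factor = pd.read_csv('outputX.csv').to_numpy()
Y_factor = pd.read_csv('outputY.csv').to_numpy()
approx = X_factor @ Y_factor  # numRows x numCols reconstruction of the original frame
print(approx.shape)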
def transform(self, frame, blending=None, inflection_point=None, smoothing=None, noise=None, as_training=False, **kwargs):
    """
    Apply transformation to `te_columns` based on the encoding maps generated during `train()` method call.

    :param H2OFrame frame: the frame on which to apply the target encoding transformations.
    :param boolean blending: If provided, this overrides the `blending` parameter on the model.
    :param float inflection_point: If provided, this overrides the `inflection_point` parameter on the model.
    :param float smoothing: If provided, this overrides the `smoothing` parameter on the model.
    :param float noise: If provided, this overrides the amount of random noise added to the target encoding
        defined on the model; this helps prevent overfitting.
    :param boolean as_training: Must be set to True when encoding the training frame. Defaults to False.

    :example:
    >>> titanic = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
    >>> predictors = ["home.dest", "cabin", "embarked"]
    >>> response = "survived"
    >>> titanic[response] = titanic[response].asfactor()
    >>> fold_col = "kfold_column"
    >>> titanic[fold_col] = titanic.kfold_column(n_folds=5, seed=1234)
    >>> titanic_te = H2OTargetEncoderEstimator(data_leakage_handling="leave_one_out",
    ...                                        inflection_point=35,
    ...                                        smoothing=25,
    ...                                        blending=True,
    ...                                        seed=1234)
    >>> titanic_te.train(x=predictors,
    ...                  y=response,
    ...                  training_frame=titanic)
    >>> transformed = titanic_te.transform(frame=titanic)
    """
    for k in kwargs:
        if k in ['seed', 'data_leakage_handling']:
            warnings.warn("`%s` is deprecated in `transform` method and will be ignored. "
                          "Instead, please ensure that it was set before training on the H2OTargetEncoderEstimator model." % k,
                          H2ODeprecationWarning)
        else:
            raise TypeError("transform() got an unexpected keyword argument '%s'" % k)

    if 'data_leakage_handling' in kwargs:
        dlh = kwargs['data_leakage_handling']
        assert_is_type(dlh, None, Enum("leave_one_out", "k_fold", "none"))
        if dlh is not None and dlh.lower() != "none":
            warnings.warn("Deprecated `data_leakage_handling=%s` is replaced by `as_training=True`. "
                          "Please update your code." % dlh, H2ODeprecationWarning)
            as_training = True

    params = dict(
        model=self.model_id,
        frame=frame.key,
        # always need to provide blending here as we can't represent an unset value
        blending=blending if blending is not None else self.blending,
        inflection_point=inflection_point,
        smoothing=smoothing,
        noise=noise,
        as_training=as_training,
    )

    output = h2o.api("GET /3/TargetEncoderTransform", data=params)
    return h2o.get_frame(output["name"])
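# Usage note (a sketch continuing the docstring's titanic example): set
# `as_training=True` only when re-encoding the frame the model was trained on,
# so the configured leakage-handling strategy is applied; `titanic_test` below
# is a hypothetical holdout frame, and `noise=0` disables noise at scoring time.
# >>> train_encoded = titanic_te.transform(frame=titanic, as_training=True)
# >>> test_encoded = titanic_te.transform(frame=titanic_test, noise=0)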
def execute(self, name, x, y, training_frame, validation_frame, test_frame, subset_coef):
    params = grid.ParameterGrid(self.params_grid)
    if self.params_grid is None or len(self.params_grid) == 0:
        params = ["default"]

    results = []
    dt = datetime.datetime

    # R setup: initialize rpy2 and attach to the running H2O cluster from R
    ri.initr()
    h2or = importr("h2o")
    h2o_ensemble = importr("h2oEnsemble")
    base = importr("base")
    stats = importr("stats")
    cvauc = importr("cvAUC")
    h2or.h2o_init(ip=config.hostname, port=config.port, startH2O=False)

    # Add some base learners
    with open("{}/R/wrappers.r".format(os.path.dirname(__file__)), "r") as f:
        ro.r("\n".join(f.readlines()))

    # Frames that must survive the per-iteration cleanup below
    keep_frames = re.compile("|".join([training_frame.frame_id,
                                       validation_frame.frame_id,
                                       test_frame.frame_id]) + "|.*\\.hex|py_.*")

    for p in params:
        row = [config.cluster, config.nthreads, name, subset_coef, self.name, str(p)]

        # Initialize the model: look up the shared frames from R by their ids
        init_time = dt.now()
        train = h2or.h2o_getFrame(training_frame.frame_id)
        valid = h2or.h2o_getFrame(validation_frame.frame_id)
        test = h2or.h2o_getFrame(test_frame.frame_id)
        init_time = dt.now() - init_time

        # Train the model
        train_time = dt.now()
        if p == "default":
            model = h2o_ensemble.h2o_ensemble(x=toR(x), y=y, training_frame=train, validation_frame=valid)
        else:
            p = {k: toR(v) for k, v in p.items()}
            model = h2o_ensemble.h2o_ensemble(x=toR(x), y=y, training_frame=train, validation_frame=valid, **p)
        train_time = dt.now() - train_time

        # Model metrics: predict in R, then pull the prediction frames into Python
        metrics_time = dt.now()
        RpredTrain = stats.predict(model, train)
        RpredValid = stats.predict(model, valid)
        RpredTest = stats.predict(model, test)
        predTrain = h2o.get_frame(h2or.h2o_getId(RpredTrain.rx2("pred"))[0])
        predValid = h2o.get_frame(h2or.h2o_getId(RpredValid.rx2("pred"))[0])
        predTest = h2o.get_frame(h2or.h2o_getId(RpredTest.rx2("pred"))[0])
        metrics_time = dt.now() - metrics_time

        row.append(init_time.total_seconds())
        row.append(train_time.total_seconds())
        row.append(metrics_time.total_seconds())
        row.append((init_time + train_time + metrics_time).total_seconds())

        datasets = [(RpredTrain, predTrain, train, training_frame),
                    (RpredValid, predValid, valid, validation_frame),
                    (RpredTest, predTest, test, test_frame)]
        append = row.append
        for pred_r_ptr, pred_py_ptr, data_r_ptr, data_py_ptr in datasets:
            acc = None
            err = None
            mse = ((pred_py_ptr - data_py_ptr[y]) ** 2).mean()[0]
            if training_frame[y].isfactor()[0]:
                acc = (pred_py_ptr == data_py_ptr[y]).mean()[0]
                err = 1.0 - acc
            auc = cvauc.AUC(base.attr(pred_r_ptr.rx2("pred"), "data")[2],
                            base.attr(data_r_ptr, "data").rx2(y))[0]
            # TODO: Add more metrics
            append(acc)
            append(err)
            append(None)  # F1()
            append(None)  # fnr()
            append(None)  # fpr()
            append(None)  # tnr()
            append(None)  # tpr()
            append(None)  # precision()
            append(None)  # recall()
            append(None)  # sensitivity()
            append(None)  # specificity()
            append(None)  # aic()
            append(auc)   # auc()
            append(None)  # logloss()
            append(None)  # mean_residual_deviance()
            append(mse)   # mse()
            append(None)  # null_degrees_of_freedom()
            append(None)  # null_deviance()
            append(None)  # r2()
            append(None)  # residual_degrees_of_freedom()
            append(None)  # residual_deviance()
            h2o.remove(pred_py_ptr)

        # Normalize NaN-ish values to None; list() materializes the result in
        # Python 3, and the lambda variable is renamed so it no longer shadows x
        row = list(map(lambda v: None if (isinstance(v, numbers.Number) and np.isnan(v)) or v in (u"NaN", "NaN") else v, row))
        persist(row)
        results.append(row)

        # Remove all temporary frames except the ones we still need
        for [frame] in h2o.ls().as_matrix():
            if not keep_frames.match(frame):
                h2o.remove(frame)

    df = pd.DataFrame(results, columns=config.Names)
    return df
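# --- Illustration (not part of the benchmark; assumes only NumPy) ---
# The hand-rolled metrics in the loop above reduce to simple elementwise
# arithmetic; here is the same mse/acc/err pattern on plain arrays.
import numpy as np

pred = np.array([1, 0, 1, 1])
actual = np.array([1, 0, 0, 1])
mse = float(np.mean((pred - actual) ** 2))  # mean squared error
acc = float(np.mean(pred == actual))        # accuracy (factor responses only)
err = 1.0 - acc
print(mse, acc, err)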
def glrm_unitonesparse():
    m = 1000
    n = 100
    k = 10
    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)

    def ind_list(k):
        # Indicator row: all zeros except a single 1 in a random position
        tmp = [0] * k
        tmp[np.random.randint(0, k)] = 1
        return tmp

    X = np.array([ind_list(k) for _ in range(m)])
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(list(zip(*train.tolist())))

    print("Run GLRM with unit one-sparse regularization on X")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(list(zip(*initial_y.tolist())))
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=k, init="User", user_y=initial_y_h2o, loss="Quadratic",
                                              regularization_x="UnitOneSparse", regularization_y="None",
                                              gamma_x=1, gamma_y=0)
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X matrix consists of rows of basis vectors")
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    def is_basis(a):
        zeros = np.where(a == 0)[0].size
        ones = np.where(a == 1)[0].size
        basis = ones == 1 and (zeros + ones) == k
        assert basis, "Got " + str(ones) + " ones and " + str(zeros) + " zeros, but expected all zeros except a single 1"
        return basis

    np.apply_along_axis(is_basis, 1, fit_x_np)

    print("Check final objective function value")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = np.array([[float(s) for s in list(row)[1:]] for row in fit_y])
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
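# Standalone-runner sketch: H2O pyunit tests conventionally end with a footer
# like this (assumes pyunit_utils from the H2O test suite, as in the tests above).
if __name__ == "__main__":
    pyunit_utils.standalone_test(glrm_unitonesparse)
else:
    glrm_unitonesparse()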
                                          regularization_y = "None",
                                          max_iterations = 1000,
                                          min_step_size = 1e-6)
model_c1.train(training_frame=dfh2o_c1)
model_c1.show()

# Print importance of each component of the GLRM model
model_c1._model_json["output"]["importance"]

# Split the feature matrix into the product of two matrices X and Y.
# The matrix X has the same number of rows as the original feature matrix,
# but a reduced number of columns representing the original features.
# GLRM matrix factors X and Y for cluster 1:
X_matrix_c1 = h2o.get_frame(model_c1._model_json["output"]["representation_name"])
print(X_matrix_c1)
Y_matrix_c1 = model_c1._model_json["output"]["archetypes"]
print(Y_matrix_c1)

# Model generated for cluster 2: age and limit balance
model_c2 = H2OGeneralizedLowRankEstimator(k = 1,
                                          loss = "Absolute",
                                          multi_loss = "Categorical",
                                          transform = "Standardize",
                                          regularization_x = "None",
                                          regularization_y = "None",
                                          max_iterations = 1000,
                                          min_step_size = 1e-6)
model_c2.train(training_frame=dfh2o_c2)
model_c2.show()
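# Sanity-check sketch (illustrative; reuses the names above and NumPy as np):
# multiply the recovered factors back together -- the product approximates the
# transformed cluster-1 frame. The archetypes table rows carry a leading label
# column, hence the [1:] slice, mirroring the GLRM tests earlier in this file.
X_c1 = np.array(h2o.as_list(X_matrix_c1))
Y_c1 = np.array([[float(v) for v in row[1:]] for row in Y_matrix_c1.cell_values])
approx_c1 = X_c1.dot(Y_c1)  # same shape as the standardized cluster-1 frame
print(approx_c1.shape)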