def glrm_cancar():
    """Smoke-test GLRM (rank 4, quadratic loss, no regularization) on the
    cancar data under both the PlusPlus and the SVD initialization schemes."""
    print("Importing cancar.csv data...")
    cancar = h2o.upload_file(pyunit_utils.locate("smalldata/glrm_test/cancar.csv"))
    cancar.describe()

    # Identical model settings on each pass; only the initialization differs.
    for scheme in ("PlusPlus", "SVD"):
        print("Building GLRM model with init = " + scheme + ":\n")
        model = h2o.glrm(x=cancar, k=4, transform="NONE", init=scheme,
                         loss="Quadratic", regularization_x="None",
                         regularization_y="None", max_iterations=1000)
        model.show()
def glrm_cancar():
    """Smoke-test GLRM (rank 4, quadratic loss, no regularization) on cancar.csv
    with PlusPlus and then SVD initialization.

    Fix: Python-2 ``print`` statements converted to ``print()`` calls so this
    definition parses under Python 3, matching the sibling tests in this file.
    """
    print("Importing cancar.csv data...")
    cancarH2O = h2o.upload_file(tests.locate("smalldata/glrm_test/cancar.csv"))
    cancarH2O.describe()

    print("Building GLRM model with init = PlusPlus:\n")
    glrm_pp = h2o.glrm(x=cancarH2O, k=4, transform="NONE", init="PlusPlus",
                       loss="Quadratic", regularization_x="None",
                       regularization_y="None", max_iterations=1000)
    glrm_pp.show()

    print("Building GLRM model with init = SVD:\n")
    glrm_svd = h2o.glrm(x=cancarH2O, k=4, transform="NONE", init="SVD",
                        loss="Quadratic", regularization_x="None",
                        regularization_y="None", max_iterations=1000)
    glrm_svd.show()
def glrm_arrests():
    """Run GLRM with a user-supplied initial Y matrix (via ``user_points``) on
    de-meaned USArrests data, recovering the SVD.

    Fix: Python-2 ``print`` statements converted to ``print()`` calls so this
    definition parses under Python 3, matching the sibling tests in this file.
    """
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(h2o.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    print("H2O initial Y matrix:\n")
    initial_y = [[5.412, 65.24, -7.54, -0.032],
                 [2.212, 92.24, -17.54, 23.268],
                 [0.312, 123.24, 14.46, 9.768],
                 [1.012, 19.24, -15.54, -1.732]]
    initial_y_h2o = h2o.H2OFrame(initial_y)
    initial_y_h2o.show()

    print("H2O GLRM on de-meaned data with quadratic loss:\n")
    glrm_h2o = h2o.glrm(x=arrestsH2O, k=4, transform="DEMEAN", loss="Quadratic",
                        gamma_x=0, gamma_y=0, init="User",
                        user_points=initial_y_h2o, recover_svd=True)
    glrm_h2o.show()
def glrm_iris():
    """Exercise GLRM on iris under every transform with randomized rank and
    gamma penalties, then impute the original data from the X*Y decomposition."""
    print("Importing iris_wheader.csv data...")
    iris = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    iris.describe()

    for transform in ("NONE", "DEMEAN", "DESCALE", "STANDARDIZE"):
        # Randomize the hyper-parameters on every pass.
        k = random.randint(1, 7)
        gamma_x = random.uniform(0, 1)
        gamma_y = random.uniform(0, 1)
        print("H2O GLRM with rank k = " + str(k) + ", gamma_x = " + str(gamma_x)
              + ", gamma_y = " + str(gamma_y) + ", transform = " + transform)
        model = h2o.glrm(x=iris, k=k, loss="Quadratic", gamma_x=gamma_x,
                         gamma_y=gamma_y, transform=transform)
        model.show()

        print("Impute original data from XY decomposition")
        imputed = model.predict(iris)
        imputed.describe()
        # Drop the X-representation frame so repeated runs do not leak keys.
        h2o.remove(model._model_json['output']['representation_name'])
def glrm_iris():
    """Run GLRM on iris for each data transform, drawing the rank and both
    gamma penalties at random, then impute the data back from X*Y."""
    print("Importing iris_wheader.csv data...")
    frame = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    frame.describe()

    transforms = ["NONE", "DEMEAN", "DESCALE", "STANDARDIZE"]
    for trans in transforms:
        rank = random.randint(1, 7)
        gx = random.uniform(0, 1)
        gy = random.uniform(0, 1)
        banner = ("H2O GLRM with rank k = " + str(rank) + ", gamma_x = "
                  + str(gx) + ", gamma_y = " + str(gy) + ", transform = " + trans)
        print(banner)
        fitted = h2o.glrm(x=frame, k=rank, loss="Quadratic",
                          gamma_x=gx, gamma_y=gy, transform=trans)
        fitted.show()

        print("Impute original data from XY decomposition")
        reconstruction = fitted.predict(frame)
        reconstruction.describe()
        # Clean up the per-model X frame before the next iteration.
        h2o.remove(fitted._model_json["output"]["representation_name"])
def glrm_set_loss_by_col():
    """Fit GLRM with per-column losses (Absolute on col 0, Huber on col 3,
    Quadratic elsewhere) on USArrests and verify the reported objective
    against a NumPy recomputation of the loss."""
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrests_np = np.array(h2o.as_list(arrestsH2O))
    arrestsH2O.describe()

    print("H2O GLRM with loss by column = Absolute, Quadratic, Quadratic, Huber")
    model = h2o.glrm(x=arrestsH2O, k=3, loss="Quadratic",
                     loss_by_col=["Absolute", "Huber"], loss_by_col_idx=[0, 3],
                     regularization_x="None", regularization_y="None")
    model.show()

    # Pull the fitted Y (archetypes) and X (representation) back as NumPy arrays.
    y_cells = model._model_json['output']['archetypes'].cell_values
    y_np = np.array([[float(v) for v in list(row)[1:]] for row in y_cells])
    x_frame = h2o.get_frame(model._model_json['output']['representation_name'])
    x_np = np.array(h2o.as_list(x_frame))

    print("Check final objective function value")
    resid = arrests_np - np.dot(x_np, y_np)

    def huber(a):
        # Huber loss with delta = 1.
        return a * a / 2 if abs(a) <= 1 else abs(a) - 0.5

    huber = np.vectorize(huber)
    total = np.sum(np.absolute(resid[:, 0]) + np.square(resid[:, 1])
                   + np.square(resid[:, 2]) + huber(resid[:, 3]))
    reported = model._model_json['output']['objective']
    assert abs(reported - total) < 1e-6, \
        "Final objective was " + str(reported) + " but should equal " + str(total)
def glrm_benign():
    """Run GLRM with SVD initialization at ranks 8, 10, 12, 14 on benign.csv.

    Fix: Python-2 ``print`` statements converted to ``print()`` calls so this
    definition parses under Python 3, matching the sibling tests in this file.
    """
    print("Importing benign.csv data...")
    benignH2O = h2o.upload_file(tests.locate("smalldata/logreg/benign.csv"))
    benignH2O.describe()
    for i in range(8, 16, 2):
        print("H2O GLRM with rank " + str(i) + " decomposition:\n")
        glrm_h2o = h2o.glrm(x=benignH2O, k=i, init="SVD", recover_svd=True)
        glrm_h2o.show()
def glrm_benign():
    """Run GLRM with SVD initialization across even ranks 8..14 on benign.csv.

    Fix: Python-2 ``print`` statements converted to ``print()`` calls so this
    definition parses under Python 3.
    """
    print("Importing benign.csv data...")
    benignH2O = h2o.upload_file(tests.locate("smalldata/logreg/benign.csv"))
    benignH2O.describe()
    for i in range(8, 16, 2):
        print("H2O GLRM with rank " + str(i) + " decomposition:\n")
        glrm_h2o = h2o.glrm(x=benignH2O, k=i, init="SVD", recover_svd=True)
        glrm_h2o.show()
def glrm_nnmf():
    """NNMF via GLRM: non-negative regularization on both X and Y, then verify
    non-negativity of the factors, the reported objective, and XY imputation.

    Fixes: Python-2 ``print`` statements converted to ``print()`` calls so this
    definition parses under Python 3; ``train.__sub__(fit_xy)`` replaced with
    the idiomatic ``train - fit_xy`` (identical semantics).
    """
    m = 1000
    n = 100
    k = 10
    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    print("Run GLRM with non-negative regularization")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o,
                        loss="Quadratic", regularization_x="NonNegative",
                        regularization_y="NonNegative", gamma_x=1, gamma_y=1)
    glrm_h2o.show()

    print("Check that X and Y matrices are non-negative")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = np.array([[float(s) for s in list(row)[1:]] for row in fit_y])
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['loading_key']['name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def glrm_unitonesparse():
    """GLRM with unit one-sparse regularization on X: every row of the fitted X
    must be a standard basis vector. Verifies X's structure, the reported
    objective, and XY imputation.

    Fixes for Python 3: ``print`` statements -> ``print()``, ``xrange`` ->
    ``range``, and ``zip(...)`` wrapped in ``list(...)`` before being handed to
    ``H2OFrame`` (``zip`` is a lazy iterator on Python 3).
    """
    m = 1000
    n = 100
    k = 10
    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)

    def ind_list(k):
        # A length-k indicator row with a single randomly-placed 1.
        tmp = [0] * k
        tmp[np.random.randint(0, k)] = 1
        return tmp

    X = np.array([ind_list(k) for x in range(m)])
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(list(zip(*train.tolist())))

    print("Run GLRM with unit one-sparse regularization on X")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(list(zip(*initial_y.tolist())))
    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o,
                        loss="Quadratic", regularization_x="UnitOneSparse",
                        regularization_y="None", gamma_x=1, gamma_y=0)
    glrm_h2o.show()

    print("Check that X matrix consists of rows of basis vectors")
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    def is_basis(a):
        # A basis row has exactly one 1 and k-1 zeros.
        zeros = np.where(a == 0)[0].size
        ones = np.where(a == 1)[0].size
        basis = ones == 1 and (zeros + ones) == k
        assert basis, "Got " + str(ones) + " ones and " + str(zeros) + " zeros, but expected all zeros except a single 1"
        return basis

    np.apply_along_axis(is_basis, 1, fit_x_np)

    print("Check final objective function value")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = np.array([[float(s) for s in list(row)[1:]] for row in fit_y])
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def glrm_simplex():
    """GLRM with quadratic-mixture (simplex) regularization on X: each row of
    the fitted X must lie on the standard probability simplex (sum to 1).

    Fixes: Python-2 ``print``/``xrange`` modernized for Python 3, and the
    ``is_simplex`` assertion message now wraps ``row_sum`` in ``str()`` — the
    original concatenated a float onto a str, which would raise TypeError
    exactly when the assertion should fire.
    """
    m = 1000
    n = 100
    k = 10
    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)

    def ind_list(k):
        # A length-k indicator row with a single randomly-placed 1.
        tmp = [0] * k
        tmp[np.random.randint(0, k)] = 1
        return tmp

    X = np.array([ind_list(k) for x in range(m)])
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    print("Run GLRM with quadratic mixtures (simplex) regularization on X")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o,
                        loss="Quadratic", regularization_x="Simplex",
                        regularization_y="None", gamma_x=1, gamma_y=0)
    glrm_h2o.show()

    print("Check that X matrix consists of rows within standard probability simplex")
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['loading_key']['name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    def is_simplex(a):
        row_sum = sum(a)
        simplex = abs(row_sum - 1) < 1e-6
        assert simplex, "Got sum over row = " + str(row_sum) + ", but expected 1"
        return simplex

    np.apply_along_axis(is_simplex, 1, fit_x_np)

    print("Check final objective function value")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = np.array([[float(s) for s in list(row)[1:]] for row in fit_y])
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def glrm_prostate_miss():
    """Sweep missing-data ratios 10%..90% on prostate_cat.csv, fit GLRM on each
    corrupted copy with the full data as validation, sanity-check the metric
    entry counts, and report train/validation errors per ratio."""
    missing_ratios = np.arange(0.1, 1, 0.1).tolist()

    print("Importing prostate_cat.csv data and saving for validation...")
    prostate_full = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"),
                                    na_strings=["NA"] * 8)
    prostate_full.describe()

    # Count observed (non-NA) cells in the untouched frame.
    totnas = 0
    for col in range(prostate_full.ncol):
        totnas += prostate_full[col].isna().sum()
    totobs = prostate_full.nrow * prostate_full.ncol - totnas

    train_numerr = [0] * len(missing_ratios)
    valid_numerr = [0] * len(missing_ratios)
    train_caterr = [0] * len(missing_ratios)
    valid_caterr = [0] * len(missing_ratios)

    for i, ratio in enumerate(missing_ratios):
        print("Importing prostate_cat.csv and inserting {0}% missing entries".format(100 * ratio))
        corrupted = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
        corrupted = corrupted.insert_missing_values(fraction=ratio)
        corrupted.describe()

        print("H2O GLRM with {0}% missing entries".format(100 * ratio))
        model = h2o.glrm(x=corrupted, validation_frame=prostate_full, k=8,
                         ignore_const_cols=False, loss="Quadratic",
                         gamma_x=0.5, gamma_y=0.5, regularization_x="L1",
                         regularization_y="L1", init="SVD",
                         max_iterations=2000, min_step_size=1e-6)
        model.show()

        # Check imputed data and error metrics
        out = model._model_json['output']
        train_metrics = out['training_metrics']._metric_json
        valid_metrics = out['validation_metrics']._metric_json
        assert valid_metrics['numcnt'] >= train_metrics['numcnt'], "Number of non-missing numeric entries in training data should be less than or equal to validation data"
        assert valid_metrics['catcnt'] >= train_metrics['catcnt'], "Number of non-missing categorical entries in training data should be less than or equal to validation data"
        assert (train_metrics['numcnt'] + valid_metrics['numcnt']) < totobs, "Total non-missing numeric entries in training and validation data was {0}, but should be less than {1}".format(train_metrics['numcnt'] + valid_metrics['numcnt'], totobs)
        assert (valid_metrics['numcnt'] + valid_metrics['catcnt']) == totobs, "Number of non-missing entries in validation data was {0}, but should be {1}".format(valid_metrics['numcnt'] + valid_metrics['catcnt'], totobs)

        train_numerr[i] = train_metrics['numerr']
        valid_numerr[i] = valid_metrics['numerr']
        train_caterr[i] = train_metrics['caterr']
        valid_caterr[i] = valid_metrics['caterr']
        # Drop the X-representation frame before the next pass.
        h2o.remove(out['representation_name'])

    for ratio, tr, va in zip(missing_ratios, train_numerr, valid_numerr):
        print("Missing ratio: {0}% --> Training numeric error: {1}\tValidation numeric error: {2}".format(ratio * 100, tr, va))
    for ratio, tr, va in zip(missing_ratios, train_caterr, valid_caterr):
        print("Missing ratio: {0}% --> Training categorical error: {1}\tValidation categorical error: {2}".format(ratio * 100, tr, va))
def glrm_prostate_miss():
    """Sweep missing-data ratios 10%..90% on prostate_cat.csv, fit GLRM on each
    corrupted copy with the full data as validation, sanity-check metric entry
    counts, and report train/validation errors per ratio.

    Fix: Python-2 ``print`` statements converted to ``print()`` calls so this
    definition parses under Python 3, matching the sibling tests in this file.
    """
    missing_ratios = np.arange(0.1, 1, 0.1).tolist()
    print("Importing prostate_cat.csv data and saving for validation...")
    prostate_full = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"),
                                    na_strings=["NA"] * 8)
    prostate_full.describe()

    # Count observed (non-NA) cells in the untouched frame.
    totnas = 0
    for i in range(prostate_full.ncol):
        totnas = totnas + prostate_full[i].isna().sum()
    totobs = prostate_full.nrow * prostate_full.ncol - totnas

    train_numerr = [0] * len(missing_ratios)
    valid_numerr = [0] * len(missing_ratios)
    train_caterr = [0] * len(missing_ratios)
    valid_caterr = [0] * len(missing_ratios)

    for i in range(len(missing_ratios)):
        ratio = missing_ratios[i]
        print("Importing prostate_cat.csv and inserting {0}% missing entries".format(100 * ratio))
        prostate_miss = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
        prostate_miss = prostate_miss.insert_missing_values(fraction=ratio)
        prostate_miss.describe()

        print("H2O GLRM with {0}% missing entries".format(100 * ratio))
        prostate_glrm = h2o.glrm(x=prostate_miss, validation_frame=prostate_full,
                                 k=8, ignore_const_cols=False, loss="Quadratic",
                                 gamma_x=0.5, gamma_y=0.5, regularization_x="L1",
                                 regularization_y="L1", init="SVD",
                                 max_iterations=2000, min_step_size=1e-6)
        prostate_glrm.show()

        # Check imputed data and error metrics
        train_numcnt = prostate_glrm._model_json['output']['training_metrics']._metric_json['numcnt']
        valid_numcnt = prostate_glrm._model_json['output']['validation_metrics']._metric_json['numcnt']
        train_catcnt = prostate_glrm._model_json['output']['training_metrics']._metric_json['catcnt']
        valid_catcnt = prostate_glrm._model_json['output']['validation_metrics']._metric_json['catcnt']
        assert valid_numcnt >= train_numcnt, "Number of non-missing numeric entries in training data should be less than or equal to validation data"
        assert valid_catcnt >= train_catcnt, "Number of non-missing categorical entries in training data should be less than or equal to validation data"
        assert (train_numcnt + valid_numcnt) < totobs, "Total non-missing numeric entries in training and validation data was {0}, but should be less than {1}".format(train_numcnt + valid_numcnt, totobs)
        assert (valid_numcnt + valid_catcnt) == totobs, "Number of non-missing entries in validation data was {0}, but should be {1}".format(valid_numcnt + valid_catcnt, totobs)

        train_numerr[i] = prostate_glrm._model_json['output']['training_metrics']._metric_json['numerr']
        valid_numerr[i] = prostate_glrm._model_json['output']['validation_metrics']._metric_json['numerr']
        train_caterr[i] = prostate_glrm._model_json['output']['training_metrics']._metric_json['caterr']
        valid_caterr[i] = prostate_glrm._model_json['output']['validation_metrics']._metric_json['caterr']
        h2o.remove(prostate_glrm._model_json['output']['representation_name'])

    for i in range(len(missing_ratios)):
        print("Missing ratio: {0}% --> Training numeric error: {1}\tValidation numeric error: {2}".format(missing_ratios[i] * 100, train_numerr[i], valid_numerr[i]))
    for i in range(len(missing_ratios)):
        print("Missing ratio: {0}% --> Training categorical error: {1}\tValidation categorical error: {2}".format(missing_ratios[i] * 100, train_caterr[i], valid_caterr[i]))
def glrm_arrests():
    """Run GLRM with a user-supplied initial Y matrix (via ``user_y``) on
    de-meaned USArrests data, recovering the SVD.

    Fix: Python-2 ``print`` statements converted to ``print()`` calls so this
    definition parses under Python 3.
    """
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()

    print("H2O initial Y matrix:\n")
    initial_y = [[5.412, 65.24, -7.54, -0.032],
                 [2.212, 92.24, -17.54, 23.268],
                 [0.312, 123.24, 14.46, 9.768],
                 [1.012, 19.24, -15.54, -1.732]]
    initial_y_h2o = h2o.H2OFrame(initial_y)
    initial_y_h2o.show()

    print("H2O GLRM on de-meaned data with quadratic loss:\n")
    glrm_h2o = h2o.glrm(x=arrestsH2O, k=4, transform="DEMEAN", loss="Quadratic",
                        gamma_x=0, gamma_y=0, init="User", user_y=initial_y_h2o,
                        recover_svd=True)
    glrm_h2o.show()
def glrm_set_loss_by_col():
    """Run GLRM on USArrests with per-column loss overrides on columns 0 and 3.

    Fix: Python-2 ``print`` statements converted to ``print()`` calls so this
    definition parses under Python 3.
    """
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(h2o.locate("smalldata/pca_test/USArrests.csv"))
    arrestsH2O.describe()
    print("H2O GLRM with loss by column = L1, Quadratic, Quadratic, Huber")
    glrm_h2o = h2o.glrm(x=arrestsH2O, k=3, loss="Quadratic",
                        loss_by_col=["L1", "Huber"], loss_by_col_idx=[0, 3],
                        regularization_x="None", regularization_y="None")
    glrm_h2o.show()
def glrm_arrests_miss():
    """Sweep missing-data ratios 10%..90% on USArrests, fit GLRM with the full
    frame as validation, and verify error metrics and non-missing entry counts.

    Fix: Python-2 ``print`` statements converted to ``print()`` calls so this
    definition parses under Python 3, matching the sibling tests in this file.
    """
    missing_ratios = np.arange(0.1, 1, 0.1).tolist()
    print("Importing USArrests.csv data and saving for validation...")
    arrests_full = h2o.upload_file(h2o.locate("smalldata/pca_test/USArrests.csv"))
    arrests_full.describe()
    totobs = arrests_full.nrow * arrests_full.ncol
    train_err = [0] * len(missing_ratios)
    valid_err = [0] * len(missing_ratios)

    for i in range(len(missing_ratios)):
        ratio = missing_ratios[i]
        print("Importing USArrests.csv and inserting {0}% missing entries".format(100 * ratio))
        arrests_miss = h2o.upload_file(h2o.locate("smalldata/pca_test/USArrests.csv"))
        arrests_miss = arrests_miss.insert_missing_values(fraction=ratio)
        arrests_miss.describe()

        print("H2O GLRM with {0}% missing entries".format(100 * ratio))
        arrests_glrm = h2o.glrm(x=arrests_miss, validation_frame=arrests_full,
                                k=4, ignore_const_cols=False, loss="Quadratic",
                                regularization_x="None", regularization_y="None",
                                init="PlusPlus", max_iterations=10,
                                min_step_size=1e-6)
        arrests_glrm.show()

        # Check imputed data and error metrics
        glrm_obj = arrests_glrm._model_json['output']['objective']
        train_numerr = arrests_glrm._model_json['output']['training_metrics']._metric_json['numerr']
        train_caterr = arrests_glrm._model_json['output']['training_metrics']._metric_json['caterr']
        valid_numerr = arrests_glrm._model_json['output']['validation_metrics']._metric_json['numerr']
        valid_caterr = arrests_glrm._model_json['output']['validation_metrics']._metric_json['caterr']
        assert abs(train_numerr - glrm_obj) < 1e-3, "Numeric error on training data was " + str(train_numerr) + " but should equal final objective " + str(glrm_obj)
        assert train_caterr == 0, "Categorical error on training data was " + str(train_caterr) + " but should be zero"
        assert valid_caterr == 0, "Categorical error on validation data was " + str(valid_caterr) + " but should be zero"

        train_numcnt = arrests_glrm._model_json['output']['training_metrics']._metric_json['numcnt']
        valid_numcnt = arrests_glrm._model_json['output']['validation_metrics']._metric_json['numcnt']
        assert valid_numcnt > train_numcnt, "Number of non-missing numerical entries in training data should be less than validation data"
        assert valid_numcnt == totobs, "Number of non-missing numerical entries in validation data was " + str(valid_numcnt) + " but should be " + str(totobs)

        train_err[i] = train_numerr
        valid_err[i] = valid_numerr
        h2o.remove(arrests_glrm._model_json['output']['loading_key']['name'])

    for i in range(len(missing_ratios)):
        print("Missing ratio: {0}% --> Training error: {1}\tValidation error: {2}".format(missing_ratios[i] * 100, train_err[i], valid_err[i]))
def glrm_arrests_miss():
    """Sweep missing-data ratios 10%..90% on USArrests, fit GLRM against the
    full frame as validation, verify error metrics and entry counts, and
    report train/validation numeric error per ratio."""
    missing_ratios = np.arange(0.1, 1, 0.1).tolist()

    print("Importing USArrests.csv data and saving for validation...")
    full_frame = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    full_frame.describe()
    total_entries = full_frame.nrow * full_frame.ncol

    train_err = [0] * len(missing_ratios)
    valid_err = [0] * len(missing_ratios)

    for i, ratio in enumerate(missing_ratios):
        print("Importing USArrests.csv and inserting {0}% missing entries".format(100 * ratio))
        corrupted = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
        corrupted = corrupted.insert_missing_values(fraction=ratio)
        corrupted.describe()

        print("H2O GLRM with {0}% missing entries".format(100 * ratio))
        model = h2o.glrm(x=corrupted, validation_frame=full_frame, k=4,
                         ignore_const_cols=False, loss="Quadratic",
                         regularization_x="None", regularization_y="None",
                         init="PlusPlus", max_iterations=10, min_step_size=1e-6)
        model.show()

        # Check imputed data and error metrics
        out = model._model_json['output']
        objective = out['objective']
        train_metrics = out['training_metrics']._metric_json
        valid_metrics = out['validation_metrics']._metric_json
        assert abs(train_metrics['numerr'] - objective) < 1e-3, "Numeric error on training data was " + str(train_metrics['numerr']) + " but should equal final objective " + str(objective)
        assert train_metrics['caterr'] == 0, "Categorical error on training data was " + str(train_metrics['caterr']) + " but should be zero"
        assert valid_metrics['caterr'] == 0, "Categorical error on validation data was " + str(valid_metrics['caterr']) + " but should be zero"

        assert valid_metrics['numcnt'] > train_metrics['numcnt'], "Number of non-missing numerical entries in training data should be less than validation data"
        assert valid_metrics['numcnt'] == total_entries, "Number of non-missing numerical entries in validation data was " + str(valid_metrics['numcnt']) + " but should be " + str(total_entries)

        train_err[i] = train_metrics['numerr']
        valid_err[i] = valid_metrics['numerr']
        # Drop the X-representation frame before the next pass.
        h2o.remove(out['representation_name'])

    for ratio, tr, va in zip(missing_ratios, train_err, valid_err):
        print("Missing ratio: {0}% --> Training error: {1}\tValidation error: {2}".format(ratio * 100, tr, va))
def glrm_set_loss_by_col():
    """Fit GLRM on USArrests with per-column losses (Absolute on col 0, Huber
    on col 3, Quadratic elsewhere) and verify the reported objective against a
    NumPy recomputation."""
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsPy = np.array(h2o.as_list(arrestsH2O))
    arrestsH2O.describe()

    print("H2O GLRM with loss by column = Absolute, Quadratic, Quadratic, Huber")
    glrm_h2o = h2o.glrm(x=arrestsH2O, k=3, loss="Quadratic",
                        loss_by_col=["Absolute", "Huber"],
                        loss_by_col_idx=[0, 3],
                        regularization_x="None", regularization_y="None")
    glrm_h2o.show()

    # Recover the fitted factors as NumPy arrays.
    cells = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = np.array([[float(s) for s in list(row)[1:]] for row in cells])
    fit_x_np = np.array(h2o.as_list(
        h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])))

    print("Check final objective function value")
    diff = arrestsPy - np.dot(fit_x_np, fit_y_np)
    # Huber (delta = 1) on the last column, expressed vectorized.
    huber_col = np.where(np.abs(diff[:, 3]) <= 1,
                         diff[:, 3] * diff[:, 3] / 2,
                         np.abs(diff[:, 3]) - 0.5)
    obj_val = np.sum(np.absolute(diff[:, 0]) + np.square(diff[:, 1])
                     + np.square(diff[:, 2]) + huber_col)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    assert abs(glrm_obj - obj_val) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(obj_val)
def glrm_nnmf():
    """NNMF via GLRM: non-negative regularization on both factors; verify
    non-negativity, the reported objective, and XY imputation."""
    m, n, k = 1000, 100, 10
    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    # H2OFrame is fed column-wise here, hence the transpose via zip.
    train_h2o = h2o.H2OFrame(list(zip(*train.tolist())))

    print("Run GLRM with non-negative regularization")
    seed_y = np.random.rand(n, k)
    seed_y_h2o = h2o.H2OFrame(seed_y.tolist())
    model = h2o.glrm(x=train_h2o, k=k, init="User", user_y=seed_y_h2o,
                     loss="Quadratic", regularization_x="NonNegative",
                     regularization_y="NonNegative", gamma_x=1, gamma_y=1)
    model.show()

    print("Check that X and Y matrices are non-negative")
    y_cells = model._model_json['output']['archetypes'].cell_values
    y_np = np.array([[float(s) for s in list(row)[1:]] for row in y_cells])
    x_frame = h2o.get_frame(model._model_json['output']['representation_name'])
    x_np = np.array(h2o.as_list(x_frame))
    assert np.all(y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(x_np >= 0), "X must contain only non-negative elements"

    print("Check final objective function value")
    xy = np.dot(x_np, y_np)
    reported_obj = model._model_json['output']['objective']
    sse = np.sum(np.square(train - xy))
    assert abs(reported_obj - sse) < 1e-6, "Final objective was " + str(reported_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    imputed = np.array(h2o.as_list(model.predict(train_h2o)))
    assert np.allclose(imputed, xy), "Imputation for numerics with quadratic loss should equal XY product"
    metrics = model._model_json['output']['training_metrics']._metric_json
    assert abs(metrics['numerr'] - reported_obj) < 1e-3, "Numeric error was " + str(metrics['numerr']) + " but should equal final objective " + str(reported_obj)
    assert metrics['caterr'] == 0, "Categorical error was " + str(metrics['caterr']) + " but should be zero"
def glrm_orthonnmf():
    """Orthogonal NNMF via GLRM: OneSparse regularization on X (then on both X
    and Y), verifying non-negativity, orthogonality, the reported objective,
    and XY imputation for each fit."""
    m, n, k = 1000, 100, 10
    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    def verify(model, check_y_rows):
        # Shared verification: non-negativity, orthogonality, objective, imputation.
        print("Check that X and Y matrices are non-negative")
        y_cells = model._model_json['output']['archetypes'].cell_values
        fit_y = np.array([[float(s) for s in list(row)[1:]] for row in y_cells])
        x_frame = h2o.get_frame(model._model_json['output']['representation_name'])
        fit_x = np.array(h2o.as_list(x_frame))
        assert np.all(fit_y >= 0), "Y must contain only non-negative elements"
        assert np.all(fit_x >= 0), "X must contain only non-negative elements"

        print("Check that columns of X are orthogonal")
        xtx = np.dot(np.transpose(fit_x), fit_x)
        assert np.all(np.extract(1 - np.eye(k), xtx) == 0), \
            "All off diagonal elements of X'X must equal zero"

        if check_y_rows:
            print("Check that rows of Y are orthogonal")
            yyt = np.dot(fit_y, np.transpose(fit_y))
            assert np.all(np.extract(1 - np.eye(k), yyt) == 0), \
                "All off diagonal elements of YY' must equal zero"

        print("Check final objective function value")
        fit_xy = np.dot(fit_x, fit_y)
        glrm_obj = model._model_json['output']['objective']
        sse = np.sum(np.square(train - fit_xy))
        assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

        print("Impute XY and check error metrics")
        pred = np.array(h2o.as_list(model.predict(train_h2o)))
        assert np.allclose(pred, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
        numerr = model._model_json['output']['training_metrics']._metric_json['numerr']
        caterr = model._model_json['output']['training_metrics']._metric_json['caterr']
        assert abs(numerr - glrm_obj) < 1e-3, "Numeric error was " + str(numerr) + " but should equal final objective " + str(glrm_obj)
        assert caterr == 0, "Categorical error was " + str(caterr) + " but should be zero"

    print("Run GLRM with orthogonal non-negative regularization on X, non-negative regularization on Y")
    initial_y_h2o = h2o.H2OFrame(np.random.rand(k, n).tolist())
    model = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o,
                     loss="Quadratic", regularization_x="OneSparse",
                     regularization_y="NonNegative", gamma_x=1, gamma_y=1)
    model.show()
    verify(model, check_y_rows=False)

    print("Run GLRM with orthogonal non-negative regularization on both X and Y")
    initial_y_h2o = h2o.H2OFrame(np.random.rand(k, n).tolist())
    model = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o,
                     loss="Quadratic", regularization_x="OneSparse",
                     regularization_y="OneSparse", gamma_x=1, gamma_y=1)
    model.show()
    verify(model, check_y_rows=True)
def glrm_set_loss_by_col_rand():
    """Randomly assign per-column loss functions for GLRM on prostate_cat.csv.

    Verifies that (a) a fully specified loss_by_col works without indices,
    (b) invalid loss/index combinations are rejected by the backend, and
    (c) a shuffled partial specification with explicit indices trains.
    """
    NUM_LOSS = ["Quadratic", "Absolute", "Huber", "Poisson", "Periodic"]
    CAT_LOSS = ["Categorical", "Ordinal"]
    NUM_COLS = [1, 5, 6, 7]
    CAT_COLS = [0, 2, 3, 4]

    print("Importing prostate_cat.csv data...")
    prostateH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"),
        na_strings=["NA"] * 8)
    prostateH2O.describe()

    def _expect_glrm_error(msg, **kwargs):
        # BUG FIX: the original wrapped `assert False` inside a bare
        # `except: pass`, which swallowed the AssertionError and made these
        # negative tests impossible to fail. The assert now sits outside
        # the try so only the expected GLRM failure is caught.
        try:
            h2o.glrm(x=prostateH2O, k=5, **kwargs)
        except Exception:
            return
        assert False, msg

    # Fully specify every column's loss function (no need for loss_by_col_idx)
    loss_all = [
        rd.sample(NUM_LOSS, k=1)[0] if x in NUM_COLS
        else rd.sample(CAT_LOSS, k=1)[0]
        for x in range(0, 8)
    ]
    print("Run GLRM with loss_by_col = [" + ', '.join(loss_all) + "]")
    glrm_h2o = h2o.glrm(x=prostateH2O, k=5, loss_by_col=loss_all)
    glrm_h2o.show()

    # Randomly set columns and loss functions
    cat_size = rd.sample(range(1, 5), 1)
    num_size = rd.sample(range(1, 5), 1)
    cat_idx = np.random.choice(CAT_COLS, size=cat_size, replace=False)
    num_idx = np.random.choice(NUM_COLS, size=num_size, replace=False)
    loss_by_col_cat = np.random.choice(CAT_LOSS, size=cat_size, replace=True)
    loss_by_col_num = np.random.choice(NUM_LOSS, size=num_size, replace=True)

    loss_idx_all = cat_idx.tolist() + num_idx.tolist()
    loss_all = loss_by_col_cat.tolist() + loss_by_col_num.tolist()
    # Permute losses and indices in same way for testing
    loss_combined = list(zip(loss_all, loss_idx_all))
    rd.shuffle(loss_combined)
    loss_all[:], loss_idx_all[:] = zip(*loss_combined)

    if len(loss_all) < prostateH2O.ncol:
        _expect_glrm_error(
            "Expected GLRM to throw error since column indices not specified",
            loss_by_col=loss_all)
        _expect_glrm_error(
            "Expected GLRM to throw error since losses for columns not specified",
            loss_by_col_idx=loss_idx_all)

    _expect_glrm_error(
        "Expected GLRM to throw error since not all column indices specified",
        loss_by_col=["Absolute", "Ordinal", "Huber"], loss_by_col_idx=[1, 2])
    _expect_glrm_error(
        "Expected GLRM to throw error since not all losses for columns specified",
        loss_by_col=["Absolute", "Ordinal"], loss_by_col_idx=[1, 2, 5])
    _expect_glrm_error(
        "Expected GLRM to throw error since column index 8 is out of bounds (zero indexing)",
        loss_by_col="Absolute", loss_by_col_idx=8)
    _expect_glrm_error(
        "Expected GLRM to throw error since numeric loss cannot apply to categorical column",
        loss_by_col=rd.sample(NUM_LOSS, 1), loss_by_col_idx=rd.sample(CAT_COLS, 1))
    _expect_glrm_error(
        "Expected GLRM to throw error since categorical loss cannot apply to numeric column",
        loss_by_col=rd.sample(CAT_LOSS, 1), loss_by_col_idx=rd.sample(NUM_COLS, 1))

    print("Run GLRM with loss_by_col = [" + ', '.join(loss_all) +
          "] and loss_by_col_idx = [" +
          ', '.join([str(a) for a in loss_idx_all]) + "]")
    glrm_h2o = h2o.glrm(x=prostateH2O, k=5, loss_by_col=loss_all,
                        loss_by_col_idx=loss_idx_all)
    glrm_h2o.show()
def glrm_set_loss_by_col_rand():
    """Randomly assign per-column loss functions for GLRM on prostate_cat.csv.

    Verifies that (a) a fully specified loss_by_col works without indices,
    (b) invalid loss/index combinations are rejected by the backend, and
    (c) a shuffled partial specification with explicit indices trains.
    """
    NUM_LOSS = ["Quadratic", "Absolute", "Huber", "Poisson", "Periodic"]
    CAT_LOSS = ["Categorical", "Ordinal"]
    NUM_COLS = [1, 5, 6, 7]
    CAT_COLS = [0, 2, 3, 4]

    print("Importing prostate_cat.csv data...")
    prostateH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"),
        na_strings=["NA"] * 8)
    prostateH2O.describe()

    def _expect_glrm_error(msg, **kwargs):
        # BUG FIX: the original wrapped `assert False` inside a bare
        # `except: pass`, which swallowed the AssertionError and made these
        # negative tests impossible to fail. The assert now sits outside
        # the try so only the expected GLRM failure is caught.
        try:
            h2o.glrm(x=prostateH2O, k=5, **kwargs)
        except Exception:
            return
        assert False, msg

    # Fully specify every column's loss function (no need for loss_by_col_idx)
    loss_all = [rd.sample(NUM_LOSS, k=1)[0] if x in NUM_COLS
                else rd.sample(CAT_LOSS, k=1)[0] for x in range(0, 8)]
    print("Run GLRM with loss_by_col = [" + ', '.join(loss_all) + "]")
    glrm_h2o = h2o.glrm(x=prostateH2O, k=5, loss_by_col=loss_all)
    glrm_h2o.show()

    # Randomly set columns and loss functions
    cat_size = rd.sample(range(1, 5), 1)
    num_size = rd.sample(range(1, 5), 1)
    cat_idx = np.random.choice(CAT_COLS, size=cat_size, replace=False)
    num_idx = np.random.choice(NUM_COLS, size=num_size, replace=False)
    loss_by_col_cat = np.random.choice(CAT_LOSS, size=cat_size, replace=True)
    loss_by_col_num = np.random.choice(NUM_LOSS, size=num_size, replace=True)

    loss_idx_all = cat_idx.tolist() + num_idx.tolist()
    loss_all = loss_by_col_cat.tolist() + loss_by_col_num.tolist()
    # Permute losses and indices in same way for testing
    loss_combined = list(zip(loss_all, loss_idx_all))
    rd.shuffle(loss_combined)
    loss_all[:], loss_idx_all[:] = list(zip(*loss_combined))

    if len(loss_all) < prostateH2O.ncol:
        _expect_glrm_error(
            "Expected GLRM to throw error since column indices not specified",
            loss_by_col=loss_all)
        _expect_glrm_error(
            "Expected GLRM to throw error since losses for columns not specified",
            loss_by_col_idx=loss_idx_all)

    _expect_glrm_error(
        "Expected GLRM to throw error since not all column indices specified",
        loss_by_col=["Absolute", "Ordinal", "Huber"], loss_by_col_idx=[1, 2])
    _expect_glrm_error(
        "Expected GLRM to throw error since not all losses for columns specified",
        loss_by_col=["Absolute", "Ordinal"], loss_by_col_idx=[1, 2, 5])
    _expect_glrm_error(
        "Expected GLRM to throw error since column index 8 is out of bounds (zero indexing)",
        loss_by_col="Absolute", loss_by_col_idx=8)
    _expect_glrm_error(
        "Expected GLRM to throw error since numeric loss cannot apply to categorical column",
        loss_by_col=rd.sample(NUM_LOSS, 1), loss_by_col_idx=rd.sample(CAT_COLS, 1))
    _expect_glrm_error(
        "Expected GLRM to throw error since categorical loss cannot apply to numeric column",
        loss_by_col=rd.sample(CAT_LOSS, 1), loss_by_col_idx=rd.sample(NUM_COLS, 1))

    print("Run GLRM with loss_by_col = [" + ', '.join(loss_all) +
          "] and loss_by_col_idx = [" +
          ', '.join([str(a) for a in loss_idx_all]) + "]")
    glrm_h2o = h2o.glrm(x=prostateH2O, k=5, loss_by_col=loss_all,
                        loss_by_col_idx=loss_idx_all)
    glrm_h2o.show()
def glrm_simplex():
    """GLRM with Simplex regularization on X.

    Trains on a product of one-hot rows X and random Y; checks every row of
    the fitted X lies on the standard probability simplex, and that the
    objective and imputation agree with the X*Y reconstruction.
    """
    m = 1000
    n = 100
    k = 10
    print("Uploading random uniform matrix with rows = " + str(m) +
          " and cols = " + str(n))
    Y = np.random.rand(k, n)

    def ind_list(k):
        # One-hot row of length k: a single 1 at a random position.
        tmp = [0] * k
        tmp[np.random.randint(0, k)] = 1
        return tmp

    X = np.array([ind_list(k) for _ in range(m)])
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    print("Run GLRM with quadratic mixtures (simplex) regularization on X")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o,
                        loss="Quadratic", regularization_x="Simplex",
                        regularization_y="None", gamma_x=1, gamma_y=0)
    glrm_h2o.show()

    print("Check that X matrix consists of rows within standard probability simplex")
    fit_x = h2o.get_frame(
        glrm_h2o._model_json['output']['loading_key']['name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    def is_simplex(a):
        row_sum = sum(a)
        simplex = abs(row_sum - 1) < 1e-6
        # BUG FIX: original concatenated the float row_sum directly onto a
        # str, so a failing row raised TypeError instead of the intended
        # AssertionError message.
        assert simplex, "Got sum over row = " + str(row_sum) + ", but expected 1"
        return simplex

    np.apply_along_axis(is_simplex, 1, fit_x_np)

    print("Check final objective function value")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    # First cell of each archetype row is a label; keep only the numerics.
    fit_y_np = np.array([[float(s) for s in list(row)[1:]] for row in fit_y])
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train - fit_xy))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + \
        str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), \
        "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output'][
        'training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + \
        str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + \
        str(glrm_caterr) + " but should be zero"
def glrm_orthonnmf():
    """GLRM orthogonal non-negative matrix factorization tests.

    Runs GLRM twice on a random low-rank matrix: first with OneSparse on X
    and NonNegative on Y, then with OneSparse on both. Each run is checked
    for non-negative factors, orthogonality (X columns; Y rows in the second
    run), objective value equal to the reconstruction SSE, and imputation
    equal to the X*Y product. The duplicated validation logic from the
    original is factored into one helper.
    """
    m = 1000
    n = 100
    k = 10
    print("Uploading random uniform matrix with rows = " + str(m) +
          " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    def _check_fit(glrm_h2o, check_y_rows_orthogonal):
        # Shared validation for both regularization setups.
        print("Check that X and Y matrices are non-negative")
        fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
        # First cell of each archetype row is a label; keep only numerics.
        fit_y_np = np.array([[float(s) for s in list(row)[1:]] for row in fit_y])
        fit_x = h2o.get_frame(
            glrm_h2o._model_json['output']['loading_key']['name'])
        fit_x_np = np.array(h2o.as_list(fit_x))
        assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
        assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

        print("Check that columns of X are orthogonal")
        xtx = np.dot(np.transpose(fit_x_np), fit_x_np)
        offdiag_x = np.extract(1 - np.eye(k), xtx)
        assert np.all(offdiag_x == 0), \
            "All off diagonal elements of X'X must equal zero"

        if check_y_rows_orthogonal:
            print("Check that rows of Y are orthogonal")
            yyt = np.dot(fit_y_np, np.transpose(fit_y_np))
            offdiag_y = np.extract(1 - np.eye(k), yyt)
            assert np.all(offdiag_y == 0), \
                "All off diagonal elements of YY' must equal zero"

        print("Check final objective function value")
        fit_xy = np.dot(fit_x_np, fit_y_np)
        glrm_obj = glrm_h2o._model_json['output']['objective']
        sse = np.sum(np.square(train - fit_xy))
        assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + \
            str(glrm_obj) + " but should equal " + str(sse)

        print("Impute XY and check error metrics")
        pred_h2o = glrm_h2o.predict(train_h2o)
        pred_np = np.array(h2o.as_list(pred_h2o))
        assert np.allclose(pred_np, fit_xy), \
            "Imputation for numerics with quadratic loss should equal XY product"
        glrm_numerr = glrm_h2o._model_json['output'][
            'training_metrics']._metric_json['numerr']
        glrm_caterr = glrm_h2o._model_json['output'][
            'training_metrics']._metric_json['caterr']
        assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + \
            str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
        assert glrm_caterr == 0, "Categorical error was " + \
            str(glrm_caterr) + " but should be zero"

    print("Run GLRM with orthogonal non-negative regularization on X, "
          "non-negative regularization on Y")
    initial_y_h2o = h2o.H2OFrame(np.random.rand(k, n).tolist())
    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o,
                        loss="Quadratic", regularization_x="OneSparse",
                        regularization_y="NonNegative", gamma_x=1, gamma_y=1)
    glrm_h2o.show()
    _check_fit(glrm_h2o, check_y_rows_orthogonal=False)

    print("Run GLRM with orthogonal non-negative regularization on both X and Y")
    initial_y_h2o = h2o.H2OFrame(np.random.rand(k, n).tolist())
    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o,
                        loss="Quadratic", regularization_x="OneSparse",
                        regularization_y="OneSparse", gamma_x=1, gamma_y=1)
    glrm_h2o.show()
    _check_fit(glrm_h2o, check_y_rows_orthogonal=True)