def expr_slicing(ip, port):
    """Check the slicing forms of H2O expressions built from the iris frame.

    Connects to the H2O instance at ``ip``/``port``, imports the iris data
    and verifies selected cells of ``expr[int]``, ``expr[int, int]``,
    ``expr[int, slice]``, ``expr[slice, int]`` and ``expr[slice, slice]``.
    Raises AssertionError ("incorrect values") on the first mismatch.
    """
    def cells_match(table, expected):
        # Lazy generator keeps the short-circuit behaviour of an `and` chain.
        return all(abs(table[row][col] - value) < 1e-10
                   for row, col, value in expected)

    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris.show()

    # expr[int] (column slice), expr is pending
    res = 2 - iris
    first_col = h2o.as_list(res[0])
    assert cells_match(first_col, [(3, 0, -2.6), (17, 0, -3.1), (24, 0, -2.8)]), \
        "incorrect values"

    # expr[int, int], expr is remote
    res.eager()
    single_cell = h2o.as_list(res[13, 3])
    assert cells_match(single_cell, [(0, 0, 1.9)]), "incorrect values"

    # expr[int, slice], expr is remote
    one_row = h2o.as_list(res[12, 0:3])
    assert cells_match(one_row, [(0, 0, -2.8), (0, 1, -1.0), (0, 2, 0.6), (0, 3, 1.9)]), \
        "incorrect values"

    # expr[slice, int], expr is remote
    one_col = h2o.as_list(res[5:8, 1])
    assert cells_match(one_col, [(0, 0, -1.9), (1, 0, -1.4), (2, 0, -1.4), (3, 0, -0.9)]), \
        "incorrect values"

    # expr[slice, slice], expr is pending
    res = iris * 2
    sub_block = h2o.as_list(res[5:8, 0:3])
    assert cells_match(sub_block, [(0, 0, 10.8), (1, 1, 6.8), (2, 2, 3.0), (3, 3, 0.4)]), \
        "incorrect values"
def expr_as_list():
    """Check ``h2o.as_list(..., use_pandas=False)`` on expressions and local frames.

    The expression results are transposed with ``zip(*...)`` before the
    spot checks; the locally built frames are indexed directly.
    """
    def near(cell, expected):
        return abs(float(cell) - expected) < 1e-10

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # multiple rows and columns
    whole = list(zip(*h2o.as_list(2 - iris, use_pandas=False)))
    assert near(whole[0][4], -2.6) and near(whole[1][5], -1.6) and near(whole[2][11], 0.5), \
        "incorrect values"

    # single column
    expr = 2 - iris
    single = list(zip(*h2o.as_list(expr[0], use_pandas=False)))
    assert near(single[0][4], -2.6) and near(single[0][18], -3.1) and near(single[0][25], -2.8), \
        "incorrect values"

    # local data
    one_row = h2o.as_list(h2o.H2OFrame([[1, 2, 3]]), use_pandas=False)
    assert float(one_row[1][2]) == 3, "incorrect values"

    two_rows = h2o.as_list(h2o.H2OFrame([[1, 2, 3], [4, 5, 6]]), use_pandas=False)
    assert float(two_rows[2][1]) == 5, "incorrect values"
def expr_as_list(ip, port):
    """Check ``h2o.as_list(..., use_pandas=False)`` against a running H2O.

    Connects to ``ip``/``port``, imports iris and spot-checks cells of a
    whole-frame expression, a single-column expression and two frames built
    from local Python lists.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    # multiple rows and columns
    whole = h2o.as_list(2 - iris, use_pandas=False)
    for row, col, want in ((4, 0, -2.6), (5, 1, -1.6), (11, 2, 0.5)):
        assert abs(float(whole[row][col]) - want) < 1e-10, "incorrect values"

    # single column
    expr = 2 - iris
    single = h2o.as_list(expr[0], use_pandas=False)
    for row, want in ((4, -2.6), (18, -3.1), (25, -2.8)):
        assert abs(float(single[row][0]) - want) < 1e-10, "incorrect values"

    # local data
    flat = h2o.as_list(h2o.H2OFrame(python_obj=[1, 2, 3]), use_pandas=False)
    assert float(flat[1][2]) == 3, "incorrect values"

    nested = h2o.as_list(h2o.H2OFrame(python_obj=[[1, 2, 3], [4, 5, 6]]), use_pandas=False)
    assert float(nested[2][1]) == 5, "incorrect values"
def auc(m, v, t):
    """Return ``roc_auc_score`` of model ``m`` on frame ``v`` for target column ``t``.

    Both the target column and the model predictions are pulled out of H2O
    as pandas objects and passed on as their ``.values`` arrays.
    """
    target_frame = v[t]
    prediction_frame = m.predict(v)
    labels = h2o.as_list(target_frame, use_pandas=True).values
    scores = h2o.as_list(prediction_frame, use_pandas=True).values
    return roc_auc_score(labels, scores)
def frame_slicing(ip, port):
    """Check the slicing forms of ``H2OFrame`` on iris, prostate and airlines.

    Connects to ``ip``/``port``, imports the three data sets, and verifies
    selected cells for each indexing form. Raises AssertionError
    ("incorrect values") on the first mismatch.
    """
    def check(table, expected):
        for row, col, want in expected:
            assert abs(table[row][col] - want) < 1e-10, "incorrect values"

    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k.zip"))
    for frame in (iris, prostate, airlines):
        frame.show()

    # H2OFrame[int] (column slice)
    check(h2o.as_list(iris[0]), [(8, 0, 4.4)])

    # H2OFrame[int, int]
    check(h2o.as_list(prostate[13, 3]), [(0, 0, 1)])

    # H2OFrame[int, slice]
    check(h2o.as_list(airlines[12, 0:3]), [(0, 0, 1987), (0, 1, 10), (0, 2, 29)])

    # H2OFrame[slice, int]
    check(h2o.as_list(iris[5:8, 1]), [(0, 0, 3.9), (1, 0, 3.4), (2, 0, 3.4), (3, 0, 2.9)])

    # H2OFrame[slice, slice]
    check(h2o.as_list(prostate[5:8, 0:3]), [(0, 0, 6), (1, 1, 0), (2, 2, 61)])
def expr_as_list(ip, port):
    """Verify list conversion of H2O expressions and locally built frames.

    Connects to the H2O instance at ``ip``/``port`` and spot-checks cells
    returned by ``h2o.as_list(..., use_pandas=False)``.
    """
    def within_tol(cell, expected):
        return abs(float(cell) - expected) < 1e-10

    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    # multiple rows and columns
    table = h2o.as_list(2 - iris, use_pandas=False)
    checks = ((4, 0, -2.6), (5, 1, -1.6), (11, 2, 0.5))
    assert all(within_tol(table[r][c], v) for r, c, v in checks), "incorrect values"

    # single column
    shifted = 2 - iris
    column = h2o.as_list(shifted[0], use_pandas=False)
    checks = ((4, 0, -2.6), (18, 0, -3.1), (25, 0, -2.8))
    assert all(within_tol(column[r][c], v) for r, c, v in checks), "incorrect values"

    # local data
    frm = h2o.as_list(h2o.H2OFrame(python_obj=[1, 2, 3]), use_pandas=False)
    assert float(frm[1][2]) == 3, "incorrect values"

    frm = h2o.as_list(h2o.H2OFrame(python_obj=[[1, 2, 3], [4, 5, 6]]), use_pandas=False)
    assert float(frm[2][1]) == 5, "incorrect values"
def expr_as_list():
    """Verify ``h2o.as_list`` (plain-list mode) on expressions and local data.

    Expression results are transposed with ``zip(*...)`` before indexing, so
    the checks address them as ``res[i][j]`` on the transposed table.
    """
    tolerance = 1e-10
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # multiple rows and columns
    expr = 2 - iris
    transposed = list(zip(*h2o.as_list(expr, use_pandas=False)))
    for outer, inner, want in ((0, 4, -2.6), (1, 5, -1.6), (2, 11, 0.5)):
        assert abs(float(transposed[outer][inner]) - want) < tolerance, "incorrect values"

    # single column
    expr = 2 - iris
    transposed = list(zip(*h2o.as_list(expr[0], use_pandas=False)))
    for inner, want in ((4, -2.6), (18, -3.1), (25, -2.8)):
        assert abs(float(transposed[0][inner]) - want) < tolerance, "incorrect values"

    # local data
    local = h2o.as_list(h2o.H2OFrame([[1, 2, 3]]), use_pandas=False)
    assert float(local[1][2]) == 3, "incorrect values"

    local = h2o.as_list(h2o.H2OFrame([[1, 2, 3], [4, 5, 6]]), use_pandas=False)
    assert float(local[2][1]) == 5, "incorrect values"
def group_by(ip, port):
    """Smoke-test ``h2o.group_by`` and compare a few aggregates with pandas.

    Connects to the H2O cluster at ``ip``/``port``. First every combination
    of aggregate function, NA handling mode and numeric iris column is run
    through ``h2o.group_by`` without checking the result. Then min/max/mean/sum
    per ``class`` are compared against ``pandas`` ``groupby`` using the
    matching numpy function.

    Raises AssertionError when an H2O aggregate differs from the pandas one
    by 1e-06 or more.
    """
    # Connect to a pre-existing cluster
    h2o.init(ip, port)

    h2o_iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    pd_iris = pd.read_csv(h2o.locate("smalldata/iris/iris_wheader.csv"))

    h2o_agg_funcs = ["count", "count_unique", "first", "last", "min", "max", "mean", "avg", "sd", "stdev", "var",
                     "sum", "ss"]
    na_handling = ["ignore", "rm", "all"]
    col_names = h2o_iris.col_names()[0:4]

    # smoke test
    for a in h2o_agg_funcs:
        for n in na_handling:
            for c in col_names:
                h2o.group_by(h2o_iris, ["class"], {"foo": [a, c, n]})

    # h2o/pandas/numpy comparison test
    h2o_np_agg_dict = {"min": np.min, "max": np.max, "mean": np.mean, "sum": np.sum}
    for k, np_func in h2o_np_agg_dict.items():
        for c in col_names:
            h2o_res = h2o.group_by(h2o_iris, ["class"], {"foo": [k, c, "all"]})
            pd_res = pd_iris.groupby("class")[c].aggregate(np_func)
            # Fetch the result once; it does not change between the rows checked below.
            h2o_rows = h2o.as_list(h2o_res)
            for i in range(3):
                h2o_val = h2o_rows[i][1]
                # The first cell of each H2O row is used as the index into the pandas result.
                pd_val = pd_res.values[int(h2o_rows[i][0])]
                assert abs(h2o_val - pd_val) < 1e-06, \
                    "check unsuccessful! h2o computed {0} and pandas computed {1}. expected equal aggregate {2} values between h2o and pandas on column {3}".format(h2o_val, pd_val, k, c)
def frame_as_list():
    """Check ``h2o.as_list(frame, use_pandas=False)`` on three imported data sets.

    For iris, prostate and airlines one row of the returned list is compared
    against known values in its first three positions.
    """
    def check_row(table, row, expected):
        for col, want in enumerate(expected):
            assert abs(float(table[row][col]) - want) < 1e-10, "incorrect values"

    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_file(path=tests.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_file(path=tests.locate("smalldata/airlines/allyears2k.zip"))

    check_row(h2o.as_list(iris, use_pandas=False), 9, (4.4, 2.9, 1.4))
    check_row(h2o.as_list(prostate, use_pandas=False), 7, (7, 0, 68))
    check_row(h2o.as_list(airlines, use_pandas=False), 4, (1987, 10, 18))
def glrm_set_loss_by_col():
    """Fit a GLRM with per-column losses and re-derive its final objective.

    Columns 0 and 3 use Absolute and Huber loss (set through ``loss_by_col``
    / ``loss_by_col_idx``); the others fall back to Quadratic. The objective
    reported by the model is compared with the value recomputed in numpy
    from the fitted X and Y factors.
    """
    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))
    arrestsPy = np.array(h2o.as_list(arrestsH2O))
    arrestsH2O.describe()

    print("H2O GLRM with loss by column = Absolute, Quadratic, Quadratic, Huber")
    glrm_h2o = h2o.glrm(x=arrestsH2O, k=3, loss="Quadratic", loss_by_col=["Absolute", "Huber"],
                        loss_by_col_idx=[0, 3], regularization_x="None", regularization_y="None")
    glrm_h2o.show()

    # Archetypes (Y): skip the first cell of every row, keep the rest as floats.
    archetype_rows = glrm_h2o._model_json['output']['archetypes'].cell_values
    fit_y_np = np.array([[float(cell) for cell in list(row)[1:]] for row in archetype_rows])
    # Representation (X) is stored as a separate frame referenced by name.
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    print("Check final objective function value")
    residual = arrestsPy.__sub__(np.dot(fit_x_np, fit_y_np))

    def huber_scalar(a):
        if abs(a) <= 1:
            return a*a/2
        return abs(a)-0.5

    huber = np.vectorize(huber_scalar)
    per_row = np.absolute(residual[:, 0]) + np.square(residual[:, 1]) + np.square(residual[:, 2])
    per_row = per_row + huber(residual[:, 3])
    obj_val = np.sum(per_row)

    glrm_obj = glrm_h2o._model_json['output']['objective']
    assert abs(glrm_obj - obj_val) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(obj_val)
def predict_churn(State, AccountLength, AreaCode, Phone, IntlPlan, VMailPlan, VMailMessage, DayMins, DayCalls,
                  DayCharge, EveMins, EveCalls, EveCharge, NightMins, NightCalls, NightCharge, IntlMins, IntlCalls,
                  IntlCharge, CustServCalls):
    """Score one customer with the saved model and append the result to MySQL.

    Starts/connects to H2O, loads the model stored under ``AutoML-leader``,
    scores a one-row frame built from the arguments, appends the prediction
    frame to the ``predictions`` table of the ``customers`` database and
    returns a human-readable summary string.

    Database credentials and host are read from the ``BRETT_MYSQL_USERNAME``,
    ``BRETT_MYSQL_PASSWORD`` and ``BRETT_MYSQL_IP`` environment variables;
    a missing variable raises KeyError.
    """
    # connect to the model scoring service
    h2o.init(nthreads=1, max_mem_size=1, start_h2o=True, strict_version_check=False)

    # open the downloaded model
    churn_model = h2o.load_model(path='AutoML-leader')

    # define a feature vector to evaluate with the model
    feature_values = {
        'State': State,
        'Account Length': AccountLength,
        'Area Code': AreaCode,
        'Phone': Phone,
        'Int\'l Plan': IntlPlan,
        'VMail Plan': VMailPlan,
        'VMail Message': VMailMessage,
        'Day Mins': DayMins,
        'Day Calls': DayCalls,
        'Day Charge': DayCharge,
        'Eve Mins': EveMins,
        'Eve Calls': EveCalls,
        'Eve Charge': EveCharge,
        'Night Mins': NightMins,
        'Night Calls': NightCalls,
        'Night Charge': NightCharge,
        'Intl Mins': IntlMins,
        'Intl Calls': IntlCalls,
        'Intl Charge': IntlCharge,
        'CustServ Calls': CustServCalls,
    }
    single_row = pd.DataFrame(feature_values, index=[0])

    # evaluate the feature vector using the model
    scored = churn_model.predict(h2o.H2OFrame(single_row))
    # Index 1 is the row read for the prediction; index 0 is skipped
    # (presumably the header row - confirm against h2o.as_list).
    scored_rows = h2o.as_list(scored, use_pandas=False)
    result_row = scored_rows[1]
    prediction = result_row[0]
    probability_churn = result_row[1]
    probability_retain = result_row[2]

    db_user = os.environ['BRETT_MYSQL_USERNAME']
    db_password = os.environ['BRETT_MYSQL_PASSWORD']
    db_host = os.environ['BRETT_MYSQL_IP']
    engine = create_engine("mysql+mysqldb://" + db_user + ":" + db_password + "@" + db_host + "/customers")

    scored_df = h2o.as_list(scored, use_pandas=True)
    scored_df.to_sql(con=engine, name='predictions', if_exists='append')

    summary_parts = [
        "Prediction: ", str(prediction),
        " |Probability to Churn: ", str(probability_churn),
        " |Probability to Retain: ", str(probability_retain),
    ]
    return "".join(summary_parts)
def glrm_nnmf():
    """Fit a GLRM with non-negative regularization on X and Y and validate it.

    Training data is the product of two random uniform matrices. The test
    checks that both fitted factors are non-negative, that the reported
    objective equals the sum of squared errors of X*Y, that prediction
    reproduces X*Y, and that the training metrics agree with the objective.
    """
    m, n, k = 1000, 100, 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    print("Run GLRM with non-negative regularization")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = H2OGeneralizedLowRankEstimator(
        k=k,
        init="User",
        user_y=initial_y_h2o,
        loss="Quadratic",
        regularization_x="NonNegative",
        regularization_y="NonNegative",
        gamma_x=1,
        gamma_y=1,
    )
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X and Y matrices are non-negative")
    archetype_rows = glrm_h2o._model_json['output']['archetypes'].cell_values
    # Skip the first cell of every archetype row and convert the rest to float.
    fit_y_np = np.array([[float(cell) for cell in list(row)[1:]] for row in archetype_rows])
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_np = np.array(h2o.as_list(glrm_h2o.predict(train_h2o)))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, \
        "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def svd_1_golden():
    """Compare H2O's SVD of USArrests with reference values computed in R.

    Singular values must match within 1e-6. Singular vectors are compared
    on absolute values within 1e-5, so a sign flip of a component is not
    treated as a difference.

    Written so it runs on both Python 2 and Python 3: single-argument
    ``print(...)`` calls, and ``zip`` results are materialized before
    ``pop``.
    """
    def check_vectors(r_rows, h2o_rows):
        # zip() stops at the shorter input, so only the rows/entries listed
        # in the R reference are compared.
        for rl, hl in zip(r_rows, h2o_rows):
            for r, h in zip(rl, hl):
                assert abs(abs(r) - abs(float(h))) < 1e-5, "H2O got {0}, but R got {1}".format(h, r)

    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    print("Compare with SVD")
    fitH2O = h2o.svd(x=arrestsH2O[0:4], nv=4, transform="NONE", max_iterations=2000)

    print("Compare singular values (D)")
    h2o_d = fitH2O._model_json['output']['d']
    r_d = [1419.06139509772, 194.825846110138, 45.6613376308754, 18.0695566224677]
    print("R Singular Values: {0}".format(r_d))
    print("H2O Singular Values: {0}".format(h2o_d))
    for r, h in zip(r_d, h2o_d):
        assert abs(r - h) < 1e-6, "H2O got {0}, but R got {1}".format(h, r)

    print("Compare right singular vectors (V)")
    h2o_v = h2o.as_list(h2o.get_frame(fitH2O._model_json['output']['v_key']['name']), use_pandas=False)
    # list(...) is required on Python 3, where zip() returns an iterator without pop().
    h2o_v = list(zip(*h2o_v))
    h2o_v.pop(0)
    r_v = [[-0.04239181, 0.01616262, -0.06588426, 0.99679535],
           [-0.94395706, 0.32068580, 0.06655170, -0.04094568],
           [-0.30842767, -0.93845891, 0.15496743, 0.01234261],
           [-0.10963744, -0.12725666, -0.98347101, -0.06760284]]
    print("R Right Singular Vectors: {0}".format(r_v))
    print("H2O Right Singular Vectors: {0}".format(h2o_v))
    check_vectors(r_v, h2o_v)

    print("Compare left singular vectors (U)")
    h2o_u = h2o.as_list(h2o.get_frame(fitH2O._model_json['output']['u_key']['name']), use_pandas=False)
    h2o_u = list(zip(*h2o_u))
    h2o_u.pop(0)
    r_u = [[-0.1716251, 0.096325710, 0.06515480, 0.15369551],
           [-0.1891166, 0.173452566, -0.42665785, -0.17801438],
           [-0.2155930, 0.078998111, 0.02063740, -0.28070784],
           [-0.1390244, 0.059889811, 0.01392269, 0.01610418],
           [-0.2067788, -0.009812026, -0.17633244, -0.21867425],
           [-0.1558794, -0.064555293, -0.28288280, -0.11797419]]
    print("R Left Singular Vectors: {0}".format(r_u))
    print("H2O Left Singular Vectors: {0}".format(h2o_u))
    check_vectors(r_u, h2o_u)
def auc(m, v, t):
    """Print and return ``roc_auc_score`` of model ``m`` on frame ``v``.

    :param m: model exposing ``predict(frame)``
    :param v: H2O frame holding the features and the target column
    :param t: key of the target column in ``v``
    :return: the value returned by ``roc_auc_score``
    """
    y_true = v[t]
    y_scores = m.predict(v)
    y_true = h2o.as_list(y_true, use_pandas=True).values
    y_scores = h2o.as_list(y_scores, use_pandas=True).values
    d = roc_auc_score(y_true, y_scores)
    print('AUC:', d)
    # The former `del m` after the return was unreachable and has been dropped.
    return d
def distance_check_without_empty_strings():
    """Check Jaro-Winkler ``strdistance`` with ``compare_empty=False``.

    The third pair contains an empty string; the test expects the first two
    distances to match reference values and the third result to be NA.
    """
    left = h2o.H2OFrame.from_python(['Martha', 'Dwayne', 'Dixon'], column_types=['factor'])
    right = h2o.H2OFrame.from_python(['Marhta', 'Duane', ''], column_types=['string'])
    dist = left.strdistance(right, measure="jw", compare_empty=False)

    # compare without the last value, which belongs to the empty string
    dist_rows = h2o.as_list(dist, use_pandas=False, header=False)
    leading = [float(row[0]) for row in dist_rows[0:2]]
    tst.assert_allclose(leading, [0.961111, 0.84], atol=0.001)

    # compare that the last value is NA
    na_flags = h2o.as_list(dist.isna(), use_pandas=False, header=False)
    assert na_flags == [['0'], ['0'], ['1']]
def deep_1(K, dfs, dfs_collector, test, test_collector):
    """Train a deep-learning model per fold and collect out-of-fold predictions.

    For each fold ``i`` the model is trained on every element of ``dfs``
    except ``dfs[i]``, its predictions on ``dfs[i]`` are stored in
    ``dfs_collector[i]['deep_1']``, and its predictions on ``test`` are
    accumulated. The fold-averaged test prediction is stored in
    ``test_collector['deep_1']``.

    :param K: number of folds (``dfs`` and ``dfs_collector`` are indexed 0..K-1)
    :param dfs: per-fold pandas data frames containing a ``target`` column
    :param dfs_collector: per-fold frames receiving the fold predictions (modified in place)
    :param test: data scored by every fold model
    :param test_collector: frame receiving the averaged prediction (modified in place)
    :return: ``(dfs_collector, test_collector, 'deep_1')``
    """
    r = 'deep_1'
    # Build the predictor list once, as a copy. The previous code aliased the
    # module-level ``on_top2`` list and removed 'target' from it inside the
    # loop, which mutated the global and raised ValueError on the second fold.
    features = [name for name in on_top2 if name != 'target']
    val_hf = h2o.H2OFrame(test)
    v = np.zeros(shape=[len(test)])

    for i in range(K):
        print()
        print('in model:', r, ' k-fold:', i + 1, '/', K)
        print()

        # Training data: all folds except the held-out fold i.
        dt = pd.concat([dfs[j] for j in range(K) if j != i])
        train_hf = h2o.H2OFrame(dt)
        del dt
        dfs_i = h2o.H2OFrame(dfs[i])

        print('- ' * 10)
        for c in features:
            print("'{}',".format(c))
        print('- ' * 10)

        model = H2ODeepLearningEstimator(hidden=[200, 200], epochs=500)
        model.train(x=features, y='target', training_frame=train_hf)
        del train_hf

        p = model.predict(dfs_i)
        dfs_collector[i][r] = h2o.as_list(p, use_pandas=True).values
        print(dfs_collector[i].head())
        print(dfs_collector[i].head().dtypes)

        q = model.predict(val_hf)
        dd = h2o.as_list(q, use_pandas=True)
        a = dd['predict']
        a = np.array(a, dtype=pd.Series).tolist()
        v += a

        print('# ' * 10)
        # Slice instead of indexing 0..4 so fewer than five test rows do not raise.
        for shown in v[:5]:
            print(shown)
        print('# ' * 10)

    test_collector[r] = v / K
    print(test_collector.head())
    return dfs_collector, test_collector, r
def glrm_nnmf():
    """Fit a GLRM with non-negative regularization on X and Y and validate it.

    Training data is the product of two random uniform matrices. The test
    checks that both fitted factors are non-negative, that the reported
    objective equals the sum of squared errors of X*Y, that prediction
    reproduces X*Y, and that the training metrics agree with the objective.

    Written so it runs on both Python 2 and Python 3: single-argument
    ``print(...)`` calls, and the ``zip`` result is materialized as a list
    before being handed to H2O.
    """
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)
    X = np.random.rand(m, k)
    train = np.dot(X, Y)
    # list(...) keeps the argument a real list on Python 3, where zip() is lazy.
    train_h2o = h2o.H2OFrame.fromPython(list(zip(*train.tolist())))

    print("Run GLRM with non-negative regularization")
    initial_y = np.random.rand(n, k)
    initial_y_h2o = h2o.H2OFrame.fromPython(initial_y.tolist())
    glrm_h2o = H2OGeneralizedLowRankEstimator(
        k=k,
        init="User",
        user_y=initial_y_h2o,
        loss="Quadratic",
        regularization_x="NonNegative",
        regularization_y="NonNegative",
        gamma_x=1,
        gamma_y=1,
    )
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X and Y matrices are non-negative")
    fit_y = glrm_h2o._model_json["output"]["archetypes"].cell_values
    # Skip the first cell of every archetype row and convert the rest to float.
    fit_y_np = np.array([[float(s) for s in list(row)[1:]] for row in fit_y])
    fit_x = h2o.get_frame(glrm_h2o._model_json["output"]["representation_name"])
    fit_x_np = np.array(h2o.as_list(fit_x))
    assert np.all(fit_y_np >= 0), "Y must contain only non-negative elements"
    assert np.all(fit_x_np >= 0), "X must contain only non-negative elements"

    print("Check final objective function value")
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json["output"]["objective"]
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json["output"]["training_metrics"]._metric_json["numerr"]
    glrm_caterr = glrm_h2o._model_json["output"]["training_metrics"]._metric_json["caterr"]
    assert abs(glrm_numerr - glrm_obj) < 1e-3, (
        "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    )
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def glrm_unitonesparse():
    """Fit a GLRM with unit one-sparse regularization on X and validate it.

    The training matrix is built from rows that each select exactly one of
    ``k`` random archetypes. The test checks that every fitted X row has a
    single 1 and zeros elsewhere, that the reported objective equals the sum
    of squared errors of X*Y, that prediction reproduces X*Y, and that the
    training metrics agree with the objective.
    """
    m, n, k = 1000, 100, 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)

    def indicator_row(width):
        row = [0] * width
        row[np.random.randint(0, width)] = 1
        return row

    X = np.array([indicator_row(k) for _ in range(m)])
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(list(zip(*train.tolist())))

    print("Run GLRM with unit one-sparse regularization on X")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(list(zip(*initial_y.tolist())))
    glrm_h2o = H2OGeneralizedLowRankEstimator(
        k=k,
        init="User",
        user_y=initial_y_h2o,
        loss="Quadratic",
        regularization_x="UnitOneSparse",
        regularization_y="None",
        gamma_x=1,
        gamma_y=0,
    )
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X matrix consists of rows of basis vectors")
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    def is_basis(a):
        zeros = np.where(a == 0)[0].size
        ones = np.where(a == 1)[0].size
        basis = ones == 1 and (zeros + ones) == k
        assert basis, "Got " + str(ones) + " ones and " + str(zeros) + " zeros, but expected all zeros except a single 1"
        return basis

    np.apply_along_axis(is_basis, 1, fit_x_np)

    print("Check final objective function value")
    archetype_rows = glrm_h2o._model_json['output']['archetypes'].cell_values
    # Skip the first cell of every archetype row and convert the rest to float.
    fit_y_np = np.array([[float(cell) for cell in list(row)[1:]] for row in archetype_rows])
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_np = np.array(h2o.as_list(glrm_h2o.predict(train_h2o)))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, \
        "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def vec_as_list():
    """Check ``h2o.as_list`` on a single column, raw and as an expression.

    Values are read as ``res[0][i]`` from the list returned with
    ``use_pandas=False``.
    """
    def close_enough(cell, expected):
        return abs(float(cell) - expected) < 1e-10

    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    raw = h2o.as_list(iris[0], use_pandas=False)
    assert close_enough(raw[0][4], 4.6) and close_enough(raw[0][6], 5.4) and close_enough(raw[0][10], 4.9), \
        "incorrect values"

    expr = 2 - iris
    shifted = h2o.as_list(expr[0], use_pandas=False)
    assert close_enough(shifted[0][4], -2.6) and close_enough(shifted[0][18], -3.1) and \
        close_enough(shifted[0][25], -2.8), "incorrect values"
def glrm_simplex():
    """Fit a GLRM with simplex regularization on X and validate the result.

    The training matrix is built from rows that each select exactly one of
    ``k`` random archetypes. The test checks that every fitted X row sums to
    1 (within 1e-6), that the reported objective equals the sum of squared
    errors of X*Y, that prediction reproduces X*Y, and that the training
    metrics agree with the objective.
    """
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)

    def ind_list(k):
        tmp = [0] * k
        tmp[np.random.randint(0, k)] = 1
        return tmp

    X = [ind_list(k) for x in range(m)]
    X = np.array(X)
    train = np.dot(X, Y)
    train_h2o = h2o.H2OFrame(train.tolist())

    print("Run GLRM with quadratic mixtures (simplex) regularization on X")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(initial_y.tolist())
    glrm_h2o = H2OGeneralizedLowRankEstimator(k=k, init="User", user_y=initial_y_h2o, loss="Quadratic",
                                              regularization_x="Simplex", regularization_y="None",
                                              gamma_x=1, gamma_y=0)
    glrm_h2o.train(x=train_h2o.names, training_frame=train_h2o)
    glrm_h2o.show()

    print("Check that X matrix consists of rows within standard probability simplex")
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    def is_simplex(a):
        row_sum = sum(a)
        simplex = abs(row_sum - 1) < 1e-6
        # str() is required: concatenating the numeric sum directly raised
        # TypeError and hid the real assertion failure.
        assert simplex, "Got sum over row = " + str(row_sum) + ", but expected 1"
        return simplex

    np.apply_along_axis(is_simplex, 1, fit_x_np)

    print("Check final objective function value")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    # Skip the first cell of every archetype row and convert the rest to float.
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def glrm_unitonesparse():
    """Fit a GLRM with unit one-sparse regularization on X and validate it.

    The training matrix is built from rows that each select exactly one of
    ``k`` random archetypes. The test checks that every fitted X row has a
    single 1 and zeros elsewhere, that the reported objective equals the sum
    of squared errors of X*Y, that prediction reproduces X*Y, and that the
    training metrics agree with the objective.

    Written so it runs on both Python 2 and Python 3: single-argument
    ``print(...)`` calls, ``range`` instead of ``xrange``, and ``zip``
    results materialized as lists before being handed to H2O.
    """
    m = 1000
    n = 100
    k = 10

    print("Uploading random uniform matrix with rows = " + str(m) + " and cols = " + str(n))
    Y = np.random.rand(k, n)

    def ind_list(k):
        tmp = [0] * k
        tmp[np.random.randint(0, k)] = 1
        return tmp

    X = [ind_list(k) for x in range(m)]
    X = np.array(X)
    train = np.dot(X, Y)
    # list(...) keeps the argument a real list on Python 3, where zip() is lazy.
    train_h2o = h2o.H2OFrame(list(zip(*train.tolist())))

    print("Run GLRM with unit one-sparse regularization on X")
    initial_y = np.random.rand(k, n)
    initial_y_h2o = h2o.H2OFrame(list(zip(*initial_y.tolist())))
    glrm_h2o = h2o.glrm(x=train_h2o, k=k, init="User", user_y=initial_y_h2o, loss="Quadratic",
                        regularization_x="UnitOneSparse", regularization_y="None", gamma_x=1, gamma_y=0)
    glrm_h2o.show()

    print("Check that X matrix consists of rows of basis vectors")
    fit_x = h2o.get_frame(glrm_h2o._model_json['output']['representation_name'])
    fit_x_np = np.array(h2o.as_list(fit_x))

    def is_basis(a):
        zeros = np.where(a == 0)[0].size
        ones = np.where(a == 1)[0].size
        basis = ones == 1 and (zeros + ones) == k
        assert basis, "Got " + str(ones) + " ones and " + str(zeros) + " zeros, but expected all zeros except a single 1"
        return basis

    np.apply_along_axis(is_basis, 1, fit_x_np)

    print("Check final objective function value")
    fit_y = glrm_h2o._model_json['output']['archetypes'].cell_values
    # Skip the first cell of every archetype row and convert the rest to float.
    fit_y_np = [[float(s) for s in list(row)[1:]] for row in fit_y]
    fit_y_np = np.array(fit_y_np)
    fit_xy = np.dot(fit_x_np, fit_y_np)
    glrm_obj = glrm_h2o._model_json['output']['objective']
    sse = np.sum(np.square(train.__sub__(fit_xy)))
    assert abs(glrm_obj - sse) < 1e-6, "Final objective was " + str(glrm_obj) + " but should equal " + str(sse)

    print("Impute XY and check error metrics")
    pred_h2o = glrm_h2o.predict(train_h2o)
    pred_np = np.array(h2o.as_list(pred_h2o))
    assert np.allclose(pred_np, fit_xy), "Imputation for numerics with quadratic loss should equal XY product"
    glrm_numerr = glrm_h2o._model_json['output']['training_metrics']._metric_json['numerr']
    glrm_caterr = glrm_h2o._model_json['output']['training_metrics']._metric_json['caterr']
    assert abs(glrm_numerr - glrm_obj) < 1e-3, "Numeric error was " + str(glrm_numerr) + " but should equal final objective " + str(glrm_obj)
    assert glrm_caterr == 0, "Categorical error was " + str(glrm_caterr) + " but should be zero"
def vec_as_list():
    """Verify list conversion of one iris column, raw and after ``2 - iris``.

    The converted data is addressed as ``res[0][i]``; each checked value must
    be within 1e-10 of the reference.
    """
    tolerance = 1e-10
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    column = h2o.as_list(iris[0], use_pandas=False)
    for position, want in ((4, 4.6), (6, 5.4), (10, 4.9)):
        assert abs(float(column[0][position]) - want) < tolerance, "incorrect values"

    shifted = 2 - iris
    column = h2o.as_list(shifted[0], use_pandas=False)
    for position, want in ((4, -2.6), (18, -3.1), (25, -2.8)):
        assert abs(float(column[0][position]) - want) < tolerance, "incorrect values"
def compare_frames(expected, actual):
    """Assert that two frames agree in shape, columns, column types and values.

    :param expected: reference frame
    :param actual: frame under test
    :raises AssertionError: on the first difference found. Values are
        compared column by column through the string form of
        ``h2o.as_list(column)``.
    """
    assert actual.shape == expected.shape
    # The message previously referenced an undefined name, so a column
    # mismatch surfaced as NameError instead of this AssertionError.
    assert actual.columns == expected.columns, \
        "Columns differ: %r vs %r" % (actual.columns, expected.columns)
    for i, colname in enumerate(actual.columns):
        t1 = expected.types[colname]
        t2 = actual.types[colname]
        assert t1 == t2, ("Bad types %s: expected %s, got %s" % (colname, t1, t2))
        s1 = str(h2o.as_list(expected[colname]))
        s2 = str(h2o.as_list(actual[colname]))
        assert s1 == s2, ("bad values: expected[%d] = %r, actual[%d] = %r" % (i, s1, i, s2))
def svd_1_golden():
    """Compare H2O's SVD of USArrests with reference values computed in R.

    Singular values must match within 1e-6. Singular vectors are compared
    on absolute values within 1e-5, so a sign flip of a component is not
    treated as a difference.
    """
    def assert_vectors_match(r_rows, h2o_rows):
        # zip() stops at the shorter input, so only the rows/entries listed
        # in the R reference are compared.
        for r_row, h2o_row in zip(r_rows, h2o_rows):
            for r, h in zip(r_row, h2o_row):
                assert abs(abs(r) - abs(float(h))) < 1e-5, "H2O got {0}, but R got {1}".format(h, r)

    def fetch_rows(output_key):
        # Load the frame named in the model output and drop its first row.
        frame = h2o.get_frame(fitH2O._model_json["output"][output_key]["name"])
        rows = h2o.as_list(frame, use_pandas=False)
        rows.pop(0)
        return rows

    print("Importing USArrests.csv data...")
    arrestsH2O = h2o.upload_file(pyunit_utils.locate("smalldata/pca_test/USArrests.csv"))

    print("Compare with SVD")
    from h2o.transforms.decomposition import H2OSVD
    fitH2O = H2OSVD(nv=4, transform="NONE", max_iterations=2000)
    fitH2O.train(x=list(range(4)), training_frame=arrestsH2O)

    print("Compare singular values (D)")
    h2o_d = fitH2O._model_json["output"]["d"]
    r_d = [1419.06139509772, 194.825846110138, 45.6613376308754, 18.0695566224677]
    print("R Singular Values: {0}".format(r_d))
    print("H2O Singular Values: {0}".format(h2o_d))
    for r, h in zip(r_d, h2o_d):
        assert abs(r - h) < 1e-6, "H2O got {0}, but R got {1}".format(h, r)

    print("Compare right singular vectors (V)")
    h2o_v = fetch_rows("v_key")
    r_v = [
        [-0.04239181, 0.01616262, -0.06588426, 0.99679535],
        [-0.94395706, 0.32068580, 0.06655170, -0.04094568],
        [-0.30842767, -0.93845891, 0.15496743, 0.01234261],
        [-0.10963744, -0.12725666, -0.98347101, -0.06760284],
    ]
    print("R Right Singular Vectors: {0}".format(r_v))
    print("H2O Right Singular Vectors: {0}".format(h2o_v))
    assert_vectors_match(r_v, h2o_v)

    print("Compare left singular vectors (U)")
    h2o_u = fetch_rows("u_key")
    r_u = [
        [-0.1716251, 0.096325710, 0.06515480, 0.15369551],
        [-0.1891166, 0.173452566, -0.42665785, -0.17801438],
        [-0.2155930, 0.078998111, 0.02063740, -0.28070784],
        [-0.1390244, 0.059889811, 0.01392269, 0.01610418],
        [-0.2067788, -0.009812026, -0.17633244, -0.21867425],
        [-0.1558794, -0.064555293, -0.28288280, -0.11797419],
    ]
    print("R Left Singular Vectors: {0}".format(r_u))
    print("H2O Left Singular Vectors: {0}".format(h2o_u))
    assert_vectors_match(r_u, h2o_u)
def distance_check():
    """Check Jaro-Winkler ``strdistance`` between a factor and a string column."""
    left = h2o.H2OFrame.from_python(['Martha', 'Dwayne', 'Dixon'], column_types=['factor'])
    right = h2o.H2OFrame.from_python(['Marhta', 'Duane', 'Dicksonx'], column_types=['string'])

    distances = left.strdistance(right, measure="jw")
    rows = h2o.as_list(distances, use_pandas=False, header=False)
    # Each row holds the distance in its first cell.
    observed = [float(row[0]) for row in rows]

    tst.assert_allclose(observed, [0.961111, 0.84, 0.813333], atol=0.001)
def save_histogram(dataset, feature, max_value=100):
    """Plot the distribution of ``dataset[feature]`` and save it as a PNG.

    The x axis is limited to ``0..max_value`` and the figure is written to
    ``<feature>_hist.png`` relative to the current working directory.
    """
    sns.set()
    values = h2o.as_list(dataset[feature]).values
    axes = sns.distplot(values)
    axes.set(xlim=(0, max_value))
    output_name = feature + "_hist.png"
    axes.get_figure().savefig(output_name)
def vec_as_list(ip, port):
    """Check ``h2o.as_list`` on single columns, raw and after ``2 - iris``.

    ``ip`` and ``port`` are accepted but not used by this function.
    """
    def check_column(column, expected):
        # Lazy generator keeps the short-circuit behaviour of an `and` chain.
        assert all(abs(float(column[row][0]) - want) < 1e-10 for row, want in expected), \
            "incorrect values"

    iris = h2o.import_file(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    check_column(h2o.as_list(iris[0], use_pandas=False), ((4, 4.6), (6, 5.4), (10, 4.9)))

    res = 2 - iris
    check_column(h2o.as_list(res[0], use_pandas=False), ((4, -2.6), (18, -3.1), (25, -2.8)))
    check_column(h2o.as_list(res[1], use_pandas=False), ((4, -1.1), (6, -1.9), (10, -1.1)))
def h2o_grid():
    """Grid-search a GBM on the diamonds PCA data and export test predictions.

    Splits ``output/diamonds_PCA.csv`` into train/validation/hold-out parts,
    trains one GBM per hyper-parameter combination, prints the RMSE of the
    best model on the hold-out part, then scores
    ``output/diamonds_test_PCA.csv`` and writes the predictions to
    ``output/pred_h2o.csv``.
    """
    h2o.init()
    data = h2o.import_file('output/diamonds_PCA.csv')
    splits = data.split_frame(ratios=[0.7, 0.15], seed=1)
    train = splits[0]
    valid = splits[1]
    holdout = splits[2]

    y = 'price'
    x = list(data.columns)
    x.remove(y)

    hyper_parameters = {
        'learn_rate': [0.01, 0.1],
        'max_depth': [3, 5, 9],
        'sample_rate': [0.8, 1.0],
        'col_sample_rate': [0.2, 0.5, 1.0],
    }
    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_parameters)
    gs.train(x=x, y=y, training_frame=train, validation_frame=valid)

    # RMSE is an error metric: sort ascending so that models[0] is the model
    # with the lowest RMSE. Sorting in decreasing order picked the worst one.
    sorted_grid = gs.get_grid(sort_by='rmse', decreasing=False)
    best_m = sorted_grid.models[0]
    best_mp = best_m.model_performance(holdout)
    print(best_mp.rmse())

    test = h2o.import_file('output/diamonds_test_PCA.csv')
    predict = best_m.predict(test)
    predict = h2o.as_list(predict)
    predict.to_csv('output/pred_h2o.csv')
def sdev(ip,port):
    """Compare H2O's ``sd()`` with NumPy's sample standard deviation on iris.

    Also checks that ``sd()`` fails on column 4 and on a two-column slice.

    :param ip: forwarded to ``h2o.init``
    :param port: forwarded to ``h2o.init``
    """
    # Connect to h2o
    h2o.init(ip,port)

    dataset = "smalldata/iris/iris_wheader.csv"
    iris_h2o = h2o.import_frame(path=h2o.locate(dataset))
    iris_np = np.genfromtxt(h2o.locate(dataset),
                            delimiter=',',
                            skip_header=1,
                            usecols=(0, 1, 2, 3))

    expected_sd = np.std(iris_np, axis=0, ddof=1)
    for col_idx in range(4):
        observed_sd = h2o.as_list(iris_h2o[col_idx].sd())[0][0]
        assert abs(expected_sd[col_idx] - observed_sd) < 1e-10, "expected standard deviations to be the same"

    try:
        iris_h2o[4].sd().eager()
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error. column is categorical."

    try:
        iris_h2o[0:2].sd().eager()
    except AttributeError:
        pass
    else:
        assert False, "expected an error. more than one column."
def _from_frame(frame):
    """Create a float numpy array from an H2OFrame object.

    The first row returned by ``h2o.as_list`` and the first element of every
    remaining row are discarded before conversion.

    :param frame: object accepted by ``h2o.as_list``
    :return: ``numpy.ndarray`` of floats
    """
    rows = h2o.as_list(frame, use_pandas=False)
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy; the builtin yields the same dtype everywhere.
    return np.asarray([row[1:] for row in rows[1:]], dtype=float)
def convert_h2o_list(lst):
    """
    Convert an h2o object to local Python data via ``h2o.as_list``.

    :param lst: object accepted by ``h2o.as_list``
    :return: whatever ``h2o.as_list`` returns with its default arguments
        (NOTE(review): presumably a pandas DataFrame when pandas is
        installed rather than a plain list; confirm against the h2o version)
    """
    converted = h2o.as_list(lst)
    return converted
def evalmodel(df):
    """Score ``df`` with the model stored at ``./model``.

    :param df: frame passed to the loaded model's ``predict``
    :return: list of floats, one per prediction row (first column only)
    """
    glm_classifier = h2o.load_model('./model')
    rows = h2o.as_list(glm_classifier.predict(df), use_pandas=False)
    del rows[0]  # get rid of the column header
    # Every remaining row is a list; keep its first element as a float.
    return [float(row[0]) for row in rows]
def numeric_quantile_bin(data_df, cal_numeric_cols, nbin=20):
    """
    Cut numerical variables into buckets by quantiles.

    :param data_df: a data frame; the listed columns are overwritten in place
    :param cal_numeric_cols: numerical columns to be cut
    :param nbin: bucket number
    :return: the data frame after cutting, and a dict mapping each column to
        ``[bucket labels, break points]``
    """
    probs = [float(step) / nbin for step in range(nbin + 1)]
    cut_info = {}
    for col in cal_numeric_cols:
        quantiles = data_df[col].quantile(prob=probs,
                                          combine_method=u'interpolate')[:, 1]
        rows = h2o.as_list(quantiles, use_pandas=False, header=False)
        breaks = [float(row[0]) for row in rows]
        # Labels are numbered from 1: "<col>_1" ... "<col>_<nbin>".
        labels = [col + '_' + str(bucket) for bucket in range(1, nbin + 1)]
        data_df[col] = data_df[col].cut(breaks,
                                        labels=labels,
                                        include_lowest=True,
                                        right=True,
                                        dig_lab=3)
        cut_info[col] = [labels, breaks]
    return data_df, cut_info
def sdev(ip, port):
    """Check that H2O and NumPy agree on the iris column standard deviations.

    The four numeric columns must match ``np.std(..., ddof=1)`` to 1e-10;
    ``sd()`` on column 4 and on a two-column slice must raise.

    :param ip: forwarded to ``h2o.init``
    :param port: forwarded to ``h2o.init``
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris_h2o = h2o.import_frame(
        path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris_np = np.genfromtxt(h2o.locate("smalldata/iris/iris_wheader.csv"),
                            delimiter=',',
                            skip_header=1,
                            usecols=(0, 1, 2, 3))
    sd_np = np.std(iris_np, axis=0, ddof=1)

    for idx in range(4):
        sd_h2o = h2o.as_list(iris_h2o[idx].sd())[0][0]
        difference = abs(sd_np[idx] - sd_h2o)
        assert difference < 1e-10, "expected standard deviations to be the same"

    try:
        iris_h2o[4].sd().eager()
    except EnvironmentError:
        pass
    else:
        assert False, "expected an error. column is categorical."

    try:
        iris_h2o[0:2].sd().eager()
    except AttributeError:
        pass
    else:
        assert False, "expected an error. more than one column."
def save_histogram(dataset,feature,max_value=100):
    """Draw a distribution plot for ``dataset[feature]`` and save it to disk.

    :param dataset: frame that is indexed by ``feature`` and passed to ``h2o.as_list``
    :param feature: column name; the plot is written to ``<feature>_hist.png``
    :param max_value: right edge of the x axis (the left edge is 0)
    """
    sns.set()
    column_values = h2o.as_list(dataset[feature]).values
    plot_axes = sns.distplot(column_values)
    plot_axes.set(xlim=(0, max_value))
    output_name = feature + "_hist.png"
    plot_axes.get_figure().savefig(output_name)
def run(raw_data):
    """Score a JSON payload with the module-level ``model``.

    :param raw_data: JSON in pandas "table" orientation
    :return: predictions serialised as JSON in "table" orientation
    """
    frame = h2o.H2OFrame(pd.read_json(raw_data, orient='table'))
    # make prediction
    predictions = h2o.as_list(model.predict(frame))
    # you can return any data type as long as it is JSON-serializable
    return predictions.to_json(orient="table")
def check_leaderboard(aml, excluded_algos, expected_metrics, expected_sort_metric, expected_sorted_desc=False):
    """Assert that an AutoML leaderboard has the expected columns, models and ordering.

    :param aml: object exposing ``leaderboard`` and ``_state_json``
    :param excluded_algos: algorithm names that must not appear in any model id
    :param expected_metrics: metric column names expected after ``model_id``
    :param expected_sort_metric: expected sort metric, or None to skip that check
    :param expected_sorted_desc: expected sort direction, or None to skip that check
    """
    print("AutoML leaderboard")
    leaderboard = aml.leaderboard
    print(leaderboard)

    # check that correct leaderboard columns exist
    expected_columns = ['model_id'] + expected_metrics
    assert leaderboard.names == expected_columns, \
        "expected leaderboard columns to be {expected} but got {actual}".format(expected=expected_columns, actual=leaderboard.names)

    model_ids = list(h2o.as_list(leaderboard['model_id'])['model_id'])

    def _on_leaderboard(algo):
        # An algorithm counts as present when its name occurs in any model id.
        return any(algo in model_id for model_id in model_ids)

    assert not any(_on_leaderboard(algo) for algo in excluded_algos), \
        "leaderboard contains some excluded algos among {excluded}: {models}".format(excluded=excluded_algos, models=model_ids)

    included_algos = list(set(all_algos) - set(excluded_algos))
    if 'DRF' not in excluded_algos:
        included_algos.append('XRT')
    assert all(_on_leaderboard(algo) for algo in included_algos), \
        "leaderboard is missing some algos from {included}: {models}".format(included=included_algos, models=model_ids)

    j_leaderboard = aml._state_json['leaderboard']
    if expected_sort_metric is not None:
        sort_metric = j_leaderboard['sort_metric']
        assert sort_metric == expected_sort_metric, \
            "expected leaderboard sorted by {expected} but was sorted by {actual}".format(expected=expected_sort_metric, actual=sort_metric)
    if expected_sorted_desc is not None:
        sorted_desc = j_leaderboard['sort_decreasing']
        assert sorted_desc == expected_sorted_desc, \
            "expected leaderboard sorted {expected} but was sorted {actual}".format(expected="desc" if expected_sorted_desc else "asc",
                                                                                    actual="desc" if sorted_desc else "asc")
def check_values(h2o_data, numpy_data):
    """Spot-check ten random cells of ``h2o_data`` against ``numpy_data``.

    Cell coordinates are drawn from the module-level ``row`` and ``col``.

    :return: True when every sampled cell agrees within 1e-06, else False
    """
    success = True
    for _ in range(10):
        r = random.randint(0, row - 1)
        c = random.randint(0, col - 1)
        h2o_val = h2o.as_list(h2o_data[r, c])[0][0]
        # Written as "not <" so that a NaN difference also counts as a failure.
        if not abs(h2o_val - numpy_data[r, c]) < 1e-06:
            success = False
    return success
def hist(self, breaks="Sturges", plot=True, **kwargs):
    """
    Compute a histogram over a numeric column. If breaks=="FD", the MAD is used over the IQR
    in computing bin width.

    :param breaks: Can be one of the following: a string ("Sturges", "Rice", "sqrt", "Doane",
      "FD", "Scott"); a single number giving the number of equal-width bins to split the range
      of the vec into; or a vector of numbers giving the split points,
      e.g., c(-50,213.2123,9324834)
    :param plot: A logical value indicating whether or not a plot should be generated
      (default is True).
    :param kwargs: ``server=True`` selects the 'Agg' matplotlib backend and suppresses
      ``plt.show()``.
    :return: if plot is True, then return None, else, an self._newExpr with these columns:
      breaks, counts, mids_true, mids, and density
    """
    frame = self._newExpr("hist", self, breaks)

    total = frame["counts"].sum(True)
    densities = [(frame[i,"counts"]/total)*(1/(frame[i,"breaks"]-frame[i-1,"breaks"]))
                 for i in range(1,frame["counts"].nrow)]
    densities.insert(0,0)
    densities_frame = H2OFrame.fromPython(densities)
    densities_frame.set_names(["density"])
    frame = frame.cbind(densities_frame)

    if not plot:
        return frame

    headless = bool(kwargs.get('server'))
    try:
        # A plain import raises ImportError when matplotlib is missing, so no
        # separate probe through the ``imp`` module is needed.
        import matplotlib
        if headless:
            # NOTE(review): newer matplotlib releases no longer accept the
            # ``warn`` keyword; confirm against the supported version.
            matplotlib.use('Agg', warn=False)
        import matplotlib.pyplot as plt
    except ImportError:
        print("matplotlib is required to make the histogram plot. Set `plot` to False, if a plot is not desired.")
        return

    lower = float(frame[0,"breaks"])

    def _transposed_tail(column):
        # Transpose the as_list output and drop its first two entries
        # (presumably the column header and the first row, which is replaced
        # by an explicit value below; confirm against as_list's layout).
        # list() keeps this working where zip() returns an iterator.
        return list(zip(*h2o.as_list(column, use_pandas=False)))[2:]

    counts = [float(c[0]) for c in _transposed_tail(frame["counts"])]
    counts.insert(0,0)
    mids = [float(m[0]) for m in _transposed_tail(frame["mids"])]
    mids.insert(0,lower)

    plt.xlabel(self.names[0])
    plt.ylabel('Frequency')
    plt.title('Histogram of {0}'.format(self.names[0]))
    plt.bar(mids, counts)
    if not headless:
        plt.show()
def gbm_1_R(K, dfs, dfs_collector, test, test_collector):
    """Train one GBM per fold and collect out-of-fold and test predictions.

    For each fold ``i`` a model is trained on all other folds. Its
    predictions for fold ``i`` are stored in ``dfs_collector[i]['gbm_1']``,
    and its 'predict' column for ``test`` is accumulated and finally
    averaged into ``test_collector['gbm_1']``.

    :param K: number of folds
    :param dfs: sequence of K pandas data frames containing a 'target' column
    :param dfs_collector: sequence of K data frames receiving fold predictions
    :param test: data frame scored by every fold model
    :param test_collector: data frame receiving the averaged test prediction
    :return: ``(dfs_collector, test_collector, 'gbm_1')``
    """
    r = 'gbm_1'
    val_hf = h2o.H2OFrame(test)
    ntrees = 100
    seed = 1155
    v = np.zeros(shape=[len(test)])

    for i in range(K):
        print()
        print('in model:', r, ' k-fold:', i + 1, '/', K)
        print()

        # Train on every fold except the held-out one. The comprehension uses
        # its own index name so that it cannot clobber the loop variable.
        train_hf = h2o.H2OFrame(
            pd.concat([dfs[j] for j in range(K) if j != i]))
        dfs_i = h2o.H2OFrame(dfs[i])

        features = list(train_hf.columns)
        features.remove('target')

        model = H2OGradientBoostingEstimator(model_id='gbm_manual',
                                             seed=seed,
                                             ntrees=ntrees,
                                             sample_rate=0.9,
                                             col_sample_rate=0.9)
        model.train(x=features, y='target', training_frame=train_hf)
        del train_hf  # drop the reference to the training frame early

        p = model.predict(dfs_i)
        dfs_collector[i][r] = h2o.as_list(p, use_pandas=True).values
        print(dfs_collector[i].head())
        print(dfs_collector[i].head().dtypes)

        q = model.predict(val_hf)
        dd = h2o.as_list(q, use_pandas=True)
        # Accumulate as a float array (the previous dtype=pd.Series silently
        # produced an object array).
        v += np.asarray(dd['predict'], dtype=float)

    test_collector[r] = v / K
    print(test_collector.head())
    return dfs_collector, test_collector, r
def get_partitioned_model_names(leaderboard):
    """Split leaderboard model ids into stacked ensembles and base models.

    :param leaderboard: frame with a 'model_id' column
    :return: Namespace with ``all`` (every id), ``se`` (ids starting with
        'StackedEnsemble') and ``base`` (the remaining ids)
    """
    model_names = Namespace()
    all_ids = list(h2o.as_list(leaderboard['model_id'])['model_id'])
    ensembles = [name for name in all_ids
                 if name.startswith('StackedEnsemble')]
    model_names.all = all_ids
    model_names.se = ensembles
    model_names.base = [name for name in all_ids if name not in ensembles]
    return model_names
def ntrain():
    """Train GBM, deep-learning and random-forest models and write a blended submission.

    Training and test features are assembled with the module's helper
    functions. The three models' predictions are blended with weights
    0.35 / 0.30 / 0.35 and the 'True' column of the blend is written to
    ``wnvh.csv`` next to a 1-based ``Id`` column.
    """
    h2o.init(ip="zurich.h2o.ai",strict_version_check=False)

    weather = load_weather()
    training = load_training()
    X = assemble_X(training, weather)
    mean, std = normalize(X)
    y = assemble_y(training)

    xd = [row.tolist() for row in X]
    y = np.asarray(y,dtype='bool_')
    xtr = H2OFrame(python_obj=xd)
    ytr = H2OFrame(python_obj=y.tolist())
    ytr["C1"]._name = "C40"  # Rename the default column

    gb = h2o.gbm(x=xtr[1:39], y=ytr['C40'],
                 distribution="bernoulli",
                 ntrees=1000,  # 500 works well
                 max_depth=12,
                 learn_rate=0.01)
    dl = h2o.deeplearning(x=xtr[1:39], y=ytr['C40'],
                          variable_importances=True, balance_classes=True,
                          input_dropout_ratio=0.2, rho=0.899,
                          hidden_dropout_ratios=[0.4,0.4,0.4,0.4],
                          activation="Tanh", hidden=[39,325,325,1], epochs=100)
    rf = h2o.random_forest(x=xtr[1:39], y=ytr['C40'],
                           seed=1234, ntrees=600,
                           max_depth=20, balance_classes=False)

    testing = load_testing()
    X_test = assemble_X(testing, weather)
    normalize(X_test, mean, std)
    xts = H2OFrame(python_obj=[row.tolist() for row in X_test])

    dp = dl.predict(xts)
    rp = rf.predict(xts)
    gbp = gb.predict(xts)
    gp = dp*0.35 + rp*0.3 + gbp*0.35
    gph = h2o.as_list(gp)

    n_rows = gp.nrow()
    Id = np.arange(n_rows+1)[1:].reshape(n_rows,1)
    df = pd.DataFrame(Id)
    # The column is literally named "True"; attribute access (gph.True) is a
    # syntax error on Python 3, so index by name instead.
    df_concat = pd.concat([df, gph['True']],axis=1)
    df_concat.columns = ['Id','WnvPresent']
    df_concat.to_csv("wnvh.csv",index=False)
def _evaluate(request):
    """
    Score every received row with the saved ``kmeans_iris`` model.

    The numeric values of all rows in all bundles are collected, scored in
    one call, and the integer prediction for each row is returned.

    :param request: an iterable sequence of RowData
    :return: yields a single ``SSE.BundledRows`` holding one row (with one
        numeric dual) per input row
    """
    params = []
    logging.info('_evaluate')
    logging.info('_evaluate request {}'.format(request))
    print(request)

    # Iterate over bundled rows
    for request_rows in request:
        print(request_rows.rows)
        print(len(request_rows.rows))
        logging.info('_evaluate request_rows {}'.format(request_rows.rows))
        # Iterating over rows
        for row in request_rows.rows:
            # One list of numeric values per row, one entry per received column
            param = [d.numData for d in row.duals]
            logging.info('_evaluate row {}'.format(param))
            params.append(param)

    h2o.init()
    # The model is stored next to this file; os.path.join keeps the path
    # valid on every platform instead of hard-coding a backslash.
    dir_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "kmeans_iris")
    results = h2o.load_model(dir_path)
    newData = h2o.H2OFrame(params)
    predictedNew = results.predict(newData)
    predicted_as_list = h2o.as_list(predictedNew, use_pandas=False)
    predicted_as_list.pop(0)  # drop the header row

    response_rows = []
    for result in predicted_as_list:
        # Create an iterable of Dual with a numerical value
        duals = iter([SSE.Dual(numData=int(result[0]))])
        # Append the row data constructed to response_rows
        response_rows.append(SSE.Row(duals=duals))

    logging.info('_evaluate params {}'.format(params))
    logging.info(
        '_evaluate predicted_as_list {}'.format(predicted_as_list))

    # Yield the row data constructed
    yield SSE.BundledRows(rows=response_rows)
def cal_vars_levels_amount(data_df, factor_var):
    """
    Count the rows of each group of ``factor_var``.

    :param data_df: frame supporting ``group_by``
    :param factor_var: grouping key passed to ``group_by(by=...)``
    :return: dict built from the rows of the grouped count frame
        (``dict()`` requires every row to have exactly two entries)
    """
    counts_frame = data_df.group_by(by=factor_var).count().frame
    rows = h2o.as_list(counts_frame, use_pandas=False, header=False)
    return dict(rows)
def check_values(h2o_data, np_data):
    """Compare the first ten rows of column 0 of ``h2o_data`` with ``np_data``.

    Every mismatch is reported on stdout; checking continues after a mismatch.

    :param h2o_data: frame indexed as ``h2o_data[i, 0]``
    :param np_data: sequence indexed as ``np_data[i]``
    :return: True when all ten values agree within 1e-06, else False
    """
    success = True
    for i in range(10):
        h2o_val = h2o.as_list(h2o_data[i,0])[0][0]
        num_val = np_data[i]
        # Written as "not <" so that a NaN difference also counts as a failure.
        if not abs(h2o_val - num_val) < 1e-06:
            success = False
            # Call form of print: valid on both Python 2 and Python 3.
            print("check unsuccessful! h2o computed {0} and numpy computed {1}".format(h2o_val,num_val))
    return success
def as_python_test():
    """Import three sample data sets, show each one, then print each via ``h2o.as_list``."""
    dataset_paths = (
        "smalldata/iris/iris_wheader.csv",
        "smalldata/prostate/prostate.csv.zip",
        "smalldata/airlines/allyears2k.zip",
    )
    # All imports happen first, then all show() calls, then all conversions.
    frames = [h2o.import_file(path=pyunit_utils.locate(relative_path))
              for relative_path in dataset_paths]

    for frame in frames:
        frame.show()

    for frame in frames:
        print(h2o.as_list(frame))
def vec_as_list(ip,port):
    """Check ``h2o.as_list`` on an iris column and on H2OVecs built from ``2 - iris``.

    :param ip: forwarded to ``h2o.init``
    :param port: forwarded to ``h2o.init``
    """
    # Connect to h2o
    h2o.init(ip,port)

    def _verify(rows, expectations):
        # Each expectation is (row index in the returned list, expected value).
        for row_idx, expected in expectations:
            assert abs(float(rows[row_idx][0]) - expected) < 1e-10, "incorrect values"

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    column0 = h2o.as_list(iris[0], use_pandas=False)
    _verify(column0, [(4, 4.6), (6, 5.4), (10, 4.9)])

    flipped = 2 - iris
    vec0 = H2OVec(name="C0", expr=flipped[0]._expr)
    _verify(h2o.as_list(vec0, use_pandas=False),
            [(4, -2.6), (18, -3.1), (25, -2.8)])

    vec1 = H2OVec(name="C1", expr=flipped[1]._expr)
    _verify(h2o.as_list(vec1, use_pandas=False),
            [(4, -1.1), (6, -1.9), (10, -1.1)])
def vec_as_list(ip, port):
    """Verify ``h2o.as_list`` values for a raw iris column and two derived H2OVecs.

    :param ip: forwarded to ``h2o.init``
    :param port: forwarded to ``h2o.init``
    """
    # Connect to h2o
    h2o.init(ip, port)

    tolerance = 1e-10
    message = "incorrect values"

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    res = h2o.as_list(iris[0], use_pandas=False)
    for position, wanted in ((4, 4.6), (6, 5.4), (10, 4.9)):
        assert abs(float(res[position][0]) - wanted) < tolerance, message

    res = 2 - iris
    res2 = h2o.as_list(H2OVec(name="C0", expr=res[0]._expr), use_pandas=False)
    for position, wanted in ((4, -2.6), (18, -3.1), (25, -2.8)):
        assert abs(float(res2[position][0]) - wanted) < tolerance, message

    res3 = h2o.as_list(H2OVec(name="C1", expr=res[1]._expr), use_pandas=False)
    for position, wanted in ((4, -1.1), (6, -1.9), (10, -1.1)):
        assert abs(float(res3[position][0]) - wanted) < tolerance, message
def vec_as_list(ip,port):
    """Check ``h2o.as_list`` (default arguments) on an iris column and derived H2OVecs.

    :param ip: forwarded to ``h2o.init``
    :param port: forwarded to ``h2o.init``
    """
    # Connect to h2o
    h2o.init(ip,port)

    def _verify(rows, expectations):
        # Values are compared as returned, without any float() conversion.
        for row_idx, expected in expectations:
            assert abs(rows[row_idx][0] - expected) < 1e-10, "incorrect values"

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    column0 = h2o.as_list(iris[0])
    _verify(column0, [(3, 4.6), (5, 5.4), (9, 4.9)])

    flipped = 2 - iris
    from_vec0 = h2o.as_list(H2OVec(name="C0", expr=flipped[0]))
    _verify(from_vec0, [(3, -2.6), (17, -3.1), (24, -2.8)])

    from_vec1 = h2o.as_list(H2OVec(name="C1", expr=flipped[1]))
    _verify(from_vec1, [(3, -1.1), (5, -1.9), (9, -1.1)])
def vec_as_list():
    """Check ``h2o.as_list`` on single columns of the iris frame and of ``2 - iris``."""
    iris = h2o.import_file(path=tests.locate("smalldata/iris/iris_wheader.csv"))

    def _close(rows, row_idx, expected):
        return abs(float(rows[row_idx][0]) - expected) < 1e-10

    res = h2o.as_list(iris[0], use_pandas=False)
    assert _close(res, 4, 4.6), "incorrect values"
    assert _close(res, 6, 5.4), "incorrect values"
    assert _close(res, 10, 4.9), "incorrect values"

    res = 2 - iris
    res2 = h2o.as_list(res[0], use_pandas=False)
    assert _close(res2, 4, -2.6), "incorrect values"
    assert _close(res2, 18, -3.1), "incorrect values"
    assert _close(res2, 25, -2.8), "incorrect values"

    res3 = h2o.as_list(res[1], use_pandas=False)
    assert _close(res3, 4, -1.1), "incorrect values"
    assert _close(res3, 6, -1.9), "incorrect values"
    assert _close(res3, 10, -1.1), "incorrect values"
def compare_frames(d1 = saving_meanImputed_fp, d2 = saving_modelImputed_fp, imputed = to_impute):
    """Load the mean-imputed and model-imputed frames and compute summary statistics.

    Quantiles, column means, medians and minima of the ``imputed`` columns
    are computed for both frames. The visible body does not return or print
    them.

    :param d1: path of the saved mean-imputed frame
    :param d2: path of the saved model-imputed frame
    :param imputed: columns to summarise
    """
    # Call form of print: valid on both Python 2 and Python 3.
    print("Comparing the resulting two matrices...")

    # Load the saved frames back in
    meanI = h2o.import_file(path = d1)
    modelI = h2o.import_file(path = d2)

    # A fresh list per call keeps the two quantile requests independent.
    def _probs():
        return [0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99]

    meanIquantiles = h2o.as_list(meanI[imputed].quantile(prob=_probs()))
    modelIquantiles = h2o.as_list(modelI[imputed].quantile(prob=_probs()))

    meanIcolmeans = [v.mean() for v in meanI[imputed]]
    modelIcolmeans = [v.mean() for v in modelI[imputed]]

    meanIcolmedians = [v.median() for v in meanI[imputed]]
    modelIcolmedians = [v.median() for v in modelI[imputed]]

    meanIcolmin = [v.min() for v in meanI[imputed]]
    modelIcolmin = [v.min() for v in modelI[imputed]]
def frame_as_list(ip,port):
    """Check ``h2o.as_list`` on the whole iris, prostate and airlines frames.

    For each frame the first three values of one row are compared with
    known numbers.

    :param ip: forwarded to ``h2o.init``
    :param port: forwarded to ``h2o.init``
    """
    # Connect to h2o
    h2o.init(ip,port)

    def _verify_row(rows, row_idx, expected_values):
        # expected_values[k] is compared with column k of the chosen row.
        for col_idx, expected in enumerate(expected_values):
            assert abs(rows[row_idx][col_idx] - expected) < 1e-10, "incorrect values"

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    prostate = h2o.import_frame(path=h2o.locate("smalldata/prostate/prostate.csv.zip"))
    airlines = h2o.import_frame(path=h2o.locate("smalldata/airlines/allyears2k.zip"))

    iris_rows = h2o.as_list(iris)
    _verify_row(iris_rows, 8, (4.4, 2.9, 1.4))

    prostate_rows = h2o.as_list(prostate)
    _verify_row(prostate_rows, 6, (7, 0, 68))

    airlines_rows = h2o.as_list(airlines)
    _verify_row(airlines_rows, 3, (1987, 10, 18))
def vec_slicing(ip,port):
    """Check integer indexing of ``2 - iris`` and row slicing of an iris column.

    :param ip: forwarded to ``h2o.init``
    :param port: forwarded to ``h2o.init``
    """
    # Connect to h2o
    h2o.init(ip,port)

    def _verify(rows, expectations):
        # Each expectation is (row index in the returned list, expected value).
        for row_idx, expected in expectations:
            assert abs(rows[row_idx][0] - expected) < 1e-10, "incorrect values"

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    iris.show()

    ###################################################################

    # H2OVec[int]
    flipped = 2 - iris
    column0 = h2o.as_list(H2OVec(name="C0", expr=flipped[0]))
    _verify(column0, [(3, -2.6), (17, -3.1), (24, -2.8)])

    # H2OVec[slice]
    sliced = iris[1][12:25]
    sliced_rows = h2o.as_list(sliced)
    _verify(sliced_rows, [(0, 3.0), (1, 3.0), (5, 3.5)])
def levels(self, col=None):
    """
    Get the factor levels for this frame and specified columns

    :param col: A column index in this H2OFrame; None uses the whole frame
    :return: A list of lists of strings that are the factor levels for columns.
    """
    if col is None:
        fr = self
    else:
        fr = self._newExpr("cols", self, col)
    columns = h2o.as_list(self._newExpr("levels", fr), False)
    for column in columns:
        column.pop(0)  # Remove column headers
    return columns
def check_values(h2o_data, numpy_data):
    """Spot-check ten random cells of ``h2o_data`` against ``numpy_data``.

    Cell coordinates are drawn from the module-level ``row`` and ``col``.
    Every mismatch is reported on stdout; checking continues after a mismatch.

    :return: True when every sampled cell agrees within 1e-06, else False
    """
    success = True
    for i in range(10):
        r = random.randint(0,row-1)
        c = random.randint(0,col-1)
        h2o_val = h2o.as_list(h2o_data[r,c])[0][0]
        num_val = numpy_data[r,c]
        # Written as "not <" so that a NaN difference also counts as a failure.
        if not abs(h2o_val - num_val) < 1e-06:
            success = False
            # Call form of print: valid on both Python 2 and Python 3.
            print("check unsuccessful! h2o computed {0} and numpy computed {1}".format(h2o_val,num_val))
    return success
def level(self, col=None):
    """
    Get the factor levels for this single column

    :param col: A column index in this H2OFrame; None uses the frame itself
    :return: a list of strings that are the factor levels for the one column.
    :raises ValueError: if the selection has more than one column
    """
    if col is None:
        fr = self
    else:
        fr = self._newExpr("cols", self, col)
    if fr.ncol > 1:
        raise ValueError("level takes only a single column")
    levels_frame = self._newExpr("levels", fr)
    column = h2o.as_list(levels_frame, False)[0]
    column.pop(0)  # Remove column header
    return column