def ls_test(ip, port):
    """Smoke-test ``h2o.ls()`` once the iris data set has been imported.

    :param ip: address handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    """
    # Attach to the H2O cluster before touching any data.
    h2o.init(ip, port)

    iris_csv = h2o.locate("smalldata/iris/iris.csv")
    # Keep a reference to the frame while the listing is taken.
    iris_frame = h2o.import_frame(path=iris_csv)
    h2o.ls()
def ls_test():
    """Import the iris CSV and list the keys known to the cluster."""
    iris_csv = pyunit_utils.locate("smalldata/iris/iris.csv")
    # Keep a reference to the frame while the listing is taken.
    iris_frame = h2o.import_file(path=iris_csv)
    h2o.ls()
def ls_test(ip, port):
    """Import the iris CSV and list the keys known to the cluster.

    :param ip: accepted for the test-runner signature; not used here.
    :param port: accepted for the test-runner signature; not used here.
    """
    iris_csv = h2o.locate("smalldata/iris/iris.csv")
    # Keep a reference to the frame while the listing is taken.
    iris_frame = h2o.import_file(path=iris_csv)
    h2o.ls()
def ls_test(ip, port):
    """Connect to H2O, import the iris CSV and list the cluster keys.

    :param ip: address handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    """
    h2o.init(ip, port)  # connect to h2o

    dataset_path = h2o.locate("smalldata/iris/iris.csv")
    # Keep a reference to the frame while the listing is taken.
    iris_frame = h2o.import_frame(path=dataset_path)
    h2o.ls()
def rename_things():
    """Rename a frame, a slice of it and a GBM model, printing ``h2o.ls()`` after each step."""
    prostate = h2o.import_file(tests.locate("smalldata/logreg/prostate.csv"))
    prostate.frame_id = "mooochooo"
    print(h2o.ls())

    # Rename a frame derived from the original one.
    column_slice = prostate[1:2]
    column_slice.show()
    column_slice.frame_id = "black_sheep_LLC"
    print(h2o.ls())

    from h2o.estimators.gbm import H2OGradientBoostingEstimator

    # Train a small model so that there is a model key to rename.
    gbm = H2OGradientBoostingEstimator(ntrees=5, max_depth=2)
    predictors = prostate.names[2:]
    response = prostate.names[1]
    gbm.train(x=predictors, y=response, training_frame=prostate)
    print(gbm.model_id)

    gbm.model_id = "my_gbm_model_wwwww"
    print(h2o.ls())
    # The model is looked up again through its new id.
    print(h2o.get_model("my_gbm_model_wwwww"))
    print(h2o.ls())
def pyunit_deep_copy():
    """Check that ``h2o.deep_copy`` yields frames independent of their source.

    Missing values are inserted into one original and into one copy; in both
    pairs the NA counts of original and duplicate must then differ.
    """
    prostate_csv = pyunit_utils.locate("smalldata/prostate/prostate.csv")
    pros_1 = h2o.import_file(prostate_csv)
    pros_2 = h2o.import_file(prostate_csv)

    pros_copy_1 = h2o.deep_copy(pros_1, "copy")
    pros_copy_2 = h2o.deep_copy(pros_2, "copy2")

    # Modify one original and one copy: with a deep copy, a change on either
    # side must not show up on the other side.
    pros_1.insert_missing_values()
    pros_copy_2.insert_missing_values()

    report = (
        ("Original Frame with inserted missing values:", pros_1),
        ("Duplicate Frame with no inserted missing values", pros_copy_1),
        ("Original Frame with no inserted missing values:", pros_2),
        ("Duplicate Frame with inserted missing values", pros_copy_2),
    )
    for caption, frame in report:
        print(caption)
        print(frame)

    print("Number of frames in session after deep_copy")
    print(h2o.ls())

    na_original_1 = pros_1.nacnt()
    na_duplicate_1 = pros_copy_1.nacnt()
    assert na_original_1 != na_duplicate_1, \
        "Inserted NA's into the original frame but the original seems to match the duplicates NA count!"

    na_original_2 = pros_2.nacnt()
    na_duplicate_2 = pros_copy_2.nacnt()
    assert na_original_2 != na_duplicate_2, \
        "Inserted NA's into the duplicate frame but the original seems to match the originals NA count!"
def save_all_frames(self, path, overwrite=False):
    """Export every frame listed by ``h2o.ls()`` to a directory.

    Keys containing ``modelmetrics`` are skipped, and so is any key that
    ``h2o.get_frame`` rejects with a response/server error or for which no
    frame id is available. Each remaining frame is written to
    ``path + os.sep + frame_id``.

    :param path: String path, where to save your frames.
    :param overwrite: boolean, overwrite an already exported frame
    """
    for key in h2o.ls()['key']:
        if 'modelmetrics' in key:
            continue

        try:
            frame = h2o.get_frame(key)
        except (h2o.exceptions.H2OResponseError,
                h2o.exceptions.H2OServerError):
            # The key does not resolve to a frame; nothing to export.
            continue

        # get_frame may hand back None (or an object without an id); skip
        # those explicitly instead of relying on a bare ``except``.
        frame_id = getattr(frame, 'frame_id', None)
        if frame_id is None:
            continue

        # Build the destination once so the message shows the path really used.
        destination = path + os.sep + frame_id
        print(frame_id)
        print("Save frame " + frame_id + " to " + destination)
        h2o.export_file(frame, path=destination, force=overwrite)
def h2ols():
    """
    Python API test: h2o.ls()

    Imports the iris data set and verifies that ``h2o.ls()`` returns a
    DataFrame whose first entry is the id of the imported frame.
    """
    iris_path = pyunit_utils.locate("smalldata/iris/iris.csv")
    iris = h2o.import_file(path=iris_path)

    listing = h2o.ls()
    # The listing must be a DataFrame.
    assert_is_type(listing, DataFrame)

    # The imported frame must be the first key of the listing.
    first_key = listing.values[0][0]
    expected_key = str(iris.frame_id)
    assert first_key == expected_key, \
        "Frame info iris.hex should have been found but h2o.ls() command failed."
def retain_keys_test():
    """Check which keys survive ``h2o.remove_all`` when a retain list is given.

    First both the frame and the model are listed and must still be
    retrievable; then only the frame is listed and fetching the model must
    fail with a "not found" response error.
    """
    airlines_csv = pyunit_utils.locate("smalldata/testng/airlines_train.csv")
    airlines = h2o.import_file(path=airlines_csv)

    def train_gbm():
        # One-tree GBM on the airlines frame; only its key matters here.
        estimator = H2OGradientBoostingEstimator(ntrees=1)
        estimator.train(x=["Origin", "Dest"],
                        y="IsDepDelayed",
                        training_frame=airlines)
        return estimator

    # Both keys are listed, so both must survive the wipe.
    gbm = train_gbm()
    retained = [airlines.frame_id, gbm.model_id]
    h2o.remove_all(retained)
    assert h2o.get_frame(airlines.frame_id) is not None
    assert h2o.get_model(gbm.model_id) is not None

    # Only the frame is listed this time, so the model must be gone.
    gbm = train_gbm()
    h2o.remove_all([airlines.frame_id])
    h2o.ls()
    try:
        h2o.get_model(gbm.model_id)
    except h2o.exceptions.H2OResponseError as err:
        assert "not found for argument: key" in err.args[0].dev_msg
    else:
        assert False
def test_target_encoding_regression():
    """Run AutoML with target-encoding preprocessing on a regression data set."""
    dataset = import_dataset(mode='regression')
    automl = H2OAutoML(project_name="automl_with_te_regression",
                       max_models=5,
                       preprocessing=['target_encoding'],
                       seed=1)
    automl.train(y=dataset.target,
                 training_frame=dataset.train,
                 leaderboard_frame=dataset.test)

    leaderboard = automl.leaderboard
    print(leaderboard)

    # The client cannot really verify that TE was applied correctly, so as a
    # poor man's check look for a target-encoding key on the cluster.
    cluster_keys = h2o.ls().key
    te_key_found = any(
        key.startswith("TargetEncoding_AutoML") for key in cluster_keys
    )
    assert te_key_found
def test_target_encoding_multiclass():
    """Run AutoML with target-encoding preprocessing on a multiclass data set.

    After the key check, every model named by
    ``get_partitioned_model_names(lb).all`` is passed to
    ``check_mojo_pojo_availability``.
    """
    dataset = import_dataset(mode='multiclass')
    automl = H2OAutoML(project_name="automl_with_te_multiclass",
                       max_models=5,
                       preprocessing=['target_encoding'],
                       seed=1)
    automl.train(y=dataset.target,
                 training_frame=dataset.train,
                 leaderboard_frame=dataset.test)

    leaderboard = automl.leaderboard
    print(leaderboard)

    # The client cannot really verify that TE was applied correctly, so as a
    # poor man's check look for a target-encoding key on the cluster.
    cluster_keys = h2o.ls().key
    te_key_found = any(
        key.startswith("TargetEncoding_AutoML") for key in cluster_keys
    )
    assert te_key_found

    model_ids = get_partitioned_model_names(leaderboard).all
    for model_id in model_ids:
        check_mojo_pojo_availability(model_id)
def wipe_h2o_cluster(self):
    """
    Remove every key listed by ``h2o.ls()`` from the H2O cluster.

    This is done to prevent old models or grid searches from accidentally
    being considered in current or future calculations. Removal is best
    effort: a key that cannot be removed is logged and skipped.
    """
    h_objects = h2o.ls()
    logging.info(h_objects)

    for key in h_objects['key']:
        # The server is throwing an H2OServer error when attempting to
        # delete the custom loss metric, so a failed removal must not abort
        # the wipe.
        # TODO: Why does this fail when deleting cost_matrix_loss_metric?
        try:
            h2o.remove(key)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # SystemExit still propagate; the traceback is attached to help
            # answer the TODO above.
            logging.info("Error while attempting to remove %s", key,
                         exc_info=True)
def list_keys_in_memory(project_name=None):
    """Classify the keys reported by ``h2o.ls()`` by naming pattern.

    :param project_name: when given, keys containing this string are left
        out of the ``automl`` group (and of the model groups derived from it).
    :returns: dict mapping a group name to the list of keys in that group;
        ``all`` holds the unfiltered key column.
    """
    every_key = h2o.ls().key

    def matching(pattern):
        # Keys from the full listing on which ``pattern`` is found.
        return [key for key in every_key if re.search(pattern, key)]

    automl = [
        key for key in matching(r'_AutoML_')
        if project_name is None or project_name not in key
    ]
    levelone_frames = matching(r'^levelone_')
    predictions = matching(r'(^|_)prediction_')
    metrics = matching(r'^modelmetrics_')
    metalearners = matching(r'^metalearner')
    folds = matching(r'_fold_')

    # Whatever AutoML key is not a frame, prediction, metric or fold key is
    # treated as a model key.
    not_models = levelone_frames + predictions + metrics + folds
    all_models = [key for key in automl if key not in not_models]

    cv_all = matching(r'(^|_)cv_')
    cv_predictions = [key for key in cv_all if key in predictions]
    cv_metrics = [key for key in cv_all if key in metrics]
    cv_folds = [key for key in cv_all if key in folds]
    cv_models = [
        key for key in cv_all
        if key in all_models and key not in cv_folds
    ]
    base_models = [
        key for key in all_models
        if not (key in cv_all or key in metalearners)
    ]

    return {
        'all': every_key,
        'models_all': all_models,
        'models_base': base_models,
        'predictions': predictions,
        'metrics': metrics,
        'automl': automl,
        'cv_all': cv_all,
        'cv_models': cv_models,
        'cv_predictions': cv_predictions,
        'cv_metrics': cv_metrics,
        'cv_fold_assignment': cv_folds,
        'metalearners': metalearners,
    }
def benchmark(model, model_name, params, initTime, trainTime):
    """Score ``model`` on the three module-level data sets and persist one result row.

    The row starts with cluster/run information and the init/train durations,
    followed by 21 metric values for each of ``trdata``, ``vadata`` and
    ``tedata``. Afterwards every cluster key not matched by ``keep_frames``
    is removed.

    :param model: trained model exposing ``model_performance(test_data=...)``.
    :param model_name: label stored in the row.
    :param params: model parameters; stored as ``str(params)``.
    :param initTime: timedelta spent initialising the model.
    :param trainTime: timedelta spent training the model.
    """
    row = [
        config.cluster, config.nthreads, experimentName, -1, model_name,
        str(params),
        initTime.total_seconds(),
        trainTime.total_seconds(), 0, 0
    ]
    # The two trailing zeros are placeholders for metrics time and total time.
    metricsIdx = len(row) - 2

    # Metric accessors, in the column order of the result row.
    metric_names = (
        "F1", "fnr", "fpr", "tnr", "tpr", "precision", "recall",
        "sensitivity", "specificity", "aic", "auc", "logloss",
        "mean_residual_deviance", "mse", "null_degrees_of_freedom",
        "null_deviance", "r2", "residual_degrees_of_freedom",
        "residual_deviance",
    )

    metricsStart = dt.now()
    for data in [trdata, vadata, tedata]:
        metrics = model.model_performance(test_data=data)
        err = get_classification_error(metrics)
        appendVal(row, lambda: 1 - err)
        appendVal(row, lambda: err)
        for metric_name in metric_names:
            # Attribute lookup and call stay inside the lambda so that
            # appendVal is the one evaluating them, as before.
            appendVal(row,
                      lambda metric_name=metric_name: getattr(
                          metrics, metric_name)())
    metricsTime = dt.now() - metricsStart

    row[metricsIdx] = metricsTime.total_seconds()
    row[metricsIdx + 1] = (initTime + trainTime + metricsTime).total_seconds()

    # Replace NaN numbers and "NaN" strings by None. A real list is built:
    # on Python 3 ``map`` would only give a lazy, one-shot iterator.
    row = [
        None
        if (isinstance(x, numbers.Number) and np.isnan(x)) or x == "NaN"
        else x
        for x in row
    ]
    persist(row)

    # DataFrame.as_matrix() no longer exists in pandas >= 1.0; read the
    # 'key' column of the listing directly instead.
    for frame in h2o.ls()['key']:
        if not keep_frames.match(frame):
            h2o.remove(frame)
def list_keys_in_memory():
    """Classify the keys reported by ``h2o.ls()`` for AutoML tests.

    :returns: dict mapping a group name to the list of keys in that group;
        ``all`` holds the unfiltered key column.
    """
    every_key = h2o.ls().key

    def matching(pattern):
        # Keys from the full listing on which ``pattern`` is found.
        return [key for key in every_key if re.search(pattern, key)]

    automl = matching(r'_AutoML_')
    predictions = matching(r'(^|_)prediction_')
    metrics = matching(r'^modelmetrics_')
    # An AutoML key that is neither a prediction nor a metric is a model.
    models = [
        key for key in automl
        if not (key in predictions or key in metrics)
    ]

    cv_all = matching(r'(^|_)cv_')

    return {
        'all': every_key,
        'models': models,
        'predictions': predictions,
        'metrics': metrics,
        'automl': automl,
        'cv_all': cv_all,
        'cv_models': [key for key in cv_all if key in models],
        'cv_predictions': [key for key in cv_all if key in predictions],
        'cv_metrics': [key for key in cv_all if key in metrics],
    }
def list_keys_in_memory():
    """Classify the keys reported by ``h2o.ls()`` for GBM grid tests.

    :returns: dict mapping a group name to the list of keys in that group;
        ``all`` holds the unfiltered key column.
    """
    every_key = h2o.ls().key

    grid_gbm_re = re.compile(r'^Grid_GBM_.*_model_\d+(_|$)')
    prediction_re = re.compile(r'(^|_)prediction_')
    metrics_re = re.compile(r'^modelmetrics_')
    cv_re = re.compile(r'(^|_)cv_')

    gbm = [key for key in every_key if grid_gbm_re.search(key)]
    predictions = [key for key in every_key if prediction_re.search(key)]
    metrics = [key for key in every_key if metrics_re.search(key)]
    # A grid key that is neither a prediction nor a metric is a model.
    excluded = predictions + metrics
    models = [key for key in gbm if key not in excluded]

    cv_all = [key for key in every_key if cv_re.search(key)]
    cv_predictions = [key for key in cv_all if key in predictions]
    cv_metrics = [key for key in cv_all if key in metrics]
    cv_models = [key for key in cv_all if key in models]

    return {
        'all': every_key,
        'models': models,
        'predictions': predictions,
        'metrics': metrics,
        'gbm': gbm,
        'cv_all': cv_all,
        'cv_models': cv_models,
        'cv_predictions': cv_predictions,
        'cv_metrics': cv_metrics,
    }
def list_keys_in_memory():
    """Classify the keys reported by ``h2o.ls()`` for GBM grid tests.

    :returns: dict mapping a group name to the list of keys in that group;
        ``all`` holds the unfiltered key column.
    """
    every_key = h2o.ls().key

    # One pass over the listing; a key may land in several groups.
    gbm, predictions, metrics, cv_all = [], [], [], []
    for key in every_key:
        if re.search(r'^Grid_GBM_.*_model_\d+(_|$)', key):
            gbm.append(key)
        if re.search(r'(^|_)prediction_', key):
            predictions.append(key)
        if re.search(r'^modelmetrics_', key):
            metrics.append(key)
        if re.search(r'(^|_)cv_', key):
            cv_all.append(key)

    # A grid key that is neither a prediction nor a metric is a model.
    models = []
    for key in gbm:
        if key in predictions or key in metrics:
            continue
        models.append(key)

    # Split the cross-validation keys along the same groups.
    cv_predictions, cv_metrics, cv_models = [], [], []
    for key in cv_all:
        if key in predictions:
            cv_predictions.append(key)
        if key in metrics:
            cv_metrics.append(key)
        if key in models:
            cv_models.append(key)

    return {
        'all': every_key,
        'models': models,
        'predictions': predictions,
        'metrics': metrics,
        'gbm': gbm,
        'cv_all': cv_all,
        'cv_models': cv_models,
        'cv_predictions': cv_predictions,
        'cv_metrics': cv_metrics,
    }
def turn_on():
    """Turn on H2O via ``h2o.init()`` and request the key listing."""
    h2o.init()
    # The listing is requested right after start-up; its result is not used.
    h2o.ls()
def execute(self, name, x, y, training_frame, validation_frame, test_frame,
            subset_coef):
    """Train an h2oEnsemble model through R for every parameter set and collect metrics.

    For each entry of ``self.params_grid`` (or once with the R defaults when
    the grid is None or empty) an ensemble is trained, predictions are made
    for the three frames, and one result row is persisted and collected.

    :param name: experiment name stored in each row.
    :param x: predictors, converted with ``toR`` before being passed to R.
    :param y: name of the response column.
    :param training_frame: H2OFrame used for training.
    :param validation_frame: H2OFrame used for validation.
    :param test_frame: H2OFrame used for testing.
    :param subset_coef: value stored in each row.
    :returns: ``pandas.DataFrame`` with one row per parameter set and
        ``config.Names`` as columns.
    """
    # Decide on the grid before building it: ParameterGrid(None) is rejected
    # by current scikit-learn releases.
    if self.params_grid is None or len(self.params_grid) == 0:
        params = ["default"]
    else:
        params = grid.ParameterGrid(self.params_grid)

    results = []
    dt = datetime.datetime

    # R stuff
    ri.initr()
    h2or = importr("h2o")
    h2o_ensemble = importr("h2oEnsemble")
    base = importr("base")
    stats = importr("stats")
    cvauc = importr("cvAUC")
    h2or.h2o_init(ip=config.hostname, port=config.port, startH2O=False)

    # Add some base learners
    with open("{}/R/wrappers.r".format(os.path.dirname(__file__)), "r") as f:
        ro.r("\n".join(f.readlines()))

    # Keys that must survive the cleanup after each run. Frame ids are
    # escaped so that characters such as "." are matched literally.
    keep_frames = re.compile("|".join([
        re.escape(training_frame.frame_id),
        re.escape(validation_frame.frame_id),
        re.escape(test_frame.frame_id),
    ]) + "|.*\\.hex|py_.*")

    for p in params:
        row = [
            config.cluster, config.nthreads, name, subset_coef, self.name,
            str(p)
        ]

        # Initialize the model: fetch R handles for the three frames.
        init_start = dt.now()
        train = h2or.h2o_getFrame(training_frame.frame_id)
        valid = h2or.h2o_getFrame(validation_frame.frame_id)
        test = h2or.h2o_getFrame(test_frame.frame_id)
        init_time = dt.now() - init_start

        # Train the model
        train_start = dt.now()
        if p == "default":
            model = h2o_ensemble.h2o_ensemble(x=toR(x),
                                              y=y,
                                              training_frame=train,
                                              validation_frame=valid)
        else:
            p = {k: toR(v) for k, v in p.items()}
            model = h2o_ensemble.h2o_ensemble(x=toR(x),
                                              y=y,
                                              training_frame=train,
                                              validation_frame=valid,
                                              **p)
        train_time = dt.now() - train_start

        # Model metrics: predict in R, then fetch the prediction frames by id.
        metrics_start = dt.now()
        RpredTrain = stats.predict(model, train)
        RpredValid = stats.predict(model, valid)
        RpredTest = stats.predict(model, test)
        predTrain = h2o.get_frame(h2or.h2o_getId(RpredTrain.rx2("pred"))[0])
        predValid = h2o.get_frame(h2or.h2o_getId(RpredValid.rx2("pred"))[0])
        predTest = h2o.get_frame(h2or.h2o_getId(RpredTest.rx2("pred"))[0])
        metrics_time = dt.now() - metrics_start

        row.append(init_time.total_seconds())
        row.append(train_time.total_seconds())
        row.append(metrics_time.total_seconds())
        row.append((init_time + train_time + metrics_time).total_seconds())

        datasets = [(RpredTrain, predTrain, train, training_frame),
                    (RpredValid, predValid, valid, validation_frame),
                    (RpredTest, predTest, test, test_frame)]
        for pred_r_ptr, pred_py_ptr, data_r_ptr, data_py_ptr in datasets:
            # Classification-only metrics stay None for non-factor targets,
            # so ``auc`` is always bound when the row is filled in.
            acc = None
            err = None
            auc = None
            mse = ((pred_py_ptr - data_py_ptr[y])**2).mean()[0]
            if training_frame[y].isfactor()[0]:
                acc = (pred_py_ptr == data_py_ptr[y]).mean()[0]
                err = 1.0 - acc
                auc = cvauc.AUC(
                    base.attr(pred_r_ptr.rx2("pred"), "data")[2],
                    base.attr(data_r_ptr, "data").rx2(y))[0]

            # TODO: Add more metrics
            row.extend([
                acc,
                err,
                None,  # F1()
                None,  # fnr()
                None,  # fpr()
                None,  # tnr()
                None,  # tpr()
                None,  # precision()
                None,  # recall()
                None,  # sensitivity()
                None,  # specificity()
                None,  # aic()
                auc,  # auc()
                None,  # logloss()
                None,  # mean_residual_deviance()
                mse,  # mse()
                None,  # null_degrees_of_freedom()
                None,  # null_deviance()
                None,  # r2()
                None,  # residual_degrees_of_freedom()
                None,  # residual_deviance()
            ])
            h2o.remove(pred_py_ptr)

        # Replace NaN numbers and "NaN" strings by None. A real list is built:
        # on Python 3 ``map`` is a one-shot iterator that persist() would
        # exhaust before the row reaches the result DataFrame.
        row = [
            None
            if (isinstance(v, numbers.Number) and np.isnan(v)) or v == "NaN"
            else v
            for v in row
        ]
        persist(row)
        results.append(row)

        # Drop everything this run left on the cluster. The 'key' column is
        # read directly because DataFrame.as_matrix() is gone in pandas >= 1.0.
        for frame in h2o.ls()['key']:
            if not keep_frames.match(frame):
                h2o.remove(frame)

    df = pd.DataFrame(results, columns=config.Names)
    return df
def fn_logistic(df_train, df_test):
    """Fit the tuned binomial GLM on H2O and write test-set predictions.

    Pipeline: drop numeric variables with too many ``-99`` placeholders,
    mean-impute the remaining placeholders on the training data, spline
    high-cardinality numerics, leave-one-out encode high-cardinality
    categoricals, add pairwise categorical interactions and numeric ratios,
    then train a GLM with hard-coded hyper-parameters on folds 0-8 and score
    the test set.

    Side effects: connects to H2O on port 54321, removes every key from the
    cluster, saves the model under ``../PData/`` and writes
    ``../POutput/best_glm_250k.csv``. Warnings are silenced process-wide.
    The caller's frames are not modified; the function works on copies.

    :param df_train: pandas DataFrame with ``unique_id``, ``target`` and the
        predictor columns.
    :param df_test: pandas DataFrame with the same columns except ``target``.
    :returns: list ``[model, h2o_test_frame, 0.802]`` - the trained
        H2OGeneralizedLinearEstimator, the H2OFrame that was scored, and the
        hard-coded Kaggle public leaderboard score.
    """
    ############################# Import packages #############################
    import warnings
    # needed for fn_computeRatiosOfNumerics()
    from itertools import permutations

    import numpy as np
    import pandas as pd
    import h2o
    from h2o.estimators.glm import H2OGeneralizedLinearEstimator
    # category encoders
    from category_encoders import LeaveOneOutEncoder

    # Stops the output of warnings when running models on test data which
    # have different factor levels than the train data. Not best practice,
    # but makes the output more readable.
    warnings.filterwarnings('ignore')

    # Work on copies so the caller's frames keep their columns and values.
    df_train = df_train.copy()
    df_test = df_test.copy()

    ################################ Functions ################################
    def fn_spline_knots(x):
        """Return the unique 10/20/40/60/80/90th percentiles of the non-zero values of ``x``."""
        values = x.values
        # hack: remove zeros to avoid issues where lots of values are zero
        nonzero = values[values != 0]
        return np.unique(np.percentile(nonzero, [10, 20, 40, 60, 80, 90]))

    def fn_tosplines(x, ptiles):
        """Return ``x`` plus one hinge column ``max(0, x - knot)`` per knot, indexed like ``x``."""
        values = x.values
        df_ptiles = pd.DataFrame({x.name: values}, index=x.index)
        for idx, ptile in enumerate(ptiles):
            df_ptiles[x.name + '_' + str(idx)] = np.maximum(0, values - ptile)
        return df_ptiles

    def fn_computeRatiosOfNumerics(df, variables):
        """Return a frame with ``ratio_<a>.<b>`` = ``df[a] / df[b]`` for every ordered pair of ``variables``."""
        ratios = {}
        for numerator, denominator in permutations(variables, 2):
            colname = 'ratio_{}.{}'.format(numerator, denominator)
            # Series division never raises ZeroDivisionError; it yields
            # inf/nan, which are zeroed below.
            ratios[colname] = df[numerator] / df[denominator]
        df_ratios = pd.DataFrame(ratios, index=df.index)
        # A zero denominator gives +/-inf (or nan when the numerator is zero
        # too); a zero coefficient is wanted in both cases.
        return df_ratios.replace([np.inf, -np.inf, np.nan], 0)

    def fn_createInteractions(df, factors):
        """Return the pairwise H2O interactions of the ``factors`` columns of ``df`` as a pandas frame."""
        h2o_df_temp = h2o.H2OFrame(df[factors],
                                   destination_frame='df_interactions_temp')
        h2o_df_temp = h2o_df_temp.interaction(factors,
                                              pairwise=True,
                                              max_factors=100,
                                              min_occurrence=1)
        df_interactions = h2o_df_temp.as_data_frame(use_pandas=True)
        # Rows come back in input order; restore the caller's index so the
        # column assignment below aligns row by row.
        df_interactions.index = df.index
        return df_interactions

    ############################ DEFINE VARIABLES #############################
    vars_all = df_train.columns.values
    var_dep = ['target']
    vars_notToUse = ['unique_id']
    vars_ind = [
        var for var in vars_all if var not in (vars_notToUse + var_dep)
    ]
    # find the categorical vars - this includes the hccv
    vars_ind_categorical = list(
        df_train.columns[df_train.dtypes == 'category'])
    # find numeric vars
    vars_ind_numeric = [
        var for var in vars_ind if var not in vars_ind_categorical
    ]

    ## GET HCCV VARS
    # Categorical variables whose cardinality exceeds this threshold are
    # treated as high-cardinality; edit the threshold here if needed.
    th_card = 30
    srs_card = df_train[vars_ind_categorical].nunique()
    vars_ind_hccv = srs_card[srs_card > th_card].index.values.tolist()

    # for convenience store dependent variable as y
    y = df_train[var_dep].values.ravel()

    ######################## Fold assignment ##################################
    # Ten random folds with a fixed seed; folds 0-8 form the design set used
    # to fit the encoder and the model, fold 9 is held out.
    rng = np.random.RandomState(2020)
    fold = pd.Series(rng.randint(0, 10, df_train.shape[0]),
                     index=df_train.index)
    idx_design = fold.isin(range(9))

    ################### Start and connect the H2O JVM ########################
    # The variable-importance analysis behind the hard-coded variable lists
    # below was done on previously saved models (GLM_model_basic,
    # GLM_model_basic_meanImpute, GLM_model_numeric_meanImpute in PData).
    # As this function may only take the train and test data as input,
    # those models are not loaded here.
    h2o.init(port=54321)

    ########################## DEAL WITH MISSINGS ############################
    # Missing numerics are encoded with the placeholder -99.
    # Share of placeholder values per column:
    srs_missing = (df_train == -99).sum(axis=0) / len(df_train)

    # Variables with at least this share of missings are not used
    # (arbitrary 50% threshold; could be tuned).
    missings_th = 0.5
    many_missings = [
        var for var in df_train.columns.values
        if srs_missing[var] >= missings_th
    ]
    vars_notToUse.extend(many_missings)
    # de-duplicate
    vars_notToUse = list(set(vars_notToUse))
    vars_ind_numeric = [
        var for var in vars_ind_numeric if var not in vars_notToUse
    ]

    ### MEAN-IMPUTE MISSINGS
    vars_toImpute = [
        var for var in srs_missing[srs_missing > 0].index.tolist()
        if var not in many_missings
    ]
    # Masking turns the placeholders into NaN, which fillna then replaces
    # with the column mean of the observed values.
    df_temp = df_train[vars_toImpute][df_train[vars_toImpute] != -99].copy()
    df_temp.fillna(df_temp.mean(), inplace=True)
    df_train[vars_toImpute] = df_temp

    #################### SPLINE HIGH CARDINALITY NUMERICS ####################
    # Attempt at capturing non-linear relationships: spline numeric
    # variables with more than 8 distinct values.
    vars_ind_tospline = df_train[vars_ind_numeric].columns[(
        df_train[vars_ind_numeric].nunique() > 8)].tolist()

    # Knots are found on the train data only and then applied to the values
    # of each data set, even if the test distribution is very different.
    for var in vars_ind_tospline:
        ptiles = fn_spline_knots(df_train[var])
        print(var, ptiles)
        df_ptiles_train = fn_tosplines(df_train[var], ptiles)
        df_ptiles_test = fn_tosplines(df_test[var], ptiles)

        df_train = pd.concat(
            [df_train.drop(columns=[var]), df_ptiles_train],
            axis=1,
            sort=False)
        df_test = pd.concat(
            [df_test.drop(columns=[var]), df_ptiles_test],
            axis=1,
            sort=False)

        vars_ind_numeric.remove(var)
        vars_ind_numeric.extend(df_ptiles_train.columns.tolist())

    ############################ DEAL WITH HCCVs #############################
    # Any modification made to the train data must also be made to the test
    # data (engineered columns etc).
    enc = LeaveOneOutEncoder(cols=vars_ind_hccv, sigma=0.3)
    enc.fit(df_train[idx_design], y[idx_design.values])
    df_train = enc.transform(df_train)
    # The encoder was fitted on a frame containing 'target', so the test
    # frame temporarily gets a NaN target column to have the same shape.
    df_test['target'] = np.nan
    df_test = enc.transform(df_test)
    df_test.drop(columns='target', inplace=True)

    ############################## INTERACTIONS ##############################
    # Five most important categorical variables, taken from the largest
    # coefficients (varimp) of the previously run basic GLM and hard-coded
    # because only the train and test data may be passed in.
    vars_mostImp_cat = ['f09', 'f03', 'f07', 'f27', 'e11']

    df_train_interactions = fn_createInteractions(df_train, vars_mostImp_cat)
    df_test_interactions = fn_createInteractions(df_test, vars_mostImp_cat)
    df_train[df_train_interactions.columns.values] = df_train_interactions
    df_test[df_test_interactions.columns.values] = df_test_interactions
    # the interaction columns are categorical predictors
    vars_ind_categorical.extend(df_train_interactions.columns.tolist())

    ############################# OTHER FEATURES #############################
    # Ratios between the three most important numeric variables (again
    # hard-coded from the previously run model).
    vars_mostImp_numeric = ['f11', 'f11_0', 'f11_1']

    df_temp_train = fn_computeRatiosOfNumerics(df_train,
                                               vars_mostImp_numeric)
    df_temp_test = fn_computeRatiosOfNumerics(df_test, vars_mostImp_numeric)
    df_train[df_temp_train.columns.values] = df_temp_train
    df_test[df_temp_test.columns.values] = df_temp_test
    vars_ind_numeric.extend(df_temp_train.columns.tolist())

    ######################### LOAD DATA INTO H2O JVM #########################
    # Remove all data previously loaded in the JVM as it is no longer needed.
    for key in h2o.ls()['key']:
        h2o.remove(key)

    features = vars_ind_numeric + vars_ind_categorical
    h2o_df_train = h2o.H2OFrame(df_train[features + var_dep],
                                destination_frame='df_train')
    h2o_df_test = h2o.H2OFrame(df_test[features],
                               destination_frame='df_test')

    # Change target to enum type as we are building a classification model.
    h2o_df_train[var_dep] = h2o_df_train[var_dep].asfactor()

    # 0/1 mask selecting the design rows inside H2O.
    idx_h2o_design = h2o.H2OFrame(idx_design.astype('int').values)

    ############################### MODELLING ################################
    # Hyper-parameters come from a random-discrete H2OGridSearch that takes
    # about 1h to run and is therefore not repeated here. Its setup was:
    #   alpha  in [0, 0.5, 0.99]
    #   lambda in [16. * 2.**-i for i in np.arange(15)]
    #   strategy 'RandomDiscrete', max_runtime_secs 3600, grid_id 'glm_grid'
    #   same estimator settings as below, trained on the design rows,
    #   results sorted by AUC (decreasing)
    # Best model: glm_grid_model_38, alpha 0.0, lambda 9.765625E-4,
    # auc 0.8595786171889577.
    #
    # family is binomial as this is a two-class classification model; per
    # the H2O documentation the logit link must be used with it.
    # missing_values_handling='mean_imputation' deals with categorical
    # levels not seen in training.
    # keep_cross_validation_* are False to save memory in the H2O cluster.
    model = H2OGeneralizedLinearEstimator(
        alpha=0.00,
        family='binomial',
        link='logit',
        lambda_=9.765625E-4,
        nfolds=10,
        seed=2020,
        keep_cross_validation_models=False,
        keep_cross_validation_predictions=False,
        keep_cross_validation_fold_assignment=False,
        missing_values_handling='mean_imputation')

    # notification of progress when running the function
    print('Estimating GLM model...')
    # Computation (wall) time: 3min 3s
    model.train(x=features,
                y='target',
                training_frame=h2o_df_train[idx_h2o_design, :])

    ### Save the model
    dirPData = '../PData/'
    dirPOutput = '../POutput/'
    best_glm = model
    best_glm_path = h2o.save_model(model=best_glm, path=dirPData, force=True)
    print(best_glm_path)

    ## MAKE PREDICTIONS ON TEST DATASET
    temp_preds = best_glm.predict(h2o_df_test)

    ### Export predictions to kaggle-required format
    # Predictions come back in row order; assign them positionally so the
    # index of df_test does not matter.
    predicted = temp_preds[2].as_data_frame().iloc[:, 0].values
    df_test['Predicted'] = np.round(predicted, 5)
    df_test[['unique_id', 'Predicted']].to_csv(dirPOutput +
                                               'best_glm_250k.csv',
                                               index=False)
    #### KAGGLE AUCROC PUBLIC LEADERBOARD SCORE: 0.80162

    # The H2O cluster is not shut down, as it is unclear whether that would
    # invalidate the handles returned below.

    # - trained H2OGeneralizedLinearEstimator object
    # - handle to the H2OFrame fed to the model when making predictions
    # - Kaggle public leaderboard score, hard-coded to 3 dp
    return [best_glm, h2o_df_test, 0.802]
# REPL-style walkthrough: import a CSV into H2O, slice it, re-assign it under
# a chosen id, and list/remove cluster objects along the way.
# The bare `data.frame_id` and `h2o.ls()` expressions only display a value
# when the lines are entered interactively; run as a script they print nothing.
import h2o

h2o.init()

# Base URL the sample CSV is fetched from.
datasets = "https://raw.githubusercontent.com/DarrenCook/h2o/bk/datasets/"
data = h2o.import_file(datasets + "iris_wheader.csv")
data.frame_id  # id of the freshly imported frame

# All rows, columns from index 1 onward; `data` is re-bound to the slice.
data = data[:, 1:]
data.frame_id  # id of the sliced frame

# Re-bind `data` to the result of assigning the slice the id "iris".
data = h2o.assign(data, "iris")
data.frame_id

h2o.ls()
# NOTE(review): presumably "iris_wheader.hex" is the key created by the
# import above -- confirm against the first `frame_id` output.
h2o.remove("iris_wheader.hex")
h2o.ls()
def pubdev_6603():
    """Check that removing a list of frames leaves nothing listed by h2o.ls().

    A two-row frame is uploaded, split into two parts (ratio 0.5, seed 1),
    and the source frame plus both parts are passed to a single
    ``h2o.remove`` call as a list. The listing is then asserted to be empty.
    """
    records = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
    source = h2o.H2OFrame(pd.DataFrame(records))

    parts = source.split_frame(ratios=[0.5], seed=1)
    first, second = parts

    # One call with a list, rather than one call per frame.
    h2o.remove([source, first, second])

    remaining = h2o.ls()
    assert len(remaining) == 0
def weights_check():
    """Check that GBM observation weights behave like row replication.

    Three scenarios are run on the cars data set; each compares models
    trained on a frame without weights against models trained on a frame
    that carries a "weights" column:

    * a uniform weight on every row vs. no weights,
    * zero weights vs. dropping the zero-weighted rows,
    * weight 2 vs. physically appending a copy of those rows.

    Raises:
        AssertionError: if any paired metric differs by more than the
            relative tolerance used in ``check_same``.
    """

    def check_same(data1, data2, min_rows_scale):
        """Train regression, binomial and multinomial GBM pairs and compare.

        Args:
            data1: frame used for the unweighted models.
            data2: frame with a "weights" column, used for the weighted models.
            min_rows_scale: multiplier applied to ``min_rows=5`` for the
                weighted models.

        Raises:
            AssertionError: if the MSEs differ by more than 1e-6 (relative)
                or the AUCs by more than 3e-4 (relative).
        """
        predictors = ["displacement", "power", "weight", "acceleration", "year"]
        predictors_w = predictors + ["weights"]

        gbm1_regression = h2o.gbm(x=data1[predictors],
                                  y="economy",
                                  training_frame=data1,
                                  min_rows=5,
                                  ntrees=5,
                                  max_depth=5)
        # NOTE(review): this call passes frame slices for y/weights_column
        # while the other weighted calls pass the column name plus
        # training_frame -- left exactly as found.
        gbm2_regression = h2o.gbm(x=data2[predictors_w],
                                  y=data2["economy"],
                                  min_rows=5 * min_rows_scale,
                                  weights_column=data2["weights"],
                                  ntrees=5,
                                  max_depth=5)
        gbm1_binomial = h2o.gbm(x=data1[predictors],
                                y=data1["economy_20mpg"],
                                min_rows=5,
                                distribution="bernoulli",
                                ntrees=5,
                                max_depth=5)
        gbm2_binomial = h2o.gbm(x=data2[predictors_w],
                                y=data2["economy_20mpg"],
                                weights_column="weights",
                                training_frame=data2,
                                min_rows=5 * min_rows_scale,
                                distribution="bernoulli",
                                ntrees=5,
                                max_depth=5)
        gbm1_multinomial = h2o.gbm(x=data1[predictors],
                                   y=data1["cylinders"],
                                   min_rows=5,
                                   distribution="multinomial",
                                   ntrees=5,
                                   max_depth=5)
        gbm2_multinomial = h2o.gbm(x=data2[predictors_w],
                                   y=data2["cylinders"],
                                   weights_column="weights",
                                   training_frame=data2,
                                   min_rows=5 * min_rows_scale,
                                   distribution="multinomial",
                                   ntrees=5,
                                   max_depth=5)

        reg1_mse = gbm1_regression.mse()
        reg2_mse = gbm2_regression.mse()
        bin1_auc = gbm1_binomial.auc()
        bin2_auc = gbm2_binomial.auc()
        mul1_mse = gbm1_multinomial.mse()
        mul2_mse = gbm2_multinomial.mse()

        print("MSE (regresson) no weights vs. weights: {0}, {1}".format(reg1_mse, reg2_mse))
        print("AUC (binomial) no weights vs. weights: {0}, {1}".format(bin1_auc, bin2_auc))
        print("MSE (multinomial) no weights vs. weights: {0}, {1}".format(mul1_mse, mul2_mse))

        assert abs(reg1_mse - reg2_mse) < 1e-6 * reg1_mse, \
            "Expected mse's to be the same, but got {0}, and {1}".format(reg1_mse, reg2_mse)
        assert abs(bin1_auc - bin2_auc) < 3e-4 * bin1_auc, \
            "Expected auc's to be the same, but got {0}, and {1}".format(bin1_auc, bin2_auc)
        # Compare the unweighted model against the *weighted* one; comparing
        # mul1_mse with itself would make this check vacuous.
        assert abs(mul1_mse - mul2_mse) < 1e-6 * mul1_mse, \
            "Expected mse's to be the same, but got {0}, and {1}".format(mul1_mse, mul2_mse)

    h2o_cars_data = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    h2o_cars_data["economy_20mpg"] = h2o_cars_data["economy_20mpg"].asfactor()
    h2o_cars_data["cylinders"] = h2o_cars_data["cylinders"].asfactor()

    # uniform weights same as no weights
    random.seed(2222)
    weight = 3  # random.randint(1,10) FIXME: RNG changed from py2->3, so force in the rando behavior from 2
    uniform_weights = [[weight]] * 406  # 406 single-element rows
    h2o_uniform_weights = h2o.H2OFrame(uniform_weights)
    h2o_uniform_weights.set_names(["weights"])
    h2o_data_uniform_weights = h2o_cars_data.cbind(h2o_uniform_weights)

    print("Checking that using uniform weights is equivalent to no weights:")
    print()
    # min_rows is scaled by the same factor as the weights.
    check_same(h2o_cars_data, h2o_data_uniform_weights, weight)

    # zero weights same as removed observations
    # Hard-coded; originally generated with:
    #   [[0 if random.randint(0,1) else 1 for r in range(406)]]
    zero_weights = [[
        1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
        1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ]]
    print("ZERO WEIGHTS: " + str(zero_weights))
    # zip(*...) transposes the single row into one-element rows.
    h2o_zero_weights = h2o.H2OFrame(list(zip(*zero_weights)))
    h2o_zero_weights.set_names(["weights"])
    h2o_data_zero_weights = h2o_cars_data.cbind(h2o_zero_weights)
    h2o_data_zeros_removed = h2o_cars_data[h2o_zero_weights["weights"] == 1]

    print("Checking that using some zero weights is equivalent to removing those observations:")
    print()
    check_same(h2o_data_zeros_removed, h2o_data_zero_weights, 1)

    # doubled weights same as doubled observations
    # Hard-coded; originally generated with:
    #   [[1 if random.randint(0,1) else 2 for r in range(406)]]
    doubled_weights = [[
        2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 2,
        2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 1, 1, 1, 1
    ]]
    print("DOUBLED WEIGHTS: " + str(doubled_weights))
    h2o_doubled_weights = h2o.H2OFrame(list(zip(*doubled_weights)))
    h2o_doubled_weights.set_names(["weights"])
    print(h2o_cars_data.head())
    h2o_data_doubled_weights = h2o_cars_data.cbind(h2o_doubled_weights)
    print(h2o.ls())

    # Build the "physically doubled" data: the first entry of the exported
    # list is popped off as the column names, then every entry whose weight
    # is 2 is appended a second time.
    doubled_data = h2o.as_list(h2o_cars_data, use_pandas=False)
    colnames = doubled_data.pop(0)
    for idx, w in enumerate(doubled_weights[0]):
        if w == 2:
            doubled_data.append(doubled_data[idx])
    h2o_data_doubled = h2o.H2OFrame(doubled_data)
    h2o_data_doubled.set_names(list(colnames))

    # Re-apply the factor conversions on both frames being compared.
    h2o_data_doubled["economy_20mpg"] = h2o_data_doubled["economy_20mpg"].asfactor()
    h2o_data_doubled["cylinders"] = h2o_data_doubled["cylinders"].asfactor()
    h2o_data_doubled_weights["economy_20mpg"] = h2o_data_doubled_weights["economy_20mpg"].asfactor()
    h2o_data_doubled_weights["cylinders"] = h2o_data_doubled_weights["cylinders"].asfactor()

    print("Checking that doubling some weights is equivalent to doubling those observations:")
    print()
    check_same(h2o_data_doubled, h2o_data_doubled_weights, 1)
def execute(self, name, x, y, training_frame, validation_frame, test_frame, subset_coef):
    """Train and score one model per hyper-parameter combination.

    For every combination produced from ``self.params_grid`` (or a single
    run of ``self.base_model()`` with no arguments when the grid is ``None``
    or empty) the model is built, trained on ``training_frame`` with
    ``validation_frame``, and scored on ``test_frame``. Each run yields one
    result row that is handed to ``persist`` and collected.

    Row layout: ``config.cluster``, ``config.nthreads``, ``name``,
    ``subset_coef``, ``self.name``, ``str(params)``, the init / train /
    metrics / total times in seconds, then for each of training, validation
    and test data: accuracy (1 - error), error, and the metrics listed in
    ``metric_names`` in that order.

    After each run every object listed by ``h2o.ls()`` whose key does not
    match the keep-pattern (the three input frame ids, ``.*\\.hex`` or
    ``py_.*``) is removed from the cluster.

    Args:
        name: label stored in each result row.
        x: passed to ``model.train`` as the first positional argument.
        y: passed to ``model.train`` as the second positional argument.
        training_frame: frame used for training.
        validation_frame: frame used for validation.
        test_frame: frame passed to ``model.model_performance``.
        subset_coef: value stored in each result row.

    Returns:
        A pandas DataFrame with one row per run and columns ``config.Names``.
    """
    # Decide on the fallback before building the grid, so ParameterGrid is
    # never handed a None / empty specification.
    if self.params_grid is None or len(self.params_grid) == 0:
        params = ["default"]
    else:
        params = grid.ParameterGrid(self.params_grid)

    # Metric accessors recorded for each data set, in output-column order.
    metric_names = (
        "F1", "fnr", "fpr", "tnr", "tpr", "precision", "recall",
        "sensitivity", "specificity", "aic", "auc", "logloss",
        "mean_residual_deviance", "mse", "null_degrees_of_freedom",
        "null_deviance", "r2", "residual_degrees_of_freedom",
        "residual_deviance",
    )

    def is_missing(value):
        # NaN numbers and the literal string "NaN" are stored as None.
        if isinstance(value, numbers.Number) and np.isnan(value):
            return True
        return value == "NaN"

    results = []
    dt = datetime.datetime
    keep_frames = re.compile("|".join([
        training_frame.frame_id, validation_frame.frame_id, test_frame.frame_id
    ]) + "|.*\\.hex|py_.*")

    for p in params:
        row = [config.cluster, config.nthreads, name, subset_coef, self.name, str(p)]

        # Initialize the model
        init_time = dt.now()
        if p == "default":
            model = self.base_model()
        else:
            model = self.base_model(**p)
        init_time = dt.now() - init_time

        # Train the model
        train_time = dt.now()
        model.train(x, y, training_frame=training_frame, validation_frame=validation_frame)
        train_time = dt.now() - train_time

        # Model metrics
        metrics_time = dt.now()
        metrics = model.model_performance(test_data=test_frame)
        err_tr = get_classification_error(model, training_frame, "train")
        err_va = get_classification_error(model, validation_frame, "valid")
        err_te = get_classification_error(metrics, test_frame, "test")
        metrics_time = dt.now() - metrics_time

        # results
        row.append(init_time.total_seconds())
        row.append(train_time.total_seconds())
        row.append(metrics_time.total_seconds())
        row.append((init_time + train_time + metrics_time).total_seconds())

        # Each metric is still fetched inside a zero-argument callable passed
        # to appendVal, exactly as before; the name is bound as a default so
        # every callable refers to its own metric.

        # on training data
        appendVal(row, lambda: 1 - err_tr)
        appendVal(row, lambda: err_tr)
        for metric in metric_names:
            appendVal(row, lambda metric=metric: getattr(model, metric)())

        # on validation data
        appendVal(row, lambda: 1 - err_va)
        appendVal(row, lambda: err_va)
        for metric in metric_names:
            appendVal(row, lambda metric=metric: getattr(model, metric)(valid=True))

        # on test data
        appendVal(row, lambda: 1 - err_te)
        appendVal(row, lambda: err_te)
        for metric in metric_names:
            appendVal(row, lambda metric=metric: getattr(metrics, metric)())

        # A real list (not a one-shot map iterator) so that both persist()
        # and the final DataFrame see the values.
        row = [None if is_missing(value) else value for value in row]
        persist(row)
        results.append(row)

        # Drop everything this run left on the cluster except kept frames.
        # `.values` replaces DataFrame.as_matrix(), which newer pandas lacks.
        for [frame] in h2o.ls().values:
            if not keep_frames.match(frame):
                h2o.remove(frame)

    df = pd.DataFrame(results, columns=config.Names)
    return df
def ls_test(ip, port):
    """Import the iris CSV and then call h2o.ls().

    ``ip`` and ``port`` are accepted but not used by the body.
    """
    iris_path = h2o.locate("smalldata/iris/iris.csv")
    # Keep a reference to the imported frame while the listing is requested.
    iris_frame = h2o.import_file(path=iris_path)
    h2o.ls()
def weights_check():
    """Check that GBM observation weights behave like row replication.

    Three scenarios are run on the cars data set; each compares models
    trained on a frame without weights against models trained on a frame
    that carries a "weights" column:

    * a uniform weight on every row vs. no weights,
    * zero weights vs. dropping the zero-weighted rows,
    * weight 2 vs. physically appending a copy of those rows.

    Raises:
        AssertionError: if any paired metric differs by more than the
            relative tolerance used in ``check_same``.
    """

    def check_same(data1, data2, min_rows_scale):
        """Train regression, binomial and multinomial GBM pairs and compare.

        Args:
            data1: frame used for the unweighted models.
            data2: frame with a "weights" column, used for the weighted models.
            min_rows_scale: multiplier applied to ``min_rows=5`` for the
                weighted models.

        Raises:
            AssertionError: if the MSEs differ by more than 1e-6 (relative)
                or the AUCs by more than 3e-4 (relative).
        """
        predictors = ["displacement", "power", "weight", "acceleration", "year"]
        predictors_w = predictors + ["weights"]

        gbm1_regression = h2o.gbm(x=data1[predictors],
                                  y="economy",
                                  training_frame=data1,
                                  min_rows=5,
                                  ntrees=5,
                                  max_depth=5)
        # NOTE(review): this call passes frame slices for y/weights_column
        # while the other weighted calls pass the column name plus
        # training_frame -- left exactly as found.
        gbm2_regression = h2o.gbm(x=data2[predictors_w],
                                  y=data2["economy"],
                                  min_rows=5*min_rows_scale,
                                  weights_column=data2["weights"],
                                  ntrees=5,
                                  max_depth=5)
        gbm1_binomial = h2o.gbm(x=data1[predictors],
                                y=data1["economy_20mpg"],
                                min_rows=5,
                                distribution="bernoulli",
                                ntrees=5,
                                max_depth=5)
        gbm2_binomial = h2o.gbm(x=data2[predictors_w],
                                y=data2["economy_20mpg"],
                                weights_column="weights",
                                training_frame=data2,
                                min_rows=5*min_rows_scale,
                                distribution="bernoulli",
                                ntrees=5,
                                max_depth=5)
        gbm1_multinomial = h2o.gbm(x=data1[predictors],
                                   y=data1["cylinders"],
                                   min_rows=5,
                                   distribution="multinomial",
                                   ntrees=5,
                                   max_depth=5)
        gbm2_multinomial = h2o.gbm(x=data2[predictors_w],
                                   y=data2["cylinders"],
                                   weights_column="weights",
                                   training_frame=data2,
                                   min_rows=5*min_rows_scale,
                                   distribution="multinomial",
                                   ntrees=5,
                                   max_depth=5)

        reg1_mse = gbm1_regression.mse()
        reg2_mse = gbm2_regression.mse()
        bin1_auc = gbm1_binomial.auc()
        bin2_auc = gbm2_binomial.auc()
        mul1_mse = gbm1_multinomial.mse()
        mul2_mse = gbm2_multinomial.mse()

        print("MSE (regresson) no weights vs. weights: {0}, {1}".format(reg1_mse, reg2_mse))
        print("AUC (binomial) no weights vs. weights: {0}, {1}".format(bin1_auc, bin2_auc))
        print("MSE (multinomial) no weights vs. weights: {0}, {1}".format(mul1_mse, mul2_mse))

        assert abs(reg1_mse - reg2_mse) < 1e-6 * reg1_mse, \
            "Expected mse's to be the same, but got {0}, and {1}".format(reg1_mse, reg2_mse)
        assert abs(bin1_auc - bin2_auc) < 3e-4 * bin1_auc, \
            "Expected auc's to be the same, but got {0}, and {1}".format(bin1_auc, bin2_auc)
        # Compare the unweighted model against the *weighted* one; comparing
        # mul1_mse with itself would make this check vacuous.
        assert abs(mul1_mse - mul2_mse) < 1e-6 * mul1_mse, \
            "Expected mse's to be the same, but got {0}, and {1}".format(mul1_mse, mul2_mse)

    h2o_cars_data = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    h2o_cars_data["economy_20mpg"] = h2o_cars_data["economy_20mpg"].asfactor()
    h2o_cars_data["cylinders"] = h2o_cars_data["cylinders"].asfactor()

    # uniform weights same as no weights
    random.seed(2222)
    weight = 3  # random.randint(1,10) FIXME: RNG changed from py2->3, so force in the rando behavior from 2
    uniform_weights = [[weight]*406]  # one inner list holding 406 entries
    h2o_uniform_weights = h2o.H2OFrame(uniform_weights)
    h2o_uniform_weights.set_names(["weights"])
    h2o_data_uniform_weights = h2o_cars_data.cbind(h2o_uniform_weights)

    print("Checking that using uniform weights is equivalent to no weights:")
    print()
    # min_rows is scaled by the same factor as the weights.
    check_same(h2o_cars_data, h2o_data_uniform_weights, weight)

    # zero weights same as removed observations
    # Hard-coded; originally generated with:
    #   [[0 if random.randint(0,1) else 1 for r in range(406)]]
    zero_weights = [[
        1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
        1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ]]
    print("ZERO WEIGHTS: " + str(zero_weights))
    h2o_zero_weights = h2o.H2OFrame(zero_weights)
    h2o_zero_weights.set_names(["weights"])
    h2o_data_zero_weights = h2o_cars_data.cbind(h2o_zero_weights)
    h2o_data_zeros_removed = h2o_cars_data[h2o_zero_weights["weights"] == 1]

    print("Checking that using some zero weights is equivalent to removing those observations:")
    print()
    check_same(h2o_data_zeros_removed, h2o_data_zero_weights, 1)

    # doubled weights same as doubled observations
    # Hard-coded; originally generated with:
    #   [[1 if random.randint(0,1) else 2 for r in range(406)]]
    doubled_weights = [[
        2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2,
        2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 1, 1, 1, 1
    ]]
    print("DOUBLED WEIGHTS: " + str(doubled_weights))
    h2o_doubled_weights = h2o.H2OFrame(doubled_weights)
    h2o_doubled_weights.set_names(["weights"])
    print(h2o_cars_data.head())
    h2o_data_doubled_weights = h2o_cars_data.cbind(h2o_doubled_weights)
    print(h2o.ls())

    # Build the "physically doubled" data: transpose the exported list, pop
    # the first entry off as the column names, append every entry whose
    # weight is 2 a second time, then transpose back before uploading.
    doubled_data = h2o.as_list(h2o_cars_data, use_pandas=False)
    doubled_data = list(zip(*doubled_data))
    colnames = doubled_data.pop(0)
    for idx, w in enumerate(doubled_weights[0]):
        if w == 2:
            doubled_data.append(doubled_data[idx])
    doubled_data = list(zip(*doubled_data))
    h2o_data_doubled = h2o.H2OFrame(doubled_data)
    h2o_data_doubled.set_names(list(colnames))

    # Re-apply the factor conversions on both frames being compared.
    h2o_data_doubled["economy_20mpg"] = h2o_data_doubled["economy_20mpg"].asfactor()
    h2o_data_doubled["cylinders"] = h2o_data_doubled["cylinders"].asfactor()
    h2o_data_doubled_weights["economy_20mpg"] = h2o_data_doubled_weights["economy_20mpg"].asfactor()
    h2o_data_doubled_weights["cylinders"] = h2o_data_doubled_weights["cylinders"].asfactor()

    print("Checking that doubling some weights is equivalent to doubling those observations:")
    print()
    check_same(h2o_data_doubled, h2o_data_doubled_weights, 1)