def expr_math_ops(ip, port):
    """Cross-check H2OFrame element-wise math methods against NumPy/SciPy.

    Connects to H2O at ``ip``/``port``, builds four random 10x10 data sets
    (one per value range), applies the same arithmetic shift to the H2O and
    NumPy copies, and compares each H2O math method with its NumPy
    counterpart through ``h2o.np_comparison_check``. The gamma-family
    methods are spot-checked on cell [5, 5] against ``math`` and
    ``scipy.special`` with a relative tolerance of 1e-6.
    """
    # Connect to h2o
    h2o.init(ip, port)

    def random_grid(low, high):
        # 10 rows of 10 uniformly distributed values in [low, high].
        return [[random.uniform(low, high) for _ in range(10)]
                for _ in range(10)]

    # One data set per value range; the generation order is kept so a seeded
    # run draws the same numbers as before.
    unbounded_rows = random_grid(-10, 10)    # sin, cos, tan, atan, sinh, cosh, tanh, asinh
    unit_rows = random_grid(-1, 1)           # asin, acos, atanh
    above_one_rows = random_grid(1, 10)      # acosh
    negative_rows = random_grid(-100000, 0)  # abs
    all_rows = (unbounded_rows, unit_rows, above_one_rows, negative_rows)

    # Keep the uploaded frames referenced for the whole function.
    h2o_raw = [h2o.H2OFrame(python_obj=rows) for rows in all_rows]
    np_raw = [np.array(rows) for rows in all_rows]

    # Apply identical arithmetic on both sides before comparing.
    h2o_data1 = h2o_raw[0] + 2
    h2o_data2 = h2o_raw[1] / 1.01
    h2o_data3 = h2o_raw[2] * 1.5
    h2o_data4 = h2o_raw[3] - 1.5

    np_data1 = np_raw[0] + 2
    np_data2 = np_raw[1] / 1.01
    np_data3 = np_raw[2] * 1.5
    np_data4 = np_raw[3] - 1.5

    # (H2O frame, NumPy array, H2O method name, NumPy reference function)
    trig_checks = [
        (h2o_data1, np_data1, "cos", np.cos),
        (h2o_data1, np_data1, "sin", np.sin),
        (h2o_data1, np_data1, "tan", np.tan),
        (h2o_data2, np_data2, "acos", np.arccos),
        (h2o_data2, np_data2, "asin", np.arcsin),
        (h2o_data1, np_data1, "atan", np.arctan),
        (h2o_data1, np_data1, "cosh", np.cosh),
        (h2o_data1, np_data1, "sinh", np.sinh),
        (h2o_data1, np_data1, "tanh", np.tanh),
        (h2o_data3, np_data3, "acosh", np.arccosh),
        (h2o_data1, np_data1, "asinh", np.arcsinh),
        (h2o_data2, np_data2, "atanh", np.arctanh),
    ]
    for frame, array, method, np_func in trig_checks:
        h2o.np_comparison_check(getattr(frame, method)(), np_func(array), 10)

    # The *pi variants take their argument in units of pi, so the H2O input is
    # divided by pi and compared with the plain NumPy function.
    for method, np_func in (("cospi", np.cos), ("sinpi", np.sin),
                            ("tanpi", np.tan)):
        h2o.np_comparison_check(getattr(h2o_data2 / math.pi, method)(),
                                np_func(np_data2), 10)

    other_checks = [
        (h2o_data4, np_data4, "abs", np.fabs),
        (h2o_data2, np_data2, "sign", np.sign),
        (h2o_data3, np_data3, "sqrt", np.sqrt),
        (h2o_data3, np_data3, "trunc", np.trunc),
        (h2o_data3, np_data3, "ceil", np.ceil),
        (h2o_data3, np_data3, "floor", np.floor),
        (h2o_data3, np_data3, "log", np.log),
        (h2o_data3, np_data3, "log10", np.log10),
        (h2o_data3, np_data3, "log1p", np.log1p),
        (h2o_data3, np_data3, "log2", np.log2),
        (h2o_data3, np_data3, "exp", np.exp),
        (h2o_data3, np_data3, "expm1", np.expm1),
    ]
    for frame, array, method, np_func in other_checks:
        h2o.np_comparison_check(getattr(frame, method)(), np_func(array), 10)

    # Gamma family: (H2O method name, reference function, failure message).
    gamma_checks = [
        ("gamma", math.gamma,
         "check unsuccessful! h2o computed {0} and math computed {1}. "
         "expected equal gamma values between h2o and math"),
        ("lgamma", math.lgamma,
         "check unsuccessful! h2o computed {0} and math computed {1}. "
         "expected equal lgamma values between h2o and math"),
        ("digamma", lambda value: scipy.special.polygamma(0, value),
         "check unsuccessful! h2o computed {0} and math computed {1}. "
         "expected equal digamma values between h2o and math"),
        ("trigamma", lambda value: float(scipy.special.polygamma(1, value)),
         "check unsuccessful! h2o computed {0} and math computed {1}. "
         "expected equal trigamma values between h2o and math"),
    ]
    for method, reference, message in gamma_checks:
        h2o_val = getattr(h2o_data3, method)()[5, 5]
        num_val = reference(h2o_data3[5, 5])
        assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \
            message.format(h2o_val, num_val)
# --- Load data sets ---------------------------------------------------------
pd_train = pd.read_csv('na_filled.csv')
pd_labels = pd.read_csv('dataset/dengue_labels_train.csv')
pd_test = pd.read_csv('dataset/dengue_features_test.csv')
pd_submit = pd.read_csv('dataset/submission_format.csv')

# --- Identify columns -------------------------------------------------------
# Every column of the test set is used as a predictor.
response_column = 'total_cases'
training_columns = list(pd_test.columns)

# --- Attach the labels to the training data ----------------------------------
pd_train[response_column] = pd_labels[response_column]

# --- Create the H2O frames and export a CSV copy of each ---------------------
hd_train = h2o.H2OFrame(pd_train)
hd_train.set_names(list(pd_train.columns))
h2o.export_file(frame=hd_train, path='h2o_train.csv', force=True)

hd_test = h2o.H2OFrame(pd_test)
hd_test.set_names(list(pd_test.columns))
h2o.export_file(frame=hd_test, path='h2o_test.csv', force=True)

# --- Define the machine learning model ---------------------------------------
# Alternative kept for reference:
# model = H2ODeepLearningEstimator(epochs=100, hidden=[128, 128, 128], nfolds=10)
model = H2ORandomForestEstimator(
    ntrees=100,
    max_depth=20,
    binomial_double_trees=True,
)

# --- Train the model ----------------------------------------------------------
model.train(
    x=training_columns,
    y=response_column,
    training_frame=hd_train,
)
print(len(anomaly_series))

# Remove anomalies: drop the rows of pData selected by anomaly_series.
df = pData.drop(pData.index[anomaly_series])

# Feature engineering (same options for the training and the testing data).
data_frame = ProcessData.trainDataToFrame(
    df,
    moving_k_closest_average=True,
    standard_deviation=True,
    moving_median=True,
)
testing_frame = ProcessData.testData(
    moving_k_closest_average=True,
    standard_deviation=True,
    moving_median=True,
)

# Create the H2O frames and restore the pandas column names on each.
hData = h2o.H2OFrame(data_frame)
hData.set_names(list(data_frame.columns))

hTesting = h2o.H2OFrame(testing_frame)
hTesting.set_names(list(testing_frame.columns))

# Split the data into training and validation parts and export both as CSV.
hTrain, hValidate = hData.split_frame(ratios=[0.8])
h2o.export_file(hTrain, "hTrainMy.csv", force=True)
h2o.export_file(hValidate, "hValidateMy.csv", force=True)

# Predictor columns: every column of pData except identifiers and the target.
# NOTE(review): this list comes from pData, not from the engineered
# data_frame - confirm the engineered columns are meant to be left out.
training_columns = list(pData.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    """Train an H2O model on ``X``/``y`` and store the serialized result.

    The data is uploaded to the H2O instance on ``config.h2o_recipes_port``,
    a model built by ``self.make_instance`` is trained, saved to a temporary
    file, read back as raw bytes and handed to ``self.set_model_properties``
    together with the feature names and their variable importances.

    Args:
        X: feature data; anything ``dt.Frame`` accepts.
        y: target values, uploaded as a single column named ``self.target``.
        sample_weight: optional per-row weights. For ``H2ONBModel`` they are
            reduced to 0/1 indicators first.
        eval_set: optional sequence whose first item is a
            ``(features, target)`` pair used as validation frame; the
            features must offer ``to_pandas()``.
        sample_weight_eval_set: optional sequence whose first item holds the
            validation weights. When training weights are given but this is
            ``None``, all-ones weights are used.
        **kwargs: accepted but not used here.

    Raises:
        KeyError: if ``self.params`` has no ``'max_runtime_secs'`` entry and
            ``self`` is not an ``H2OAutoMLModel``.
    """
    X = dt.Frame(X)

    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model_path = None

    if isinstance(self, H2ONBModel):
        # NB can only handle weights of 0 / 1
        if sample_weight is not None:
            sample_weight = (sample_weight != 0).astype(int)
        if sample_weight_eval_set is not None:
            sample_weight_eval_set = [(sample_weight_eval_set[0] != 0).astype(int)]

    target_type = 'categorical' if self.num_classes >= 2 else 'numeric'

    train_X = h2o.H2OFrame(X.to_pandas())
    self.col_types = train_X.types
    train_y = h2o.H2OFrame(y, column_names=[self.target], column_types=[target_type])
    train_frame = train_X.cbind(train_y)
    train_w = None
    if sample_weight is not None:
        train_w = h2o.H2OFrame(sample_weight, column_names=[self.weight], column_types=['numeric'])
        train_frame = train_frame.cbind(train_w)

    valid_frame = None
    valid_X = None
    valid_y = None
    valid_w = None
    model = None
    if eval_set is not None:
        valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
        valid_y = h2o.H2OFrame(eval_set[0][1], column_names=[self.target], column_types=[target_type])
        valid_frame = valid_X.cbind(valid_y)
        if sample_weight is not None:
            if sample_weight_eval_set is None:
                sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
            valid_w = h2o.H2OFrame(sample_weight_eval_set[0], column_names=[self.weight],
                                   column_types=['numeric'])
            valid_frame = valid_frame.cbind(valid_w)

    try:
        train_kwargs = dict()
        params = copy.deepcopy(self.params)
        if not isinstance(self, H2OAutoMLModel):
            # AutoML needs max_runtime_secs in initializer, all others in train() method
            max_runtime_secs = params.pop('max_runtime_secs')
            train_kwargs = dict(max_runtime_secs=max_runtime_secs)
        if valid_frame is not None:
            train_kwargs['validation_frame'] = valid_frame
        if sample_weight is not None:
            train_kwargs['weights_column'] = self.weight
        model = self.make_instance(**params)

        # Don't ever use the offset column as a feature
        offset_col = None  # if no column is called offset we will pass "None" and not use this feature
        cols_to_train = []  # list of all non-offset columns
        for col in list(train_X.names):
            if col.lower() == "offset":
                offset_col = col
            else:
                cols_to_train.append(col)

        orig_cols = cols_to_train  # not training on offset

        # Models that can use an offset column
        # NOTE(review): `model` comes from self.make_instance(); confirm it can
        # actually be an instance of these classes, otherwise this branch is
        # never taken.
        if isinstance(model, (H2OGBMModel, H2ODLModel, H2OGLMModel)):
            model.train(x=cols_to_train, y=self.target, training_frame=train_frame,
                        offset_column=offset_col, **train_kwargs)
        else:
            # NOTE(review): this branch passes every column of train_X,
            # including an "offset" column if one exists - confirm intended.
            model.train(x=train_X.names, y=self.target, training_frame=train_frame, **train_kwargs)

        if isinstance(model, H2OAutoML):
            model = model.leader
        self.id = model.model_id
        model_path = os.path.join(user_dir(), "h2o_model." + str(uuid.uuid4()))
        model_path = h2o.save_model(model=model, path=model_path)
        with open(model_path, "rb") as f:
            raw_model_bytes = f.read()

    finally:
        if model_path is not None:
            remove(model_path)
        # Release everything this call put on the H2O cluster. The weight
        # frames are included so they do not pile up across fits.
        for xx in [train_frame, train_X, train_y, model, valid_frame, valid_X, valid_y,
                   train_w, valid_w]:
            if xx is not None:
                if isinstance(xx, H2OAutoML):
                    h2o.remove(xx.project_name)
                else:
                    h2o.remove(xx)

    df_varimp = model.varimp(True)
    if df_varimp is None:
        varimp = np.ones(len(orig_cols))
    else:
        df_varimp.index = df_varimp['variable']
        df_varimp = df_varimp.iloc[:, 1]  # relative importance
        for missing in [x for x in orig_cols if x not in list(df_varimp.index)]:
            # h2o3 doesn't handle raw strings all the time, can hit:
            # KeyError: "None of [Index(['0_Str:secret_ChangeTemp'], dtype='object', name='variable')] are in the [index]"
            df_varimp[missing] = 0
        varimp = df_varimp[orig_cols].values  # order by fitted features
    varimp = np.nan_to_num(varimp)

    self.set_model_properties(model=raw_model_bytes,
                              features=orig_cols,
                              importances=varimp,
                              iterations=self.get_iterations(model))
import h2o from h2o.estimators import H2ODeepLearningEstimator from dataprocessor import ProcessData from h2o.estimators import H2ORandomForestEstimator from h2o.grid import H2OGridSearch # Initialize server h2o.init() data = ProcessData.trainData(moving_k_closest_average=True, standard_deviation=True, probability_distribution=True) hData = h2o.H2OFrame(data) hData.set_names(list(data.columns)) training_columns = list(data.columns) training_columns.remove('RUL') training_columns.remove('UnitNumber') training_columns.remove('Time') # hyper_parameters = {'ntrees': [10, 50], 'max_depth': [20, 10]} # grid_search = H2OGridSearch(H2ORandomForestEstimator, hyper_params=hyper_parameters) # grid_search.train(x=training_columns, y='RUL', training_frame=hData) # grid_search.show() # models = grid_search.sort_by("mse") # print models hyper_parameters = {
"J": 0, "SI1": 3, "VS2": 4, "SI2": 2, "VS1": 5, "VVS2": 6, "VVS1": 7, "IF": 8, "I1": 1 } train.replace(ord_cut, inplace=True) test.replace(ord_cut, inplace=True) # Using h2o syntax, we import the datasets as h2o train = h2o.H2OFrame(train) test = h2o.H2OFrame(test) # Set train-test dataframes y = "price" x = train.columns x.remove(y) train_final, valid = train.split_frame( ratios=[0.8]) # We make the split of training and validation modelo = H2OGradientBoostingEstimator( ntrees=440, learn_rate=0.5531490180631663, max_depth=10, #sample_rate: 0.6117256495829282,
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    """Train an H2O model on ``X``/``y`` and store the serialized result.

    The target is first passed through ``self.make_y_nonnegative``. The data
    is uploaded to the H2O instance on ``config.h2o_recipes_port``, a model
    built by ``self.make_instance(**self.params)`` is trained, saved to a
    temporary file, read back as raw bytes and handed to
    ``self.set_model_properties`` with the feature names and importances.

    Args:
        X: feature data; anything ``dt.Frame`` accepts.
        y: target values, uploaded as a single column named ``self.target``.
        sample_weight: optional per-row training weights.
        eval_set: optional sequence whose first item is a
            ``(features, target)`` pair used as validation frame; the
            features must offer ``to_pandas()``.
        sample_weight_eval_set: optional sequence whose first item holds the
            validation weights. When training weights are given but this is
            ``None``, all-ones weights are used.
        **kwargs: accepted but not used here.
    """
    X = dt.Frame(X)

    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model_path = None

    orig_cols = list(X.names)
    y = self.make_y_nonnegative(y)
    target_type = 'categorical' if self.num_classes >= 2 else 'numeric'

    train_X = h2o.H2OFrame(X.to_pandas())
    self.col_types = train_X.types
    train_y = h2o.H2OFrame(y, column_names=[self.target], column_types=[target_type])
    train_frame = train_X.cbind(train_y)
    train_w = None
    if sample_weight is not None:
        train_w = h2o.H2OFrame(sample_weight, column_names=[self.weight], column_types=['numeric'])
        train_frame = train_frame.cbind(train_w)

    valid_frame = None
    valid_X = None
    valid_y = None
    valid_w = None
    model = None
    if eval_set is not None:
        valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
        valid_y = eval_set[0][1]
        valid_y = self.make_y_nonnegative(valid_y)
        valid_y = h2o.H2OFrame(valid_y, column_names=[self.target], column_types=[target_type])
        valid_frame = valid_X.cbind(valid_y)
        if sample_weight is not None:
            if sample_weight_eval_set is None:
                sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
            valid_w = h2o.H2OFrame(sample_weight_eval_set[0], column_names=[self.weight],
                                   column_types=['numeric'])
            valid_frame = valid_frame.cbind(valid_w)

    try:
        # max_runtime_secs is passed to train(); 0 is used when it is not set.
        max_runtime_secs = self.params.get('max_runtime_secs', 0)
        train_kwargs = dict(max_runtime_secs=max_runtime_secs)
        if valid_frame is not None:
            train_kwargs['validation_frame'] = valid_frame
        if sample_weight is not None:
            train_kwargs['weights_column'] = self.weight

        model = self.make_instance(**self.params)
        model.train(x=train_X.names, y=self.target, training_frame=train_frame, **train_kwargs)
        self.id = model.model_id

        model_path = os.path.join(user_dir(), "h2o_model." + str(uuid.uuid4()))
        model_path = h2o.save_model(model=model, path=model_path)
        with open(model_path, "rb") as f:
            raw_model_bytes = f.read()

    finally:
        if model_path is not None:
            remove(model_path)
        # Release everything this call put on the H2O cluster. The weight
        # frames are included so they do not pile up across fits.
        for xx in [train_frame, train_X, train_y, model, valid_frame, valid_X, valid_y,
                   train_w, valid_w]:
            if xx is not None:
                h2o.remove(xx)

    df_varimp = model.varimp(True)
    if df_varimp is None:
        varimp = np.ones(len(orig_cols))
    else:
        df_varimp.index = df_varimp['variable']
        df_varimp = df_varimp.iloc[:, 1]  # relative importance
        for missing in [x for x in orig_cols if x not in list(df_varimp.index)]:
            # h2o3 doesn't handle raw strings all the time, can hit:
            # KeyError: "None of [Index(['0_Str:secret_ChangeTemp'], dtype='object', name='variable')] are in the [index]"
            df_varimp[missing] = 0
        varimp = df_varimp[orig_cols].values  # order by fitted features
    varimp = np.nan_to_num(varimp)

    self.set_model_properties(model=raw_model_bytes,
                              features=orig_cols,
                              importances=varimp,
                              iterations=self.get_iterations(model))
# Make the local AFL helper module importable before importing it.
sys.path.insert(1, 'J:\\AFL')
import afl_functions as afl
import pandas as pd
import h2o
import math
from h2o.automl import H2OAutoML

# Export the current fixture.
# check to make sure this file is for the round required
fixture = afl.get_fixtureAFL()
fixture.to_csv('J:\\AFL\\ToScore.csv', index=False)

# Start (or connect to) H2O and load the previously saved model.
h2o.init()
model_path = 'J:\\AFL\\ModelGiving422018trainedON20112017\\StackedEnsemble_AllModels_0_AutoML_20181106_085551'
saved_model = h2o.load_model(model_path)

# Build the scoring data and upload it to H2O.
scoring_data = afl.get_data(
    season_from=2019,
    season_to=2019,
    proxy=False,
    train_mode=False,
)
hf_score = h2o.H2OFrame(scoring_data)

# Score, convert the predictions to pandas, and write both outputs as CSV.
prediction = saved_model.predict(hf_score)
prediction = prediction.as_data_frame(use_pandas=True)
prediction.to_csv('J:\\AFL\\PredictionRun.csv')
scoring_data.to_csv('J:\\AFL\\temp.csv')
def weights_check(ip, port):
    """Check that GBM observation weights behave like row replication.

    Three scenarios are run on the cars data set, each comparing a model
    trained without weights to one trained with a ``weights`` column:

    * uniform weights versus no weights (``min_rows`` scaled by the weight),
    * zero weights versus removing those rows,
    * doubled weights versus duplicating those rows.

    For each scenario a regression, a binomial and a multinomial GBM pair is
    trained and their MSE / AUC must agree to a relative tolerance of 1e-6.

    Args:
        ip: not referenced inside this function.
        port: not referenced inside this function.
    """

    def check_same(data1, data2, min_rows_scale):
        """Train unweighted (data1) and weighted (data2) GBMs and compare metrics."""
        predictors = ["displacement", "power", "weight", "acceleration", "year"]
        predictors_w = predictors + ["weights"]

        gbm1_regression = h2o.gbm(x=data1[predictors],
                                  y="economy",
                                  training_frame=data1,
                                  min_rows=5,
                                  ntrees=5,
                                  max_depth=5)
        # NOTE(review): unlike the other weighted models this call passes the
        # weights as a frame column and has no training_frame - confirm that
        # is intended for the h2o version in use.
        gbm2_regression = h2o.gbm(x=data2[predictors_w],
                                  y=data2["economy"],
                                  min_rows=5 * min_rows_scale,
                                  weights_column=data2["weights"],
                                  ntrees=5,
                                  max_depth=5)
        gbm1_binomial = h2o.gbm(x=data1[predictors],
                                y=data1["economy_20mpg"],
                                min_rows=5,
                                distribution="bernoulli",
                                ntrees=5,
                                max_depth=5)
        gbm2_binomial = h2o.gbm(x=data2[predictors_w],
                                y=data2["economy_20mpg"],
                                weights_column="weights",
                                training_frame=data2,
                                min_rows=5 * min_rows_scale,
                                distribution="bernoulli",
                                ntrees=5,
                                max_depth=5)
        gbm1_multinomial = h2o.gbm(x=data1[predictors],
                                   y=data1["cylinders"],
                                   min_rows=5,
                                   distribution="multinomial",
                                   ntrees=5,
                                   max_depth=5)
        gbm2_multinomial = h2o.gbm(x=data2[predictors_w],
                                   y=data2["cylinders"],
                                   weights_column="weights",
                                   training_frame=data2,
                                   min_rows=5 * min_rows_scale,
                                   distribution="multinomial",
                                   ntrees=5,
                                   max_depth=5)

        reg1_mse = gbm1_regression.mse()
        reg2_mse = gbm2_regression.mse()
        bin1_auc = gbm1_binomial.auc()
        bin2_auc = gbm2_binomial.auc()
        mul1_mse = gbm1_multinomial.mse()
        mul2_mse = gbm2_multinomial.mse()

        print("MSE (regresson)   no weights vs. weights: {0}, {1}".format(reg1_mse, reg2_mse))
        print("AUC (binomial)    no weights vs. weights: {0}, {1}".format(bin1_auc, bin2_auc))
        print("MSE (multinomial) no weights vs. weights: {0}, {1}".format(mul1_mse, mul2_mse))

        assert abs(reg1_mse - reg2_mse) < 1e-6 * reg1_mse, \
            "Expected mse's to be the same, but got {0}, and {1}".format(reg1_mse, reg2_mse)
        assert abs(bin1_auc - bin2_auc) < 1e-6 * bin1_auc, \
            "Expected auc's to be the same, but got {0}, and {1}".format(bin1_auc, bin2_auc)
        # Compare the unweighted against the weighted multinomial model; the
        # check used to subtract mul1_mse from itself and so could never fail.
        assert abs(mul1_mse - mul2_mse) < 1e-6 * mul1_mse, \
            "Expected mse's to be the same, but got {0}, and {1}".format(mul1_mse, mul2_mse)

    h2o_cars_data = h2o.import_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    h2o_cars_data["economy_20mpg"] = h2o_cars_data["economy_20mpg"].asfactor()
    h2o_cars_data["cylinders"] = h2o_cars_data["cylinders"].asfactor()

    # uniform weights same as no weights
    random.seed(2222)
    weight = random.randint(1, 10)
    uniform_weights = [[weight] for r in range(406)]
    h2o_uniform_weights = h2o.H2OFrame(python_obj=uniform_weights)
    h2o_uniform_weights.setNames(["weights"])
    h2o_data_uniform_weights = h2o_cars_data.cbind(h2o_uniform_weights)

    print("Checking that using uniform weights is equivalent to no weights:")
    print(check_same(h2o_cars_data, h2o_data_uniform_weights, weight))

    # zero weights same as removed observations
    zero_weights = [[0] if random.randint(0, 1) else [1] for r in range(406)]
    h2o_zero_weights = h2o.H2OFrame(python_obj=zero_weights)
    h2o_zero_weights.setNames(["weights"])
    h2o_data_zero_weights = h2o_cars_data.cbind(h2o_zero_weights)
    h2o_data_zeros_removed = h2o_cars_data[h2o_zero_weights["weights"] == 1]

    print("Checking that using some zero weights is equivalent to removing those observations:")
    print(check_same(h2o_data_zeros_removed, h2o_data_zero_weights, 1))

    # doubled weights same as doubled observations
    doubled_weights = [[1] if random.randint(0, 1) else [2] for r in range(406)]
    h2o_doubled_weights = h2o.H2OFrame(python_obj=doubled_weights)
    h2o_doubled_weights.setNames(["weights"])
    h2o_data_doubled_weights = h2o_cars_data.cbind(h2o_doubled_weights)

    doubled_data = h2o.as_list(h2o_cars_data, use_pandas=False)
    colnames = doubled_data.pop(0)  # first row holds the column names
    for idx, w in enumerate(doubled_weights):
        if w[0] == 2:
            doubled_data.append(doubled_data[idx])
    h2o_data_doubled = h2o.H2OFrame(python_obj=doubled_data)
    h2o_data_doubled.setNames(colnames)

    h2o_data_doubled["economy_20mpg"] = h2o_data_doubled["economy_20mpg"].asfactor()
    h2o_data_doubled["cylinders"] = h2o_data_doubled["cylinders"].asfactor()
    h2o_data_doubled_weights["economy_20mpg"] = h2o_data_doubled_weights["economy_20mpg"].asfactor()
    h2o_data_doubled_weights["cylinders"] = h2o_data_doubled_weights["cylinders"].asfactor()

    print("Checking that doubling some weights is equivalent to doubling those observations:")
    print(check_same(h2o_data_doubled, h2o_data_doubled_weights, 1))
def interactions_GLM_Binomial():
    """Check that binomial GLM interaction terms give equal coefficients with and without NAs.

    First trains two GLMs with three interaction pairs (num by num, num by
    enum, enum by enum), one on a frame containing NA values and one on the
    same frame with those values filled in, and compares their coefficient
    tables with ``assert_arrays_equal_NA``. Then runs ``performOneTest`` for
    a single num-by-num, enum-by-enum and enum-by-num interaction.
    """
    # test multiple interactions_GLM_Binomial enum by enum, enum by num and num by num all with NA terms
    print("******* Test interaction pairs")
    all_columns = ['label', 'numerical_feat', 'numerical_feat2',
                   'categorical_feat', 'categorical_feat2']

    rows_with_na = np.array([[1, 0, 1, 0, 1, 0],
                             [1, 2, 4.2 / 2.2, 4, 3, 1],
                             [2, 3, float('NaN'), 1, 2, 3],
                             ["a", "a", "a", "b", "a", "b"],
                             ['Foo', 'UNKNOWN', 'Foo', 'Foo', 'Foo', 'Bar']])
    pd_df_NA = pd.DataFrame(rows_with_na.T, columns=all_columns)
    h2o_df_NA = h2o.H2OFrame(pd_df_NA, na_strings=["UNKNOWN"])

    rows_filled = np.array([[1, 0, 1, 0, 1, 0],
                            [1, 2, 4.2 / 2.2, 4, 3, 1],
                            [2, 3, 2.2, 1, 2, 3],
                            ["a", "a", "a", "b", "a", "b"],
                            ['Foo', 'Foo', 'Foo', 'Foo', 'Foo', 'Bar']])
    pd_df = pd.DataFrame(rows_filled.T, columns=all_columns)
    h2o_df = h2o.H2OFrame(pd_df, na_strings=["UNKNOWN"])

    interaction_pairs = [("numerical_feat", "numerical_feat2"),
                         ("numerical_feat", "categorical_feat2"),
                         ("categorical_feat", "categorical_feat2")]
    xcols = all_columns[1:]  # every column except the label

    def train_glm(frame):
        # Same GLM settings for both frames; only the training data differs.
        glm = H2OGeneralizedLinearEstimator(family="Binomial",
                                            alpha=0,
                                            lambda_search=False,
                                            interaction_pairs=interaction_pairs,
                                            standardize=False)
        glm.train(x=xcols, y='label', training_frame=frame)
        return glm

    # build one model on the frame with NAs and one on the frame without
    modelNA = train_glm(h2o_df_NA)
    model = train_glm(h2o_df)
    assert_arrays_equal_NA(modelNA._model_json['output']['coefficients_table'].cell_values,
                           model._model_json['output']['coefficients_table'].cell_values)

    # test interaction of num and num columns
    print("******* Test interaction with num by num")
    num_num_columns = ['label', 'numerical_feat', 'numerical_feat2']
    pd_df_num_num_NA = pd.DataFrame(
        np.array([[1, 0, 1, 0], [1, 2, 2, 4], [2, 3, float('NaN'), 1]]).T,
        columns=num_num_columns)
    pd_df_num_num = pd.DataFrame(
        np.array([[1, 0, 1, 0], [1, 2, 2, 4], [2, 3, 2, 1]]).T,
        columns=num_num_columns)
    performOneTest(pd_df_num_num_NA, pd_df_num_num,
                   interactionColumn=['numerical_feat', 'numerical_feat2'],
                   xcols=['numerical_feat', 'numerical_feat2'],
                   standard=False)

    # test interaction of enum and enum columns
    print("******* Test interaction with enum by enum")
    cat_cat_columns = ['label', 'categorical_feat', 'categorical_feat2']
    pd_df_cat_cat_NA = pd.DataFrame(
        np.array([[1, 0, 1, 0], ["a", "a", "b", "b"], ['Foo', 'UNKNOWN', 'Foo', 'Bar']]).T,
        columns=cat_cat_columns)
    pd_df_cat_cat = pd.DataFrame(
        np.array([[1, 0, 1, 0], ["a", "a", "b", "b"], ['Foo', 'Foo', 'Foo', 'Bar']]).T,
        columns=cat_cat_columns)
    performOneTest(pd_df_cat_cat_NA, pd_df_cat_cat,
                   interactionColumn=['categorical_feat', 'categorical_feat2'],
                   xcols=['categorical_feat', 'categorical_feat2'])

    # test interaction of enum and num columns
    print("******* Test interaction with enum by num")
    cat_num_columns = ['label', 'numerical_feat', 'categorical_feat']
    pd_df_cat_num_NA = pd.DataFrame(
        np.array([[1, 0, 1, 0], [1, 2, 3, 4], ['Foo', 'UNKNOWN', 'Foo', 'Bar']]).T,
        columns=cat_num_columns)
    pd_df_cat_num = pd.DataFrame(
        np.array([[1, 0, 1, 0], [1, 2, 3, 4], ['Foo', 'Foo', 'Foo', 'Bar']]).T,
        columns=cat_num_columns)
    performOneTest(pd_df_cat_num_NA, pd_df_cat_num,
                   interactionColumn=['numerical_feat', 'categorical_feat'],
                   xcols=['numerical_feat', 'categorical_feat'])
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    """Train an H2O model on ``X``/``y`` and store the serialized result.

    The data is uploaded to the H2O instance on ``config.h2o_recipes_port``,
    a model built by ``self.make_instance()`` is trained, saved under
    ``temporary_files_path``, read back as raw bytes and handed to
    ``self.set_model_properties`` with the feature names and importances.

    Args:
        X: feature data; anything ``dt.Frame`` accepts.
        y: target values, uploaded as a single column named ``self.target``.
        sample_weight: accepted but not used by this implementation.
        eval_set: optional sequence whose first item is a
            ``(features, target)`` pair used as validation frame; the
            features must offer ``to_pandas()``.
        sample_weight_eval_set: accepted but not used by this implementation.
        **kwargs: accepted but not used here.
    """
    X = dt.Frame(X)

    h2o.init(port=config.h2o_recipes_port)
    # Stays None until the model file has actually been written, so the
    # cleanup below never tries to delete a path that was not created.
    model_path = None

    orig_cols = list(X.names)
    target_type = 'categorical' if self.num_classes >= 2 else 'numeric'

    train_X = h2o.H2OFrame(X.to_pandas())
    train_y = h2o.H2OFrame(y, column_names=[self.target], column_types=[target_type])
    train_frame = train_X.cbind(train_y)

    valid_frame = None
    valid_X = None
    valid_y = None
    model = None
    if eval_set is not None:
        valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas())
        valid_y = h2o.H2OFrame(eval_set[0][1], column_names=[self.target], column_types=[target_type])
        valid_frame = valid_X.cbind(valid_y)

    try:
        model = self.make_instance()
        model.train(x=train_X.names, y=self.target, training_frame=train_frame,
                    validation_frame=valid_frame)
        self.id = model.model_id

        model_dir = os.path.join(temporary_files_path, "h2o_model." + str(uuid.uuid4()))
        model_path = h2o.save_model(model=model, path=model_dir)
        with open(model_path, "rb") as f:
            raw_model_bytes = f.read()

    finally:
        if model_path is not None:
            os.remove(model_path)
        # Release everything this call put on the H2O cluster.
        for xx in [train_frame, train_X, train_y, model, valid_frame, valid_X, valid_y]:
            if xx is not None:
                h2o.remove(xx)

    df_varimp = model.varimp(True)
    if df_varimp is None:
        varimp = np.ones(len(orig_cols))
    else:
        df_varimp.index = df_varimp['variable']
        df_varimp = df_varimp.iloc[:, 1]  # relative importance
        # Features without a reported importance get 0 instead of raising
        # KeyError in the lookup below.
        known = set(df_varimp.index)
        for missing in [x for x in orig_cols if x not in known]:
            df_varimp[missing] = 0
        varimp = df_varimp[orig_cols].values  # order by fitted features
    varimp = np.nan_to_num(varimp)

    self.set_model_properties(model=raw_model_bytes,
                              features=orig_cols,
                              importances=varimp,
                              iterations=self.get_iterations(model))