def H2OBuildModel(self):
    weather = "serm.csv"
    weather_df = h2o.import_file(path=weather)

    global model
    global test

    # ~70/15/15 train/test/valid split (split_frame expects a list of ratios)
    train, test, valid = weather_df.split_frame(ratios=[0.7, 0.15])

    estimator_index = self.tabWidget_PM_Estimator.currentIndex()
    if estimator_index == 0:
        # Qt widgets return strings; convert them to the types H2O expects.
        # (Assumes the hidden-layer combo box holds a comma-separated list such as "200,200".)
        _distribution = self.comboBox_PM_distribution.currentText()
        _activation = self.comboBox_PM_activation.currentText()
        _hidden = [int(n) for n in self.comboBox_PM_hidden.currentText().split(",")]
        _epochs = self.spinBox_PM_epochs.value()
        _sparse = self.comboBox_PM_sparse.currentText() == "True"
        _shuffle = self.comboBox_PM_shuffle.currentText() == "True"

        model = H2ODeepLearningEstimator(distribution=_distribution,
                                         activation=_activation,
                                         hidden=_hidden,
                                         shuffle=_shuffle,
                                         sparse=_sparse,
                                         epochs=_epochs)
        model.train(y="risk",
                    x=["datetime", "ffwi", "smoke", "temperature", "humidity", "windspeed"],
                    training_frame=train)
        # train() blocks until training finishes, so report completion afterwards
        # instead of spinning a busy-loop with fractional increments.
        self.progressBar.setValue(100)

        # Metrics are read off the performance object via its accessor methods.
        metrics = model.model_performance()
        self.lineEdit_PM_MSE.setText(str(round(metrics.mse(), 5)))
        self.lineEdit_PM_RMSE.setText(str(round(metrics.rmse(), 5)))
        self.lineEdit_PM_MAE.setText(str(round(metrics.mae(), 5)))
        self.lineEdit_PM_MRD.setText(str(round(metrics.mean_residual_deviance(), 5)))
def bake(self) -> H2ODeepLearningEstimator:
    fr = titanic_frame()
    fr["parch"] = fr["parch"].asfactor()
    model = H2ODeepLearningEstimator(epochs=50, reproducible=True)
    model.train(y="parch", training_frame=fr,
                ignored_columns=["name", "ticket", "boat", "home.dest"])
    return model
def bake(self) -> H2ODeepLearningEstimator:
    fr = names_frame()
    fr = fr[:5000, :]
    fr["name"] = fr["name"].ascharacter().asfactor()  # trim nlevels()
    assert 256 < fr["name"].nlevels()[0] < 500
    model = H2ODeepLearningEstimator(epochs=100, reproducible=True)
    model.train(y="sex", training_frame=fr)
    return model
def bake(self) -> H2ODeepLearningEstimator:
    fr = stars_frame()
    assert fr.type("distance") == "int"
    model = H2ODeepLearningEstimator(epochs=100, reproducible=True)
    model.train(y="distance", training_frame=fr, ignored_columns=["name1", "name2"])
    return model
def demo_body(go):
    """
    Demo of H2O's Deep Learning model.

    This demo uploads a dataset to h2o, parses it, and shows a description.
    Then it divides the dataset into training and test sets, builds a deep
    learning model from the training set, and makes predictions for the test
    set. Finally, default performance metrics are displayed.
    """
    go()
    # Connect to H2O
    h2o.init()

    go()
    # Upload the prostate dataset that comes included in the h2o python package
    prostate = h2o.upload_file(data_file("h2o_data/prostate.csv"))

    go()
    # Print a description of the prostate data
    prostate.summary()

    go()
    # Randomly split the dataset into ~70/30, training/test sets
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    go()
    # Convert the response columns to factors (for binary classification problems)
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    go()
    # Build a (classification) deep learning model
    from h2o.estimators import H2ODeepLearningEstimator
    prostate_dl = H2ODeepLearningEstimator(activation="Tanh", hidden=[10, 10, 10], epochs=10000)
    prostate_dl.train(x=list(set(prostate.col_names) - {"ID", "CAPSULE"}),
                      y="CAPSULE", training_frame=train)

    go()
    # Show the model
    prostate_dl.show()

    go()
    # Predict on the test set and show the first ten predictions
    predictions = prostate_dl.predict(test)
    predictions.show()

    go()
    # Show default performance metrics
    performance = prostate_dl.model_performance(test)
    performance.show()
def final_train(self, train: pd.DataFrame, valid: pd.DataFrame):
    train_hex = h2o.H2OFrame(train)
    valid_hex = h2o.H2OFrame(valid)
    self.listCheckpointsNN = list()
    counter = 1
    for model in self.listNNModels:
        name = str(model.model_id) + str(counter)
        model_chkp = H2ODeepLearningEstimator(
            checkpoint=model.model_id,
            model_id=name,
            activation=model.actual_params.get("activation"),  # was "acivation", a typo
            training_frame=train_hex,
            validation_frame=valid_hex,
            stopping_tolerance=1e-4,
            stopping_rounds=3,
            mini_batch_size=24,
            epochs=1e6,
            hidden=model.actual_params.get("hidden"),
            rate=model.actual_params.get("rate"),
            rate_annealing=model.actual_params.get("rate_annealing"),
            distribution=model.actual_params.get("distribution"),
            categorical_encoding=model.actual_params.get("categorical_encoding"),
            standardize=model.actual_params.get("standardize"),
            adaptive_rate=model.actual_params.get("adaptive_rate"),
            nesterov_accelerated_gradient=model.actual_params.get("nesterov_accelerated_gradient"),
            shuffle_training_data=model.actual_params.get("shuffle_training_data"),
            stopping_metric=model.actual_params.get("stopping_metric"),
            train_samples_per_iteration=0,
            score_validation_samples=0,  # downsample validation set for faster scoring
            score_duty_cycle=0.025,      # don't score more than 2.5% of the wall time
            max_w2=model.actual_params.get("max_w2"))  # can help improve stability for Rectifier
        model_chkp.train(x=self.predictors,
                         y=self.response,
                         training_frame=train_hex,
                         validation_frame=valid_hex)
        counter = counter + 1
        self.listCheckpointsNN.append(model_chkp)
    self.listNNModels = self.listCheckpointsNN
    NNH2o.print_model_params(self.listNNModels, False)
def test(x, y, output_test, strip_part, algo_name, generic_algo_name):
    airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))
    # Renamed from `gbm`: despite the old name, this is a Deep Learning model.
    dl = H2ODeepLearningEstimator(epochs=1)
    dl.train(x=x, y=y, training_frame=airlines, validation_frame=airlines)
    print(dl)
    with Capturing() as original_output:
        dl.show()

    original_model_filename = tempfile.mkdtemp()
    original_model_filename = dl.download_mojo(original_model_filename)

    key = h2o.lazy_import(original_model_filename)
    fr = h2o.get_frame(key[0])
    generic_mojo_model = H2OGenericEstimator(model_key=fr)
    generic_mojo_model.train()
    compare_params(dl, generic_mojo_model)

    print(generic_mojo_model)
    with Capturing() as generic_output:
        generic_mojo_model.show()
    output_test(str(original_output), str(generic_output), strip_part,
                algo_name, generic_algo_name)

    predictions = generic_mojo_model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421
    assert generic_mojo_model._model_json["output"]["model_summary"] is not None
    assert len(generic_mojo_model._model_json["output"]["model_summary"]._cell_values) > 0

    # Test constructor generating the model from existing MOJO file
    generic_mojo_model_from_file = H2OGenericEstimator.from_file(original_model_filename)
    assert generic_mojo_model_from_file is not None
    predictions = generic_mojo_model_from_file.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421
    assert generic_mojo_model_from_file._model_json["output"]["model_summary"] is not None
    assert len(generic_mojo_model_from_file._model_json["output"]["model_summary"]._cell_values) > 0

    generic_mojo_filename = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = generic_mojo_model_from_file.download_mojo(path=generic_mojo_filename)
    assert os.path.getsize(generic_mojo_filename) == os.path.getsize(original_model_filename)
# Set mapper
df_mapper = DataFrameMapper([(training_columns, None), (response_column, None)])

# Test data - pandas to sklearn
test_tmp = df_mapper.fit_transform(testing_frame)

# [row : column]
column_count = len(test_tmp[0, :])

# ground truth
tY = np.array(testing_frame['RUL'])

# Building models: five identical networks to be trained separately
model1 = H2ODeepLearningEstimator(hidden=[200, 200], score_each_iteration=False, variable_importances=True)
model2 = H2ODeepLearningEstimator(hidden=[200, 200], score_each_iteration=False, variable_importances=True)
model3 = H2ODeepLearningEstimator(hidden=[200, 200], score_each_iteration=False, variable_importances=True)
model4 = H2ODeepLearningEstimator(hidden=[200, 200], score_each_iteration=False, variable_importances=True)
model5 = H2ODeepLearningEstimator(hidden=[200, 200], score_each_iteration=False, variable_importances=True)

# train model
training_columns.remove(response_column)
training_columns.remove("UnitNumber")
training_columns.remove("Time")

# split frames
train, validate = hTrain.split_frame([_validation_ratio])
test = hTest
ground_truth = np.array(pTest['RUL'])

# Building models
print("Building models")
print("---------------")
model_arr = [None] * _nmodels
for i in range(_nmodels):
    model_arr[i] = H2ODeepLearningEstimator(hidden=[200, 200],
                                            score_each_iteration=True,
                                            variable_importances=True)
print("Build model complete...\n")

print("Train models")
print("------------")
for i in range(_nmodels):
    print("Train : " + str(i + 1) + "/" + str(_nmodels))
    model_arr[i].train(x=training_columns, y=response_column, training_frame=train)
print("Train model complete...\n")

print("Validate models")
print("---------------")
mse_val = np.zeros(shape=_nmodels)
for i in range(_nmodels):
    # mse() takes train/valid/xval flags, so read MSE off the performance object instead
    mse_val[i] = model_arr[i].model_performance(test_data=validate).mse()
print("Validation model complete...\n")
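# A natural follow-up, sketched here as an assumption (not in the original
# snippet): keep whichever network scored the lowest MSE on the validation
# frame and score it against the held-out test frame.
best_index = int(np.argmin(mse_val))
best_model = model_arr[best_index]
print("Best model : " + str(best_index + 1) + "/" + str(_nmodels) +
      " (validation MSE = " + str(mse_val[best_index]) + ")")
print(best_model.model_performance(test_data=test))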
# Remove anomalies
p_filtered = p_train.drop(p_train.index[rm_index])

# Convert pandas to H2OFrame
h_data = h2o.H2OFrame(p_filtered)
h_data.set_names(list(p_data.columns))

# DeepLearning model training and validation
h_train, h_validate = h_data.split_frame(ratios=[_vr_model])

# Extract ground truth data
ground_truth_data = np.array(p_test[response_column])

# Define columns
dl_train_columns = list(p_filtered.columns)
rm_columns = ['RUL', 'UnitNumber', 'Time']
for column in rm_columns:
    dl_train_columns.remove(column)

model = H2ODeepLearningEstimator(epochs=100, loss='Automatic',
                                 activation='RectifierWithDropout',
                                 distribution='poisson', hidden=[512])
model.train(x=dl_train_columns, y=response_column,
            training_frame=h_train, validation_frame=h_validate)

performance = model.model_performance(test_data=h_test)
print(performance)
import h2o
from h2o.estimators import H2ODeepLearningEstimator

h2o.init()

train = h2o.import_file('dataset/train.csv')
test = h2o.import_file('dataset/test.csv')

# define columns
response_column = 'SalePrice'
training_columns = train.col_names
training_columns.remove(response_column)

# train[training_columns].describe()
# OverallQual, OverallCond, GrLivArea
model = H2ODeepLearningEstimator(nfolds=10, epochs=100, hidden=[500, 500])
model.train(x=training_columns, y=response_column, training_frame=train)

h2o.export_file(frame=model.predict(test_data=test), path='prediction.csv', force=True)
print(model.model_performance())
training_columns = list(pData.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')

response_column = 'RUL'

hyper_parameters = {
    'activation': ['tanh', 'tanh_with_dropout', 'rectifier', 'rectifier_with_dropout',
                   'maxout', 'maxout_with_dropout'],
    'distribution': ['auto', 'bernoulli', 'multinomial', 'gaussian', 'poisson', 'gamma',
                     'tweedie', 'laplace', 'quantile', 'huber'],
    'epochs': [100],
    'hidden': [[512]],  # each grid candidate for `hidden` must itself be a layer list
    'loss': ['automatic']
}

grid_search = H2OGridSearch(H2ODeepLearningEstimator(nfolds=10),  # was `nfold`, a typo
                            hyper_params=hyper_parameters)
grid_search.train(x=training_columns, y='RUL',
                  training_frame=hTrain, validation_frame=hValidate)
grid_search.show()

models = grid_search.get_grid(sort_by='mse')  # sort_by() on the grid object is deprecated
print(models)
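# Hedged follow-up sketch (not in the original snippet): once the grid is
# sorted, the leading model can be fetched by id and scored. Assumes the
# hTest frame used elsewhere in this collection.
best_model = h2o.get_model(models.model_ids[0])  # sorted grids list model ids best-first
print(best_model.model_performance(test_data=hTest))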
def binary_class(self, type, target, duplicated, sep, exclude, max_runtime_secs):
    img = plt.figure()
    self.write_image(img, 'blank', width=600, height=500)

    self.gstep(0, "Reading Dataset")
    buffer = io.StringIO()
    self.dfo.columns = [c.replace(' ', '_') for c in self.dfo.columns]

    self.gstep(1, "Verify if duplicated")
    self.insert_text("shape", str(self.dfo.shape[0]) + ' / ' + str(self.dfo.shape[1]))
    self.get_classes(self.dfo, target)
    self.insert_text("nclasses", str(self.nclasses))
    self.insert_text("allclasses", str(self.allclasses))
    shape_before = self.dfo.shape[0]
    if duplicated:
        self.dfo = self.dfo.drop_duplicates(self.dfo.columns)
    shape_after = self.dfo.shape[0]
    if shape_before == shape_after:
        self.insert_text("duplicated", "none")
    else:
        self.insert_text("duplicated", str(shape_after - shape_before))
    if exclude != 'none':
        self.dfo.drop(columns=exclude, inplace=True)

    self.gstep(1, "Detecting hi frequency features")
    exclude = self.hi_freq(self.dfo)
    self.dfo.drop(columns=exclude['Feature'], inplace=True)
    hi_freq = self.w_table(data=exclude, border=0, align='left',
                           collapse='collapse', color='black', foot=False)
    self.insert_text("excluded", hi_freq)

    self.gstep(1, "Encoding as sort_by_response")
    self.dfo_encode = self.encode(self.dfo.copy())

    self.gstep(1, "Basic Information")
    df_info = pd.DataFrame()
    for column in self.dfo.columns:
        not_null = int(self.dfo.shape[0] - int(self.dfo[column].isna().sum()))
        dtype = self.dfo[column].dtypes
        df_info = df_info.append({'column': column, 'not_null': not_null, 'dtype': dtype},
                                 ignore_index=True)
    df_info['not_null'] = df_info['not_null'].apply(lambda x: int(x))
    df_info['percent'] = df_info['not_null'].apply(
        lambda x: float("{:.4f}".format(1 - (x / self.dfo.shape[0]))))
    info_dataset = self.w_table(data=df_info, border=0, align='left',
                                collapse='collapse', color='black', foot=False)
    self.insert_text("info_dataset", info_dataset)

    self.gstep(1, "Computing Regression")
    Y = self.dfo_encode[target]
    dfo_num = self.dfo_encode[self.dfo_encode._get_numeric_data().columns]
    X = dfo_num.drop(columns=[target])
    # Create the train and test splits
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
    cols = X.columns
    formule = " + ".join(map(str, cols))
    formule = target + " ~ " + formule
    reg = smf.ols(formule, data=dfo_num)
    res = reg.fit()
    self.insert_text('regression', str(res.summary()))

    self.gstep(1, "Unbalanced Classes")
    temp = self.dfo[target].value_counts()
    df = pd.DataFrame({target: temp.index, 'values': temp.values})
    plt.figure(figsize=(6, 6))
    plt.title('Data Set - target value - data unbalance\n (' + target + ')')
    sns.set_color_codes("pastel")
    sns.barplot(x=target, y="values", data=df)
    locs, labels = plt.xticks()
    self.write_image(plt, "unbalance", width=500, height=350, crop=True)

    self.gstep(1, "Correlation")
    plt.clf()
    corr = self.dfo_encode.corr()
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    cmap = sns.diverging_palette(230, 20, as_cmap=True)
    plt.figure(figsize=(8, 8))
    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
                annot=True, square=True, linewidths=1.5, cbar_kws={"shrink": .5})
    self.write_image(plt, "corr", width=0, height=0, crop=True)

    self.gstep(1, "Detecting Multicollinearity with VIF")
    y = self.dfo_encode[target]
    y = y.apply(lambda x: 1 if x == 'yes' else 0)
    X = self.dfo_encode.drop(target, axis=1)
    X = X[X._get_numeric_data().columns]
    X = X.fillna(0)
    X = X.dropna()
    vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    cols = X.columns
    cols = cols[cols != target]
    df_m = pd.DataFrame({'cols': cols, 'vif': vif})
    df_m['significant'] = ''
    df_m['significant'] = df_m['vif'].apply(self.parse_values)
    m_vif = self.w_table(data=df_m, border=0, align='left',
                         collapse='collapse', color='black', foot=False)
    self.insert_text("vif", str(m_vif))

    i = 2
    text = ''
    text2 = ''
    for column in self.dfo.columns:
        feature = self.dfo[column].describe()
        text = text + '<option value="' + str(i) + '"> ' + column + ' </option>\n\t\t\t\t\t\t\t\t'
        text2 = text2 + "\n\t\t\t\t\t\t\t\t\t\t} else if (selectedValue == '" + str(i) \
            + "') {\n\t\t\t\t\t\t\t\tdivElement.innerHTML = '" \
            + pd.DataFrame(feature).to_html().replace('\n', '') + "';\n\t\t\t\t\t\t\t\t"
        i = i + 1
    text2 = text2 + '\n\t\t\t\t\t\t\t\t};'
    self.insert_text('vif_desc_option', text)
    self.insert_text('vif_desc_table', text2)

    self.gstep(1, "Residual Analysis")
    plt.clf()
    model = Ridge()
    visualizer = ResidualsPlot(model, hist=False, qqplot=True)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    self.write_image(plt, "residual1", width=500, height=350, crop=True)
    plt.clf()
    visualizer = ResidualsPlot(model, hist=True, qqplot=False)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    self.write_image(plt, "residual2", width=500, height=350, crop=True)

    self.gstep(1, "Initializing H2O")
    h2o.init()

    self.gstep(1, "Parsing Data Frame")
    df = h2o.H2OFrame(self.dfo_encode)

    self.gstep(1, "Training Auto Machine Learning")
    train, valid, test = df.split_frame(ratios=[0.7, 0.2], seed=1234)
    x = train.columns
    y = target
    x.remove(y)
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()
    aml = H2OAutoML(max_models=20, max_runtime_secs=max_runtime_secs, seed=1,
                    include_algos=["GLM", "DeepLearning", "DRF",
                                   "XGBoost", "StackedEnsemble"],  # was "xGBoost"; H2O expects "XGBoost"
                    balance_classes=True)
    aml.train(x=x, y=y, training_frame=train)
    lb = h2o.automl.get_leaderboard(aml, extra_columns='ALL')
    lb = lb.as_data_frame()
    lb = lb.drop(columns=['rmse', 'mse', 'predict_time_per_row_ms'])
    text = self.w_table(lb)
    self.insert_text('auto_ml_results', text)
    self.write_image(aml.varimp_heatmap(), 'var_imp_model', width=450, height=400, crop=True)

    self.gstep(1, "AML - Partial Dependence")
    i = 101
    text = ''
    text2 = ''
    for column in tqdm(self.dfo.columns):
        feature = self.dfo[column].describe()
        text = text + '<option value="' + str(i) + '"> ' + column + ' </option>\n\t\t\t\t\t\t\t\t'
        text2 = text2 + "\n\t\t\t\t\t\t\t\t\t\t} else if (selectedValue2 == '" + str(i) \
            + "'){\n\t\t\t\t\t\t\t\tdivElement2.innerHTML = '<img src=\"images/img_aml_pd_" \
            + str(i) + ".png\">';\n\t\t\t\t\t\t\t\t"
        self.write_image(aml.pd_multi_plot(valid, column), 'aml_pd_' + str(i),
                         width=600, height=500)
        i = i + 1
    text2 = text2 + '\n\t\t\t\t\t\t\t\t};'
    self.insert_text('aml_pd_option', text)
    self.insert_text('aml_pd_image', text2)

    self.gstep(1, "Training (GLM) Generalized Linear Model to Ensemble")
    nfolds = 5
    family = "binomial"
    amlr_glm = H2OGeneralizedLinearEstimator(
        family=family,
        nfolds=nfolds,
        lambda_=0,
        max_runtime_secs=max_runtime_secs,
        balance_classes=True,
        fold_assignment="Modulo",
        compute_p_values=True,
        keep_cross_validation_predictions=True,
        remove_collinear_columns=True)
    amlr_glm.train(x, y, training_frame=train)

    self.gstep(1, "Training (DRF) Distributed Random Forest to Ensemble")
    amlr_rf = H2ORandomForestEstimator(
        ntrees=50,
        nfolds=nfolds,
        fold_assignment="Modulo",
        max_runtime_secs=max_runtime_secs,
        balance_classes=True,
        keep_cross_validation_predictions=True,
        seed=1)
    amlr_rf.train(x=x, y=y, training_frame=train)
    self.gstep(1, "Training (GBM) Gradient Boosting Model to Ensemble")
    amlr_gbm = H2OGradientBoostingEstimator(
        nfolds=nfolds,
        seed=1111,
        balance_classes=True,
        fold_assignment="Modulo",
        max_runtime_secs=max_runtime_secs,
        keep_cross_validation_predictions=True)
    amlr_gbm.train(x=x, y=y, training_frame=train)

    self.gstep(1, "Training XGBoost Model to Ensemble")
    amlr_xgb = H2OXGBoostEstimator(booster='dart',
                                   nfolds=nfolds,
                                   normalize_type="tree",
                                   fold_assignment="Modulo",
                                   max_runtime_secs=max_runtime_secs,
                                   keep_cross_validation_predictions=True,
                                   seed=1234)
    amlr_xgb.train(x=x, y=y, training_frame=train, validation_frame=valid)

    self.gstep(1, "Training Deep Learning Model to Ensemble")
    family = "bernoulli"
    dl_model = H2ODeepLearningEstimator(distribution=family,
                                        hidden=[1],
                                        epochs=1000,
                                        train_samples_per_iteration=-1,
                                        reproducible=True,
                                        activation="Tanh",
                                        single_node_mode=False,
                                        balance_classes=True,
                                        force_load_balance=False,
                                        seed=23123,
                                        tweedie_power=1.5,  # only relevant for distribution="tweedie"; ignored here
                                        max_runtime_secs=max_runtime_secs,
                                        score_training_samples=0,
                                        score_validation_samples=0,
                                        stopping_rounds=0)
    dl_model.train(x=x, y=y, training_frame=train)

    self.gstep(1, "Training Ensemble")
    ensemble = H2OStackedEnsembleEstimator(
        model_id="amlr_ensemble",
        base_models=[amlr_gbm, amlr_rf, amlr_xgb, amlr_glm])
    ensemble.train(x=x, y=y, training_frame=train)

    i = 201
    text = ''
    text2 = ''
    self.gstep(1, "Ensemble - (ICE) Individual Conditional Expectation")
    for column in tqdm(self.dfo.columns):
        feature = self.dfo[column].describe()
        text = text + '<option value="' + str(i) + '"> ' + column + ' </option>\n\t\t\t\t\t\t\t\t'
        text2 = text2 + "\n\t\t\t\t\t\t\t\t\t\t} else if (selectedValue3 == '" + str(i) \
            + "'){\n\t\t\t\t\t\t\t\tdivElement3.innerHTML = '<img src=\"images/img_ice_pd_" \
            + str(i) + ".png\">';\n\t\t\t\t\t\t\t\t"
        self.write_image(ensemble.ice_plot(valid, column), 'ice_pd_' + str(i),
                         width=600, height=500)
        i = i + 1
    text2 = text2 + '\n\t\t\t\t\t\t\t\t};'
    self.insert_text('ice_pd_option', text)
    self.insert_text('ice_pd_image', text2)

    self.gstep(1, "AMLR - Correlation by Model")
    self.write_image(aml.model_correlation_heatmap(test), 'aml_correlation_models')

    self.gstep(1, "Processing Models Performance")
    i = 0
    dfp = pd.DataFrame({'Algo': []})
    outcome = list(valid[target].as_data_frame()[target])
    for algo in ['GLM', 'Random Forest', 'GBM', 'XGBoost', 'Deep Learning']:
        plt.clf()
        if algo == 'GLM':
            predict = list(amlr_glm.predict(valid).as_data_frame()['predict'])
            cf_table = 'cf_glm'
            cm_glm = ConfusionMatrix(outcome, predict)
            glm_var_imp = amlr_glm._model_json['output']['variable_importances'].as_data_frame()
            x = glm_var_imp['percentage']
            x.index = glm_var_imp['variable']
            x.sort_values().plot(kind='barh')
            plt.xlabel('Percentage')
            fig = plt.gcf()
            self.write_image(fig, 'fi_glm', width=450, height=450)
        if algo == 'Random Forest':
            predict = list(amlr_rf.predict(valid).as_data_frame()['predict'])
            cf_table = 'cf_rf'
            cm_rf = ConfusionMatrix(outcome, predict)
            rf_var_imp = amlr_rf._model_json['output']['variable_importances'].as_data_frame()
            x = rf_var_imp['percentage']
            x.index = rf_var_imp['variable']
            x.sort_values().plot(kind='barh')
            plt.xlabel('Percentage')
            fig = plt.gcf()
            self.write_image(fig, 'fi_rf', width=450, height=450)
        if algo == 'GBM':
            predict = list(amlr_gbm.predict(valid).as_data_frame()['predict'])
            cf_table = 'cf_gbm'
            cm_gbm = ConfusionMatrix(outcome, predict)
            gbm_var_imp = amlr_gbm._model_json['output']['variable_importances'].as_data_frame()
            x = gbm_var_imp['percentage']
            x.index = gbm_var_imp['variable']
            x.sort_values().plot(kind='barh')
            plt.xlabel('Percentage')
            fig = plt.gcf()
            self.write_image(fig, 'fi_gbm', width=450, height=450)
        if algo == 'XGBoost':
            predict = list(amlr_xgb.predict(valid).as_data_frame()['predict'])
            cf_table = 'cf_xgb'
            cm_xgb = ConfusionMatrix(outcome, predict)
            xgb_var_imp = amlr_xgb._model_json['output']['variable_importances'].as_data_frame()
            x = xgb_var_imp['percentage']
            x.index = xgb_var_imp['variable']
            x.sort_values().plot(kind='barh')
            plt.xlabel('Percentage')
            fig = plt.gcf()
            self.write_image(fig, 'fi_xgb', width=450, height=450)
        if algo == 'Deep Learning':
            predict = list(dl_model.predict(valid).as_data_frame()['predict'])
            cf_table = 'cf_dl'
            cm_dl = ConfusionMatrix(outcome, predict)
            dl_var_imp = dl_model._model_json['output']['variable_importances'].as_data_frame()
            x = dl_var_imp['percentage']
            x.index = dl_var_imp['variable']
            x.sort_values().plot(kind='barh')
            plt.xlabel('Percentage')
            fig = plt.gcf()
            self.write_image(fig, 'fi_dl', width=450, height=450)

        # Confusion Matrix for all models
        cm = confusion_matrix(predict, outcome)
        cm = pd.DataFrame(cm)
        cr = classification_report(outcome, predict,
                                   target_names=self.allclasses, output_dict=True)
        table_cr = pd.DataFrame(cr).transpose().round(4)
        table_cr.reset_index(level=0, inplace=True)
        table_cr = table_cr.rename(columns={'index': 'Description'})
        table_model = self.w_table(data=table_cr, border=0, align='left',
                                   collapse='collapse', color='black', foot=False)
        self.insert_text(cf_table, str(table_model))

        # Statistics for all metrics
        cm = ConfusionMatrix(outcome, predict)
        dfp = pd.concat([dfp, pd.DataFrame(cm.overall_stat)[1:]], ignore_index=True)
        dfp.loc[i:, ['Algo']] = algo
        i = i + 1

    dfp = dfp.round(4)
    cp = Compare({'RF': cm_rf, 'GLM': cm_glm, 'GBM': cm_gbm, 'XGB': cm_xgb, 'DL': cm_dl})
    cp_best_name = cp.best_name
    cp = pd.DataFrame(cp.scores)
    cp.reset_index(level=0, inplace=True)
    cp = cp.rename(columns={'index': 'Description'})
    table_cp = self.w_table(data=cp, border=0, align='left',
                            collapse='collapse', color='black', foot=False)
    if str(cp_best_name) == 'None':
        cp_best_name = 'Confusion matrices are too close and the best one cannot be recognized.'
        max_v = cp.loc[0][1:].max()
        i = 0
        list_max = list()
        for column in cp.columns:
            if i > 0:
                if cp[column][0] >= max_v:
                    list_max.append(column)
            i = i + 1
        self.insert_text("the_best_name",
                         "Winners: " + ' - '.join(list_max) + '<br>' + cp_best_name)
    else:
        self.insert_text("the_best_name", str(cp_best_name))
    self.insert_text("best_algorithms", str(table_cp))

    table_model = self.w_table(data=dfp, border=0, align='left',
                               collapse='collapse', color='black', foot=False)
    self.insert_text("table_performance", str(table_model))

    self.gstep(1, "Closing!! All work is done!!")
    # write report
    self.write_report(self.index_html)
iq_testing_frame[col] = iq_testing_frame[col].asfactor()

# Training parameters
input_columns = list(sj_train.columns)
response_column = 'total_cases'
input_columns.remove(response_column)

# Models
sj_min_mae = 1000
sj_best_model = None
iq_min_mae = 1000
iq_best_model = None

for i in range(n_iterations):
    model_sj = H2ODeepLearningEstimator(nfolds=10, hidden=[512, 512])
    model_sj.train(x=input_columns, y=response_column, training_frame=sj_training_frame)

    model_iq = H2ODeepLearningEstimator(nfolds=10, hidden=[512, 512])
    model_iq.train(x=input_columns, y=response_column, training_frame=iq_training_frame)

    if model_sj.mae() < sj_min_mae:
        sj_min_mae = model_sj.mae()
        sj_best_model = model_sj

    if model_iq.mae() < iq_min_mae:
        iq_min_mae = model_iq.mae()
        iq_best_model = model_iq  # mirrors the sj branch; keeps the best iq model
del p_filter['Sensor10']
del p_filter['Sensor16']
del p_filter['Sensor18']
del p_filter['Sensor19']

h_filter = h2o.H2OFrame(p_filter)
h_filter.set_names(list(p_filter.columns))

h_test = h2o.H2OFrame(p_test)
h_test.set_names(list(p_test.columns))

training_columns = list(p_filter.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')
training_columns.remove('BIN')

h_filter['BIN'] = h_filter['BIN'].asfactor()
h_test['BIN'] = h_test['BIN'].asfactor()

model = H2ODeepLearningEstimator(epochs=100, nfolds=10, balance_classes=True)
model.train(x=training_columns, y='BIN', training_frame=h_filter)

predict = model.predict(test_data=h_test)
predict = DataFrameParser.h2oToList(predict['predict'])
actual = DataFrameParser.h2oToList(h_test['BIN'])

Measures.confusion_matrix(actual, predict)
print(predict)
print(actual)
# Extract ground truth data
ground_truth_data = np.array(p_test[response_column])

# Define columns
dl_train_columns = list(p_filtered.columns)
rm_columns = ['RUL', 'UnitNumber', 'Time']
for column in rm_columns:
    dl_train_columns.remove(column)

# Building multiple models
print("Building Models")
print("---------------")
model_array = [None] * _nmodels  # range() is not assignable in Python 3
for i in range(_nmodels):
    # nfolds is an estimator parameter, not a train() argument
    model_array[i] = H2ODeepLearningEstimator(nfolds=10)

# Training models
print("Training Models")
print("---------------")
for i in range(_nmodels):
    model_array[i].train(x=dl_train_columns, y=response_column, training_frame=h_train)

# Validate models and assign weights
print("Validating Models")
print("-----------------")
rmse_vals = np.zeros(shape=_nmodels)  # Store root mean squared error of each model
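# Hypothetical completion (the original snippet stops before the weights are
# computed): fill rmse_vals from each model's validation performance, then
# weight models by inverse RMSE. Assumes a validation frame named h_validate.
for i in range(_nmodels):
    rmse_vals[i] = model_array[i].model_performance(test_data=h_validate).rmse()

# Lower RMSE -> larger weight; normalize so the weights sum to 1
weights = (1.0 / rmse_vals) / np.sum(1.0 / rmse_vals)
print("Model weights :", weights)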
'''

q25 = np.percentile(err_list, 25)
q75 = np.percentile(err_list, 75)
iqr = q75 - q25

rm_index = []  # Stores row numbers which have anomalies
for i in range(h_train.nrow):
    if abs(err_list[i] - q75) > 3 * iqr:
        rm_index.append(i)

# Remove anomalies
p_filtered = p_train.drop(p_train.index[rm_index])

# Convert pandas to H2OFrame
h_data = h2o.H2OFrame(p_filtered)
h_data.set_names(list(p_data.columns))

# Define columns
dl_train_columns = list(p_filtered.columns)
rm_columns = ['RUL', 'UnitNumber', 'Time']
for column in rm_columns:
    dl_train_columns.remove(column)

# model = H2ODeepLearningEstimator(epochs=100, loss='Automatic', activation='RectifierWithDropout',
#                                  distribution='poisson', hidden=[512], nfolds=10)
model = H2ODeepLearningEstimator(epochs=100, hidden=[512], nfolds=10)
model.train(x=dl_train_columns, y=response_column, training_frame=h_data)

performance = model.model_performance(test_data=h_test)
print(performance)
pValidate = pd.read_csv("hValidateMy.csv")
pTest = pd.read_csv("hTestingMy.csv")

hTrain = h2o.H2OFrame(pTrain)
hTrain.set_names(list(pTrain.columns))

hValidate = h2o.H2OFrame(pValidate)
hValidate.set_names(list(pValidate.columns))

hTest = h2o.H2OFrame(pTest)
hTest.set_names(list(pTest.columns))

training_columns = list(pTrain.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')

response_column = 'RUL'

print("OK")
model = H2ODeepLearningEstimator(hidden=[1024], activation='Maxout', epochs=100)
# model = H2ORandomForestEstimator(ntrees=50, max_depth=20, nbins=100, seed=12345)
model.train(x=training_columns, y=response_column,
            training_frame=hTrain, validation_frame=hValidate)

print(model.model_performance(test_data=hTest))
def function():
    # AutoEncoder anomaly removal process
    p_train = ProcessData.trainData(moving_median_centered_average=True,
                                    standard_deviation=True,
                                    probability_distribution=True,
                                    bin_classification=True)
    p_test = ProcessData.testData(moving_median_centered_average=True,
                                  standard_deviation=True,
                                  probability_from_file=True,
                                  bin_classification=True)

    # Converting to h2o frame
    h_test = h2o.H2OFrame(p_test)
    h_test.set_names(list(p_test.columns))

    h_train = h2o.H2OFrame(p_train)
    h_train.set_names(list(p_train.columns))

    # Define autoencoder
    anomaly_model = H2OAutoEncoderEstimator(activation="Rectifier",
                                            hidden=[25, 12, 25],
                                            sparse=True,
                                            l1=1e-4,
                                            epochs=100)

    # Select relevant features
    anomaly_train_columns = list(p_train.columns)
    print(anomaly_train_columns)
    anomaly_train_columns.remove('RUL')
    anomaly_train_columns.remove('BIN')
    anomaly_train_columns.remove('UnitNumber')
    anomaly_train_columns.remove('Time')
    anomaly_train_columns.remove('Setting1')
    anomaly_train_columns.remove('Setting2')
    anomaly_train_columns.remove('Setting3')

    # Train model
    anomaly_model.train(x=anomaly_train_columns, training_frame=h_train)

    # Get reconstruction error
    reconstruction_error = anomaly_model.anomaly(test_data=h_train, per_feature=False)
    error_str = reconstruction_error.get_frame_data()
    err_list = list(map(float, error_str.split("\n")[1:-1]))
    err_list = np.array(err_list)

    # Threshold
    threshold = np.amax(err_list) * 0.97
    print("Max Reconstruction Error :", reconstruction_error.max())
    print("Threshold Reconstruction Error :", threshold)

    # Filter anomalies based on reconstruction error
    p_filter = Filter.filterDataAutoEncoder(panda_frame=p_train,
                                            reconstruction_error=err_list,
                                            threshold=threshold)

    # Drop features
    del p_filter['Setting3']
    del p_filter['Sensor1']
    del p_filter['Sensor5']
    del p_filter['Sensor10']
    del p_filter['Sensor16']
    del p_filter['Sensor18']
    del p_filter['Sensor19']

    h_filter = h2o.H2OFrame(p_filter)
    h_filter.set_names(list(p_filter.columns))

    h_test = h2o.H2OFrame(p_test)
    h_test.set_names(list(p_test.columns))

    training_columns = list(p_filter.columns)
    training_columns.remove('UnitNumber')
    training_columns.remove('Time')
    training_columns.remove('RUL')
    training_columns.remove('BIN')

    h_filter['BIN'] = h_filter['BIN'].asfactor()
    h_test['BIN'] = h_test['BIN'].asfactor()

    # nfolds is an estimator parameter, not a train() argument
    model = H2ODeepLearningEstimator(variable_importances=True, nfolds=10)
    model.train(x=training_columns, y='BIN', training_frame=h_filter)

    predict = model.predict(test_data=h_test)
    predict = DataFrameParser.h2oToList(predict['predict'])
    actual = DataFrameParser.h2oToList(h_test['BIN'])

    Measures.confusion_matrix(actual, predict)
    print(predict)
    print(actual)
data = h2o.H2OFrame(bands_list)

# 70/15/15 split
splits = data.split_frame(ratios=[0.7, 0.15], seed=1)
train = splits[0]
valid = splits[1]
test = splits[2]

nfolds = 10
fold_assignment = 'Random'

model = H2ODeepLearningEstimator(distribution='Gaussian',
                                 standardize=True,
                                 activation='Rectifier',
                                 hidden=[200, 200, 200],
                                 l1=1e-5,
                                 l2=1e-5,
                                 epochs=10,
                                 nfolds=nfolds,
                                 fold_assignment=fold_assignment,
                                 keep_cross_validation_predictions=True)
model.train(y="THERMAL",
            x=['BLUE', 'GREEN', 'RED', 'SEQGREEN', 'SEQRED', 'SEQREDEDGE', 'NIR',
               'GNDVI', 'NVDI', 'RENVDI', 'NDSM', 'SLOPE', 'TPI', 'ROUGHNESS'],
            training_frame=train,
            validation_frame=test)  # note: the `valid` split above is left unused

metrics = model.model_performance()
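# Because the model above keeps its cross-validation predictions
# (keep_cross_validation_predictions=True with nfolds=10), it could serve as a
# base learner for stacking, as the ensemble snippets elsewhere in this
# collection do. Minimal sketch under the assumption of a second hypothetical
# base model `model2` trained on the same frame with identical nfolds and
# fold assignment:
from h2o.estimators import H2OStackedEnsembleEstimator

predictors = ['BLUE', 'GREEN', 'RED', 'SEQGREEN', 'SEQRED', 'SEQREDEDGE', 'NIR',
              'GNDVI', 'NVDI', 'RENVDI', 'NDSM', 'SLOPE', 'TPI', 'ROUGHNESS']
ensemble = H2OStackedEnsembleEstimator(base_models=[model, model2])
ensemble.train(y="THERMAL", x=predictors, training_frame=train)
print(ensemble.model_performance(test_data=test))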
training_columns.remove('UnitNumber')
training_columns.remove('RUL')
training_columns.remove('Time')

# filter_train = Process.filterData(panda_frame=train, columns=sustain, removal_method='iqr', threshold=4)
filter_train = train

feature_engineered_train = ProcessData.trainDataToFrame(training_frame=filter_train,
                                                        moving_k_closest_average=True,
                                                        standard_deviation=True)
feature_engineered_test = ProcessData.trainDataToFrame(training_frame=test,
                                                       moving_k_closest_average=True,
                                                       standard_deviation=True,
                                                       rul=True)

h_train = h2o.H2OFrame(feature_engineered_train)
h_train.set_names(list(feature_engineered_train.columns))

h_test = h2o.H2OFrame(feature_engineered_test)
h_test.set_names(list(feature_engineered_test.columns))

model = H2ODeepLearningEstimator(epochs=100, hidden=[200, 200], score_each_iteration=True)
model.train(x=training_columns, y='RUL', training_frame=h_train)

print(model.model_performance(test_data=h_test))
import h2o
from h2o.estimators import H2ODeepLearningEstimator

h2o.init()

train = h2o.import_file('dataset/train.csv')
test = h2o.import_file('dataset/test.csv')

# define columns
response_column = 'SalePrice'
training_columns = ['SaleType', 'Condition1', 'LandContour', 'Condition2', 'RoofMatl',
                    'BsmtExposure', 'ExterQual', 'Neighborhood', 'SaleCondition', 'LotConfig',
                    'OverallQual', 'LotShape', 'PoolQC', 'Heating', 'Functional', 'Street',
                    'OverallCond', 'RoofStyle', 'GrLivArea', 'CentralAir']

# train[training_columns].describe()
# OverallQual, OverallCond, GrLivArea
model = H2ODeepLearningEstimator(nfolds=10, epochs=100)
model.train(x=training_columns, y=response_column, training_frame=train)

h2o.export_file(frame=model.predict(test_data=test), path='prediction.csv', force=True)
print(model.model_performance())
pTest = ProcessData.testData(moving_k_closest_average=True, standard_deviation=True,
                             probability_from_file=True)

# Training model
print("\nTraining Model")
print("----------------------------------------------------------------------------------------------------------------")

training_columns = list(pData.columns)
training_columns.remove(response_column)
training_columns.remove('UnitNumber')
training_columns.remove('Time')

# Create h2o frame using filtered pandas frame
hTrain = h2o.H2OFrame(pData)
hTrain.set_names(list(pData.columns))

hTest = h2o.H2OFrame(pTest)
hTest.set_names(list(pTest.columns))

model = H2ODeepLearningEstimator(hidden=[64, 64, 64], score_each_iteration=True,
                                 variable_importances=True, epochs=100, activation='Tanh')
model.train(x=training_columns, y=response_column, training_frame=hTrain)

print("\nModel Performance")
print("----------------------------------------------------------------------------------------------------------------")

# Evaluate model
print(model.model_performance(test_data=hTest))
# Initialize server
h2o.init()

# Load training data set from csv
train = h2o.import_file('dataset/train.csv')

# define columns
training_columns = ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
                    'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
                    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
                    'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
                    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
                    'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
                    'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2',
                    'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir',
                    'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
                    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
                    'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
                    'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
                    'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF',
                    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
                    'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold',
                    'SaleType', 'SaleCondition']
response_column = 'SalePrice'

model = H2ODeepLearningEstimator(nfolds=10, variable_importances=True)
for i in range(10):
    # Each train() call refits the model from scratch; the loop records how the
    # variable importances vary across ten independent runs.
    model.train(x=training_columns, y=response_column, training_frame=train)
    model.varimp(use_pandas=True).to_csv('varimp' + str(i) + '.csv')
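# Hedged follow-up (not in the original): average the ten exported importance
# tables to get a stabler ranking across the independent runs. Column names
# follow the frame returned by varimp(use_pandas=True).
import pandas as pd

runs = [pd.read_csv('varimp' + str(i) + '.csv') for i in range(10)]
mean_imp = (pd.concat(runs)
              .groupby('variable')['scaled_importance']
              .mean()
              .sort_values(ascending=False))
print(mean_imp.head(10))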
testing_frame = ProcessData.testData(standard_deviation=True, moving_k_closest_average=True,
                                     probability_from_file=True)

# create h2o frames
train = h2o.H2OFrame(training_frame)
test = h2o.H2OFrame(testing_frame)
train.set_names(list(training_frame.columns))
test.set_names(list(testing_frame.columns))

# Feature selection
training_columns = list(training_frame.columns)
training_columns.remove(response_column)
training_columns.remove("UnitNumber")
training_columns.remove("Time")

# Build model
model1 = H2ODeepLearningEstimator()

# Train model
model1.train(x=training_columns, y=response_column, training_frame=train)
# End : Deep Learning
# ----------------------------------------------------------------------------------------------------------------------

# Begin : Random Forest Regression
# ----------------------------------------------------------------------------------------------------------------------
# MKA, SD, PROB
_model_name_2 = "Random Forest Regression"
print("Model : " + _model_name_2)
print("-------------------------")
                                               probability_distribution=True)
p_featured_test = ProcessData.testDataToFrame(testing_frame=p_test,
                                              selected_column_names=columns,
                                              probability_from_file=True)

h_filter = h2o.H2OFrame(p_featured_train)
h_filter.set_names(list(p_featured_train.columns))

h_test = h2o.H2OFrame(p_featured_test)
h_test.set_names(list(p_featured_test.columns))

training_columns = list(p_featured_train.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')

# nfolds is an estimator parameter, not a train() argument
model = H2ODeepLearningEstimator(variable_importances=True, nfolds=10)
model.train(x=columns, y='RUL', training_frame=h_filter)  # note: training_columns above is left unused

print(model.model_performance(test_data=h_test))

predict = DataFrameParser.h2oToNumpyArray(model.predict(test_data=h_test))
actual = DataFrameParser.h2oToNumpyArray(h_test['RUL'])

# var_imp = model.varimp()
# for detail in var_imp:
#     print(detail[0])

Chart.residual_histogram(actual, predict)
Chart.residual_vs_estimated(actual, predict)
Chart.acutal_and_predict(actual, predict)
hData = h2o.H2OFrame(data_frame)
hData.set_names(list(data_frame.columns))

hTesting = h2o.H2OFrame(testing_frame)
hTesting.set_names(list(testing_frame.columns))

# Split data into training and validation
hTrain, hValidate = hData.split_frame(ratios=[0.8])
h2o.export_file(hTrain, "hTrainMy.csv", force=True)
h2o.export_file(hValidate, "hValidateMy.csv", force=True)

training_columns = list(pData.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')

response_column = 'RUL'

print("OK")
model = H2ODeepLearningEstimator(hidden=[500, 500], score_each_iteration=True,
                                 variable_importances=True, epochs=100)
# model = H2ORandomForestEstimator(ntrees=50, max_depth=20, nbins=100, seed=12345)
model.train(x=training_columns, y=response_column,
            training_frame=hTrain, validation_frame=hValidate)

print(model.model_performance(test_data=hTesting))
# Set factors
insurance["offset"] = insurance["Holders"].log()
insurance["Group"] = insurance["Group"].asfactor()
insurance["Age"] = insurance["Age"].asfactor()
insurance["District"] = insurance["District"].asfactor()

# Train model
model = H2ODeepLearningEstimator(
    distribution="tweedie",
    hidden=[1],
    epochs=1000,
    train_samples_per_iteration=-1,
    reproducible=True,
    activation="Tanh",
    single_node_mode=False,
    balance_classes=False,
    force_load_balance=False,
    seed=23123,
    tweedie_power=1.5,
    score_training_samples=0,
    score_validation_samples=0,
    stopping_rounds=0,
)
model.train(x=list(range(3)), y="Claims", training_frame=insurance)

# Predict
new_data = {"District": [1], "Group": "1-1.5l", "Age": ">35", "Holders": [3582]}  # renamed from `input`, which shadows the builtin
df = pd.DataFrame(new_data)
hf = h2o.H2OFrame(df)
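# The snippet builds the scoring frame but stops before using it; the obvious
# next step (sketched here, not part of the original) is to run the frame
# through the trained model.
predictions = model.predict(hf)
predictions.show()
print(predictions.as_data_frame())  # pull the result back into pandas if needed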
# Load train and test data as H2O frames
train = h2o.import_file('processed-data/A1Benchmark_train.csv')
test = h2o.import_file('processed-data/A1Benchmark_test.csv')

# Define input and response columns
response_column = 'is_anomaly'
input_columns = train.col_names
input_columns.remove(response_column)
input_columns.remove('timestamp')

print('Input columns :', input_columns)
print('Response column :', response_column)

# Explicitly mark the response column as containing label data
train[response_column] = train[response_column].asfactor()
test[response_column] = test[response_column].asfactor()

# Define model and train model
model = H2ODeepLearningEstimator(hidden=[20, 20], nfolds=10, epochs=100)
model.train(x=input_columns, y=response_column, training_frame=train)

# Test model
performance = model.model_performance(test_data=test)
print(performance)

'''
Sample Result
-------------
'''
def bake(self) -> H2ODeepLearningEstimator:
    fr = eyestate_frame()
    model = H2ODeepLearningEstimator(epochs=100, reproducible=True)
    model.train(y="eyeDetection", training_frame=fr)
    return model