def test_custom_metric_reload():
    """Verify that re-uploading a metric under the same name takes effect.

    A GBM regression model is built with ``CustomNullFunc`` registered as
    ``custom_mm`` and checked against the expected value 0.  The name
    ``custom_mm`` is then re-registered with ``CustomOneFunc`` and a second
    model is checked against the expected value 1.
    """
    null_metric_ref = h2o.upload_custom_metric(CustomNullFunc, func_name="custom_mm")
    model1, f_test1 = regression_model(H2OGradientBoostingEstimator, null_metric_ref)
    assert_all_metrics_equal(model1, f_test1, "custom_mm", 0)

    # Same func_name, different implementation: the second model must pick up
    # the redefined metric rather than the one uploaded first.
    one_metric_ref = h2o.upload_custom_metric(CustomOneFunc, func_name="custom_mm")
    model2, f_test2 = regression_model(H2OGradientBoostingEstimator, one_metric_ref)
    assert_all_metrics_equal(model2, f_test2, "custom_mm", 1)
def test_custom_metric(self):
    """Train a weighted GBM on the loan data and check its custom metric.

    The loan CSV is imported, ``bad_loan`` is converted to a factor and used
    as the response, ``int_rate`` is excluded from the predictors, and
    ``loan_amnt`` is copied into a ``weight`` column used as the row weights.
    The model's performance object must report the uploaded metric's name
    and the expected metric value.
    """
    from custom_metric_class import WeightedFalseNegativeLossMetric
    from h2o.estimators import H2OGradientBoostingEstimator

    train_path = "file://" + unit_test_utils.locate("smalldata/loan.csv")
    train = h2o.import_file(train_path, destination_frame="loan_train")
    train["bad_loan"] = train["bad_loan"].asfactor()

    y = "bad_loan"
    # Predictors are captured before the weight column is added, so "weight"
    # is never part of x.
    x = train.col_names
    x.remove(y)
    x.remove("int_rate")

    train["weight"] = train["loan_amnt"]

    weighted_false_negative_loss_func = h2o.upload_custom_metric(
        WeightedFalseNegativeLossMetric,
        func_name="WeightedFalseNegativeLoss",
        func_file="weighted_false_negative_loss.py")

    gbm = H2OGradientBoostingEstimator(
        model_id="gbm.hex",
        custom_metric_func=weighted_false_negative_loss_func)
    gbm.train(y=y, x=x, training_frame=train, weights_column="weight")

    perf = gbm.model_performance()
    # assertEquals is a deprecated alias that was removed in Python 3.12;
    # assertEqual is the supported spelling with identical semantics.
    self.assertEqual(perf.custom_metric_name(), "WeightedFalseNegativeLoss")
    self.assertEqual(perf.custom_metric_value(), 0.24579011595430142)
def test_custom_metric_from_str():
    """Verify that a custom metric can be uploaded from ``CustomOneFuncStr``.

    ``CustomOneFuncStr`` is uploaded with ``class_name="CustomOneFunc"`` and
    registered as ``custom_mm``; a GBM regression model built with it is then
    checked against the expected value 1.
    """
    metric_ref = h2o.upload_custom_metric(
        CustomOneFuncStr,
        class_name="CustomOneFunc",
        func_name="custom_mm",
    )
    model2, f_test2 = regression_model(H2OGradientBoostingEstimator, metric_ref)
    assert_all_metrics_equal(model2, f_test2, "custom_mm", 1)
def testCustomMetric(loanDatasetPath):
    """Train a weighted GBM on the loan dataset and check its custom metric.

    Args:
        loanDatasetPath: location of the loan dataset, passed straight to
            ``h2o.import_file``.
    """
    response = "bad_loan"

    frame = h2o.import_file(loanDatasetPath, destination_frame="loan_train")
    frame[response] = frame[response].asfactor()

    # Take the predictor list before the weight column exists, then drop the
    # response and the excluded "int_rate" column from it.
    predictors = frame.col_names
    predictors.remove(response)
    predictors.remove("int_rate")

    frame["weight"] = frame["loan_amnt"]

    metric_ref = h2o.upload_custom_metric(
        WeightedFalseNegativeLossMetric,
        func_name="WeightedFalseNegativeLoss",
        func_file="weighted_false_negative_loss.py",
    )

    model = H2OGradientBoostingEstimator(model_id="gbm.hex", custom_metric_func=metric_ref)
    model.train(y=response, x=predictors, training_frame=frame, weights_column="weight")

    performance = model.model_performance()
    assert performance.custom_metric_name() == "WeightedFalseNegativeLoss"
    assert performance.custom_metric_value() == 0.24579011595430142
# Script: upload a custom "MAPE" metric to H2O, min-max scale the target of
# the train/validation CSVs, and configure a GBM whose early stopping uses
# that custom metric.  No train() call is visible in this span.
import pandas as pd
import sys

# The project's src directory must be on sys.path BEFORE the dataModeling
# import below, which is why the imports are not grouped at the top.
# NOTE(review): this is a machine-specific absolute path — confirm intended.
sys.path.append("/home/jeremy/Documents/rinseOverRun/src")
from dataModeling.mape import MapeMetric  # noqa
import h2o  # noqa
from h2o.estimators import H2OGradientBoostingEstimator, H2ORandomForestEstimator  # noqa
import matplotlib.pyplot as plt  # noqa
import numpy as np  # noqa
from sklearn.preprocessing import MinMaxScaler  # noqa

# Connect to H2O on a fixed port.
# NOTE(review): nthreads=-1 presumably means "use all available cores" —
# confirm against the h2o.init documentation.
h2o.init(port=42222, nthreads=-1)

# Register MapeMetric with H2O under the name "MAPE"; the returned reference
# is passed to the estimator below.
mape_func = h2o.upload_custom_metric(MapeMetric, func_name="MAPE", func_file="mape.py")

# The first CSV column is used as the DataFrame index.
train = pd.read_csv("data/processed/train.csv", index_col=0)
valid = pd.read_csv("data/processed/valid.csv", index_col=0)

# Fit the scaler on the training target only, then apply the same fitted
# transform to the validation target.
# NOTE(review): min-max scaling maps the smallest training target to the
# bottom of the range; if MapeMetric divides by the actual value, confirm it
# copes with targets at or near zero.
scaler = MinMaxScaler()
target = 'final_rinse_total_turbidity_liter'
train[[target]] = scaler.fit_transform(train[[target]])
valid[[target]] = scaler.transform(valid[[target]])

# Convert the pandas frames to H2O frames.
hf, vf = h2o.H2OFrame(train), h2o.H2OFrame(valid)

# GBM configured with stopping_metric="custom" together with
# custom_metric_func, so the stopping settings refer to the uploaded metric.
gbm = H2OGradientBoostingEstimator(model_id="Ayaya_gbm",
                                   seed=1337,
                                   ntrees=300,
                                   min_split_improvement=1e-4,
                                   learn_rate=1e-3,
                                   stopping_metric="custom",
                                   stopping_rounds=10,
                                   stopping_tolerance=0.001,
                                   custom_metric_func=mape_func)
def custom_logloss_mm():
    """Upload ``CustomLoglossFunc`` to H2O as ``logloss``.

    Returns:
        The value returned by ``h2o.upload_custom_metric`` for the upload.
    """
    metric_ref = h2o.upload_custom_metric(
        CustomLoglossFunc,
        func_name="logloss",
        func_file="mm_logloss.py",
    )
    return metric_ref
def custom_rmse_mm():
    """Upload ``CustomRmseFunc`` to H2O as ``rmse``.

    Returns:
        The value returned by ``h2o.upload_custom_metric`` for the upload.
    """
    metric_ref = h2o.upload_custom_metric(
        CustomRmseFunc,
        func_name="rmse",
        func_file="mm_rmse.py",
    )
    return metric_ref
def custom_mae_mm():
    """Upload ``CustomMaeFunc`` to H2O as ``mae``.

    Returns:
        The value returned by ``h2o.upload_custom_metric`` for the upload.
    """
    metric_ref = h2o.upload_custom_metric(
        CustomMaeFunc,
        func_name="mae",
        func_file="mm_mae.py",
    )
    return metric_ref
def train_gradientboosting(self, train: h2o.H2OFrame, x: List[str], y: str, weight: str,
                           cost_matrix_loss_metric: bool) -> H2OGenericEstimator:
    """
    Use a H2O gradient boosting base model and a grid search to build a model.

    Args:
        train (h2o dataframe): training data containing columns x, y, and weight
        x (list of str): column names of model features
        y (str): column name of ground truth
        weight (str): column name of row weights
        cost_matrix_loss_metric (bool): indicates if a custom loss function should be used in model selection

    Return
        H2OGenericEstimator: best model out of the training grid
    """

    def sort_models(grid: H2OGridSearch) -> List[list]:
        """
        Rank the models in the grid by their cross-validated custom_metric_value, i.e. the score reported
        by the custom metric set at model declaration.

        Args:
            grid (H2OGridSearch): a grid search object containing models with the custom metric

        Returns:
            List of [custom_metric_value, model_id] pairs in ascending order of custom_metric_value, so the
            first entry is the model with the lowest value. Models whose metric cannot be read are omitted.
        """
        functioning_list_of_models = []
        for model_name in grid.model_ids:
            try:
                result = [h2o.get_model(model_name).model_performance(xval=True).custom_metric_value(),
                          model_name]
            except AttributeError:
                # Some models fail because they don't have a custom_metric_value, it's unclear why at this time.
                # Report the model that failed (not the feature list) and leave it out of the ranking.
                print(f"Error with {model_name}")
                continue
            functioning_list_of_models.append(result)
        return sorted(functioning_list_of_models)

    def grid_train(base_model: H2OGradientBoostingEstimator, search_time: int) -> H2OGridSearch:
        """
        Given base model train a search grid to find the optimum hyper parameters

        Args:
            base_model (H2OGradientBoostingEstimator): model that should be used in hyper parameter search
            search_time (int): max time in seconds that h2o should spend searching for a model in the grid

        Return:
            H2OGridSearch : trained grid
        """
        gbm_hyper_parameters = {'learn_rate': [0.01, 0.1],
                                'max_depth': [3, 5, 9],
                                'sample_rate': [0.8, 1.0],
                                'col_sample_rate': [0.2, 0.5, 1.0]}
        logging.info(f"Searching Hyper Parameter Space:\n {gbm_hyper_parameters}")
        grid = H2OGridSearch(base_model,
                             gbm_hyper_parameters,
                             search_criteria={'strategy': "RandomDiscrete", 'max_runtime_secs': search_time})
        grid.train(x=x, y=y, training_frame=train, weights_column=weight, grid_id="gbm_grid")
        return grid

    def get_cost_matrix_loss_metric_class() -> object:
        """
        This function modifies the text in the file utils_model_metrics to include the cost dictionary in this
        instance before importing the file. The strategy is messy and I don't believe it is the correct way to
        do this, but it is the only way I could find to complete the tasks inside the allotted time today.

        Returns the class CostMatrixLossMetric with cost dictionary overwritten
        """
        file_path = os.path.join(self.dir_path, 'utils_model_metrics.py')
        with open(file_path, 'r') as file:
            file_data = file.read()

        # All four costs share one number pattern. The previous pattern accepted only a single digit after the
        # optional decimal point for cost_tp / cost_fp, so once a value such as 1.25 had been written to the
        # file the next substitution silently stopped matching and stale costs were used. The optional
        # exponent covers floats that str() renders in scientific notation.
        number = r"-?\d*\.?\d*(?:[eE][-+]?\d+)?"
        target = (r"\{'cost_tp': " + number + r", 'cost_fp': " + number +
                  r", 'cost_tn': " + number + r", 'cost_fn': " + number + r"\}")
        replacement = str(self.inverse_costs)
        # A callable replacement inserts the text literally instead of treating it as a regex template.
        file_data, substitutions = re.subn(target, lambda _match: replacement, file_data)
        if not substitutions:
            logging.warning(f"No cost dictionary found in {file_path}; costs in the file were not updated")

        with open(file_path, 'w') as file:
            file.write(file_data)
        print("file written")

        # NOTE(review): if utils_model_metrics was already imported in this process, this import may return
        # the cached module rather than the freshly written file — confirm whether a reload is needed.
        from .utils_model_metrics import CostMatrixLossMetric
        return CostMatrixLossMetric

    if cost_matrix_loss_metric:
        # If cost_matrix_loss_metric upload it to cluster and include it in base model
        cost_matrix_loss_metric_func = h2o.upload_custom_metric(get_cost_matrix_loss_metric_class(),
                                                                func_name="CostMatrixLossMetric",
                                                                func_file="cost_matrix_loss_metric.py")
        base_model = H2OGradientBoostingEstimator(custom_metric_func=cost_matrix_loss_metric_func, nfolds=3)
        gbm_grid = grid_train(base_model, self.search_time)
        # Custom metrics are not available in .get_grid so we must use our own function to select the
        # best model: the first entry of the ascending ranking, i.e. the lowest custom metric value.
        best_model = h2o.get_model(sort_models(gbm_grid)[0][1])
    else:
        base_model = H2OGradientBoostingEstimator(nfolds=3)
        gbm_grid = grid_train(base_model, self.search_time)
        best_model = gbm_grid.get_grid(sort_by='auc', decreasing=True).models[0]
    return best_model