Esempio n. 1
0
def expr_math_ops(ip, port):
    """Compare H2OFrame element-wise math operations with NumPy/math/SciPy.

    Four random 10x10 data sets are generated, mirrored as H2OFrames and as
    NumPy arrays, and put through the same arithmetic on both sides.  Each
    H2OFrame math method is then handed, together with the matching NumPy
    result, to ``h2o.np_comparison_check`` (``10`` is forwarded unchanged as
    its third argument).  The gamma-family functions are compared on the
    single cell ``[5, 5]`` with a relative tolerance of ``1e-6``.

    :param ip: first positional argument forwarded to ``h2o.init``.
    :param port: second positional argument forwarded to ``h2o.init``.
    :raises AssertionError: when a gamma-family value differs from its
        reference by more than the relative tolerance.
    """
    # Connect to h2o
    h2o.init(ip, port)

    def random_grid(low, high):
        # Ten rows of ten uniform draws between low and high.
        return [[random.uniform(low, high) for _ in range(10)]
                for _ in range(10)]

    # The trailing comments list the functions each data set feeds below.
    wide_values = random_grid(-10, 10)  # sin cos tan atan sinh cosh tanh asinh
    unit_values = random_grid(-1, 1)  # asin acos atanh (and the *pi variants)
    above_one_values = random_grid(1, 10)  # acosh, sqrt, logs, exp, gamma...
    negative_values = random_grid(-100000, 0)  # abs

    datasets = (wide_values, unit_values, above_one_values, negative_values)
    raw_frames = [h2o.H2OFrame(python_obj=values) for values in datasets]
    raw_arrays = [np.array(values) for values in datasets]

    # Same arithmetic on both sides: all H2O copies first, then NumPy copies.
    frame_wide = raw_frames[0] + 2
    frame_unit = raw_frames[1] / 1.01
    frame_pos = raw_frames[2] * 1.5
    frame_neg = raw_frames[3] - 1.5

    array_wide = raw_arrays[0] + 2
    array_unit = raw_arrays[1] / 1.01
    array_pos = raw_arrays[2] * 1.5
    array_neg = raw_arrays[3] - 1.5

    def compare(frame, h2o_op, np_func, values):
        # The H2O side is evaluated before the NumPy side.
        h2o.np_comparison_check(getattr(frame, h2o_op)(), np_func(values), 10)

    for frame, h2o_op, np_func, values in (
            (frame_wide, "cos", np.cos, array_wide),
            (frame_wide, "sin", np.sin, array_wide),
            (frame_wide, "tan", np.tan, array_wide),
            (frame_unit, "acos", np.arccos, array_unit),
            (frame_unit, "asin", np.arcsin, array_unit),
            (frame_wide, "atan", np.arctan, array_wide),
            (frame_wide, "cosh", np.cosh, array_wide),
            (frame_wide, "sinh", np.sinh, array_wide),
            (frame_wide, "tanh", np.tanh, array_wide),
            (frame_pos, "acosh", np.arccosh, array_pos),
            (frame_wide, "asinh", np.arcsinh, array_wide),
            (frame_unit, "atanh", np.arctanh, array_unit),
    ):
        compare(frame, h2o_op, np_func, values)

    # The *pi variants receive x / pi so they can be compared with the plain
    # trigonometric function of x; a fresh scaled frame is built per check.
    for h2o_op, np_func in (("cospi", np.cos),
                            ("sinpi", np.sin),
                            ("tanpi", np.tan)):
        compare(frame_unit / math.pi, h2o_op, np_func, array_unit)

    for frame, h2o_op, np_func, values in (
            (frame_neg, "abs", np.fabs, array_neg),
            (frame_unit, "sign", np.sign, array_unit),
            (frame_pos, "sqrt", np.sqrt, array_pos),
            (frame_pos, "trunc", np.trunc, array_pos),
            (frame_pos, "ceil", np.ceil, array_pos),
            (frame_pos, "floor", np.floor, array_pos),
            (frame_pos, "log", np.log, array_pos),
            (frame_pos, "log10", np.log10, array_pos),
            (frame_pos, "log1p", np.log1p, array_pos),
            (frame_pos, "log2", np.log2, array_pos),
            (frame_pos, "exp", np.exp, array_pos),
            (frame_pos, "expm1", np.expm1, array_pos),
    ):
        compare(frame, h2o_op, np_func, values)

    # Gamma family: (H2OFrame method, reference implementation, failure text).
    gamma_family = (
        ("gamma",
         lambda cell: math.gamma(cell),
         "check unsuccessful! h2o computed {0} and math computed {1}. expected equal gamma values between h2o and math"),
        ("lgamma",
         lambda cell: math.lgamma(cell),
         "check unsuccessful! h2o computed {0} and math computed {1}. expected equal lgamma values between h2o and math"),
        ("digamma",
         lambda cell: scipy.special.polygamma(0, cell),
         "check unsuccessful! h2o computed {0} and math computed {1}. expected equal digamma values between h2o and math"),
        ("trigamma",
         lambda cell: float(scipy.special.polygamma(1, cell)),
         "check unsuccessful! h2o computed {0} and math computed {1}. expected equal trigamma values between h2o and math"),
    )
    for h2o_op, reference, message in gamma_family:
        h2o_val = getattr(frame_pos, h2o_op)()[5, 5]
        num_val = reference(frame_pos[5, 5])
        assert abs(h2o_val - num_val) < max(abs(h2o_val), abs(num_val)) * 1e-6, \
            message.format(h2o_val, num_val)
# Load data sets
# NOTE(review): the training features are read from 'na_filled.csv' in the
# working directory, while the other three files come from 'dataset/'.
pd_train = pd.read_csv('na_filled.csv')
pd_labels = pd.read_csv('dataset/dengue_labels_train.csv')
pd_test = pd.read_csv('dataset/dengue_features_test.csv')
pd_submit = pd.read_csv('dataset/submission_format.csv')

# Identifying columns
response_column = 'total_cases'
# Every column of the test features file is used as a predictor.
training_columns = list(pd_test.columns)

# Merging labels with training data
# Assumes pd_train and pd_labels list the same rows in the same order --
# TODO confirm against how 'na_filled.csv' was produced.
pd_train[response_column] = pd_labels[response_column]

# Create h2o frames
hd_train = h2o.H2OFrame(pd_train)
hd_train.set_names(list(pd_train.columns))
hd_test = h2o.H2OFrame(pd_test)
hd_test.set_names(list(pd_test.columns))

# Export both frames to CSV in the working directory; force=True is passed so
# an existing file does not block the export.
h2o.export_file(frame=hd_train, path='h2o_train.csv', force=True)
h2o.export_file(frame=hd_test, path='h2o_test.csv', force=True)

# Defining machine learning model
# Alternative that was tried (kept for reference):
# model = H2ODeepLearningEstimator(epochs=100, hidden=[128, 128, 128], nfolds=10)
# NOTE(review): binomial_double_trees is documented by H2O as a binary
# classification option -- confirm it is intended for 'total_cases'.
model = H2ORandomForestEstimator(ntrees=100,
                                 max_depth=20,
                                 binomial_double_trees=True)

# Train model
model.train(x=training_columns, y=response_column, training_frame=hd_train)
Esempio n. 3
0
# Report how many rows were flagged as anomalies.
print(len(anomaly_series))

# Remove anomalies
# anomaly_series is used as positional indices into pData.index.
df = pData.drop(pData.index[anomaly_series])

# Feature engineering
# The same three feature groups are enabled for training and testing data.
data_frame = ProcessData.trainDataToFrame(df,
                                          moving_k_closest_average=True,
                                          standard_deviation=True,
                                          moving_median=True)
testing_frame = ProcessData.testData(moving_k_closest_average=True,
                                     standard_deviation=True,
                                     moving_median=True)

# Create h2o frame
hData = h2o.H2OFrame(data_frame)
hData.set_names(list(data_frame.columns))

hTesting = h2o.H2OFrame(testing_frame)
hTesting.set_names(list(testing_frame.columns))

# Split data into training and validation
hTrain, hValidate = hData.split_frame(ratios=[0.8])
h2o.export_file(hTrain, "hTrainMy.csv", force=True)
h2o.export_file(hValidate, "hValidateMy.csv", force=True)

# Predictor columns: everything except 'UnitNumber', 'Time' and 'RUL'.
# NOTE(review): the list is taken from pData (the raw frame), not from the
# engineered data_frame, so any columns added by feature engineering are not
# included -- confirm this is intended.
training_columns = list(pData.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')
    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        """Train an H2O model on ``X``/``y`` and record it via ``set_model_properties``.

        The features, target and optional weights are uploaded as H2OFrames,
        a model is created with ``self.make_instance`` and trained, the saved
        model file is read back as bytes, and the H2O objects and the model
        file are removed again.  Variable importances are then aligned with
        the training columns (features missing from H2O's table get 0).

        :param X: feature table; converted with ``dt.Frame(X)``.
        :param y: target values for the rows of ``X``.
        :param sample_weight: optional per-row training weights.
        :param eval_set: optional validation data; only ``eval_set[0]`` is
            used, as a ``(features, target)`` pair.
        :param sample_weight_eval_set: optional validation weights; only the
            first element is used.  When training weights are given without
            validation weights, the validation weights default to ones.
        :param kwargs: accepted for interface compatibility; not used here.
        :raises KeyError: if ``self.params`` lacks ``'max_runtime_secs'`` for
            a non-AutoML model.
        """
        X = dt.Frame(X)

        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = None

        if isinstance(self, H2ONBModel):
            # NB can only handle weights of 0 / 1
            if sample_weight is not None:
                sample_weight = (sample_weight != 0).astype(int)
            if sample_weight_eval_set is not None:
                sample_weight_eval_set = [(sample_weight_eval_set[0] != 0).astype(int)]

        train_X = h2o.H2OFrame(X.to_pandas())
        # Remember the inferred column types so validation data is parsed the same way.
        self.col_types = train_X.types
        train_y = h2o.H2OFrame(y,
                               column_names=[self.target],
                               column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
        train_frame = train_X.cbind(train_y)
        if sample_weight is not None:
            train_w = h2o.H2OFrame(sample_weight,
                                   column_names=[self.weight],
                                   column_types=['numeric'])
            train_frame = train_frame.cbind(train_w)
        # Pre-initialised so the cleanup in ``finally`` can always inspect them.
        valid_frame = None
        valid_X = None
        valid_y = None
        model = None
        if eval_set is not None:
            valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
            valid_y = h2o.H2OFrame(eval_set[0][1],
                                   column_names=[self.target],
                                   column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
            valid_frame = valid_X.cbind(valid_y)
            if sample_weight is not None:
                if sample_weight_eval_set is None:
                    sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
                valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                       column_names=[self.weight],
                                       column_types=['numeric'])
                valid_frame = valid_frame.cbind(valid_w)

        try:
            train_kwargs = dict()
            # Work on a copy so popping a key does not alter self.params.
            params = copy.deepcopy(self.params)
            if not isinstance(self, H2OAutoMLModel):
                # AutoML needs max_runtime_secs in initializer, all others in train() method
                max_runtime_secs = params.pop('max_runtime_secs')
                train_kwargs = dict(max_runtime_secs=max_runtime_secs)
            if valid_frame is not None:
                train_kwargs['validation_frame'] = valid_frame
            if sample_weight is not None:
                train_kwargs['weights_column'] = self.weight
            model = self.make_instance(**params)

            # Don't ever use the offset column as a feature
            offset_col = None  # if no column is called offset we will pass "None" and not use this feature
            cols_to_train = []  # list of all non-offset columns

            # The match is case-insensitive; if several columns match, the last one wins.
            for col in list(train_X.names):
                if col.lower() != "offset":
                    cols_to_train.append(col)
                else:
                    offset_col = col

            orig_cols = cols_to_train  # not training on offset

            # Models that can use an offset column
            # NOTE(review): ``model`` is whatever make_instance() returned; confirm
            # it can really be an instance of these classes (vs. checking ``self``).
            if isinstance(model, (H2OGBMModel, H2ODLModel, H2OGLMModel)):
                model.train(x=cols_to_train, y=self.target, training_frame=train_frame, offset_column=offset_col,
                            **train_kwargs)
            else:
                # NOTE(review): this branch trains on every column of train_X,
                # including a column named "offset" if one exists.
                model.train(x=train_X.names, y=self.target, training_frame=train_frame, **train_kwargs)

            if isinstance(model, H2OAutoML):
                # Keep only the best model of the AutoML run.
                model = model.leader
            self.id = model.model_id
            model_path = os.path.join(user_dir(), "h2o_model." + str(uuid.uuid4()))
            # save_model returns the path actually written, which is what gets read back.
            model_path = h2o.save_model(model=model, path=model_path)
            with open(model_path, "rb") as f:
                raw_model_bytes = f.read()

        finally:
            # Always delete the saved model file and the H2O objects created above.
            if model_path is not None:
                remove(model_path)
            for xx in [train_frame, train_X, train_y, model, valid_frame, valid_X, valid_y]:
                if xx is not None:
                    if isinstance(xx, H2OAutoML):
                        h2o.remove(xx.project_name)
                    else:
                        h2o.remove(xx)

        df_varimp = model.varimp(True)
        if df_varimp is None:
            # No importances available: treat every feature as equally important.
            varimp = np.ones(len(orig_cols))
        else:
            df_varimp.index = df_varimp['variable']
            df_varimp = df_varimp.iloc[:, 1]  # relative importance
            for missing in [x for x in orig_cols if x not in list(df_varimp.index)]:
                # h2o3 doesn't handle raw strings all the time, can hit:
                # KeyError: "None of [Index(['0_Str:secret_ChangeTemp'], dtype='object', name='variable')] are in the [index]"
                df_varimp[missing] = 0
            varimp = df_varimp[orig_cols].values  # order by fitted features
            varimp = np.nan_to_num(varimp)

        self.set_model_properties(model=raw_model_bytes,
                                  features=orig_cols,
                                  importances=varimp,
                                  iterations=self.get_iterations(model))
Esempio n. 5
0
import h2o
from h2o.estimators import H2ODeepLearningEstimator

from dataprocessor import ProcessData

from h2o.estimators import H2ORandomForestEstimator
from h2o.grid import H2OGridSearch

# Initialize the H2O server connection before any frame is created.
h2o.init()

# Training data with the selected feature groups switched on.
data = ProcessData.trainData(
    moving_k_closest_average=True,
    standard_deviation=True,
    probability_distribution=True,
)

hData = h2o.H2OFrame(data)
hData.set_names(list(data.columns))

# Predictors are all columns except 'RUL', 'UnitNumber' and 'Time';
# list.remove raises ValueError if one of them is absent.
training_columns = list(data.columns)
for excluded in ('RUL', 'UnitNumber', 'Time'):
    training_columns.remove(excluded)

# hyper_parameters = {'ntrees': [10, 50], 'max_depth': [20, 10]}
# grid_search = H2OGridSearch(H2ORandomForestEstimator, hyper_params=hyper_parameters)
# grid_search.train(x=training_columns, y='RUL', training_frame=hData)
# grid_search.show()
# models = grid_search.sort_by("mse")
# print models

hyper_parameters = {
Esempio n. 6
0
    "J": 0,
    "SI1": 3,
    "VS2": 4,
    "SI2": 2,
    "VS1": 5,
    "VVS2": 6,
    "VVS1": 7,
    "IF": 8,
    "I1": 1
}

# Apply the ord_cut replacement mapping to both data sets, modifying them in
# place.
# NOTE(review): ord_cut is defined outside the visible part of this snippet --
# confirm which mapping it refers to.
train.replace(ord_cut, inplace=True)
test.replace(ord_cut, inplace=True)

# Using h2o syntax, we import the datasets as h2o
# (the names train/test are rebound to the resulting H2OFrames)
train = h2o.H2OFrame(train)
test = h2o.H2OFrame(test)

# Set train-test dataframes
# y is the response column; x is every other column of the training frame.
y = "price"
x = train.columns
x.remove(y)

train_final, valid = train.split_frame(
    ratios=[0.8])  # We make the split of training and validation

modelo = H2OGradientBoostingEstimator(
    ntrees=440,
    learn_rate=0.5531490180631663,
    max_depth=10,
    #sample_rate: 0.6117256495829282,
Esempio n. 7
0
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        """Train an H2O model and register it through ``set_model_properties``.

        Targets are passed through ``self.make_y_nonnegative`` before being
        uploaded.  Features, target and optional weights become H2OFrames, the
        model from ``self.make_instance(**self.params)`` is trained on them,
        the saved model file is read back as bytes, and the H2O objects and
        the file are removed afterwards.  Variable importances are reordered
        to match the input columns, with 0 for columns H2O did not report.

        :param X: feature table; converted with ``dt.Frame(X)``.
        :param y: target values for the rows of ``X``.
        :param sample_weight: optional per-row training weights.
        :param eval_set: optional validation data; only ``eval_set[0]`` is
            used, as a ``(features, target)`` pair.
        :param sample_weight_eval_set: optional validation weights (first
            element only); defaults to ones when training weights are given.
        :param kwargs: accepted for interface compatibility; not used here.
        """
        X = dt.Frame(X)
        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = None

        orig_cols = list(X.names)
        y = self.make_y_nonnegative(y)

        # --- training frame: features | target [| weight] ---
        train_X = h2o.H2OFrame(X.to_pandas())
        self.col_types = train_X.types  # reused when parsing validation features
        train_y = h2o.H2OFrame(
            y,
            column_names=[self.target],
            column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
        train_frame = train_X.cbind(train_y)
        use_weights = sample_weight is not None
        if use_weights:
            train_frame = train_frame.cbind(
                h2o.H2OFrame(sample_weight,
                             column_names=[self.weight],
                             column_types=['numeric']))

        # --- optional validation frame, built the same way ---
        valid_frame = valid_X = valid_y = None
        model = None
        if eval_set is not None:
            eval_features, eval_target = eval_set[0][0], eval_set[0][1]
            valid_X = h2o.H2OFrame(eval_features.to_pandas(),
                                   column_types=self.col_types)
            valid_y = self.make_y_nonnegative(eval_target)
            valid_y = h2o.H2OFrame(
                valid_y,
                column_names=[self.target],
                column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
            valid_frame = valid_X.cbind(valid_y)
            if use_weights:
                if sample_weight_eval_set is None:
                    # No validation weights supplied: weight every row equally.
                    sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
                valid_frame = valid_frame.cbind(
                    h2o.H2OFrame(sample_weight_eval_set[0],
                                 column_names=[self.weight],
                                 column_types=['numeric']))

        try:
            # A missing 'max_runtime_secs' is passed on to train() as 0.
            train_kwargs = dict(
                max_runtime_secs=self.params.get('max_runtime_secs', 0))
            if valid_frame is not None:
                train_kwargs['validation_frame'] = valid_frame
            if use_weights:
                train_kwargs['weights_column'] = self.weight

            model = self.make_instance(**self.params)
            model.train(x=train_X.names,
                        y=self.target,
                        training_frame=train_frame,
                        **train_kwargs)
            self.id = model.model_id

            requested_path = os.path.join(user_dir(),
                                          "h2o_model." + str(uuid.uuid4()))
            # save_model returns the path it actually wrote to.
            model_path = h2o.save_model(model=model, path=requested_path)
            with open(model_path, "rb") as model_file:
                raw_model_bytes = model_file.read()
        finally:
            # Clean up the model file and every H2O object that got created.
            if model_path is not None:
                remove(model_path)
            created = [
                train_frame, train_X, train_y, model, valid_frame, valid_X,
                valid_y
            ]
            for h2o_object in (obj for obj in created if obj is not None):
                h2o.remove(h2o_object)

        importance_table = model.varimp(True)
        if importance_table is not None:
            importance_table.index = importance_table['variable']
            relative = importance_table.iloc[:, 1]  # relative importance
            reported = list(relative.index)
            # h2o3 doesn't handle raw strings all the time, can hit:
            # KeyError: "None of [Index(['0_Str:secret_ChangeTemp'], dtype='object', name='variable')] are in the [index]"
            for absent in [name for name in orig_cols if name not in reported]:
                relative[absent] = 0
            # Order by fitted features, then turn any NaN into a number.
            varimp = np.nan_to_num(relative[orig_cols].values)
        else:
            varimp = np.ones(len(orig_cols))

        self.set_model_properties(model=raw_model_bytes,
                                  features=orig_cols,
                                  importances=varimp,
                                  iterations=self.get_iterations(model))
Esempio n. 8
0
# Make the local AFL helper module importable.
# NOTE(review): `sys` is not imported in the visible part of this snippet.
sys.path.insert(1, 'J:\\AFL')
import afl_functions as afl

import pandas as pd
import h2o
import math

# Fetch the fixture and write it out as the file to be scored.
fixture = afl.get_fixtureAFL()
fixture.to_csv(
    'J:\\AFL\\ToScore.csv',
    index=False)  # check to make sure this file is for the round required

from h2o.automl import H2OAutoML

h2o.init()

# Load a previously saved model from disk (path is hard-coded).
model_path = 'J:\\AFL\\ModelGiving422018trainedON20112017\\StackedEnsemble_AllModels_0_AutoML_20181106_085551'
saved_model = h2o.load_model(model_path)

# Data to score: season 2019 only, requested with train_mode=False.
scoring_data = afl.get_data(season_from=2019,
                            season_to=2019,
                            proxy=False,
                            train_mode=False)

hf_score = h2o.H2OFrame(scoring_data)
prediction = saved_model.predict(hf_score)

# Convert the predictions to pandas and write both the predictions and the
# scoring input to CSV.
prediction = prediction.as_data_frame(use_pandas=True)
prediction.to_csv('J:\\AFL\\PredictionRun.csv')
scoring_data.to_csv('J:\\AFL\\temp.csv')
Esempio n. 9
0
def weights_check(ip, port):
    """Check that GBM observation weights match the equivalent data edits.

    Three scenarios are run on ``smalldata/junit/cars_20mpg.csv``:

    * uniform weights versus no weights (``min_rows`` scaled by the weight),
    * 0/1 weights versus dropping the zero-weighted rows,
    * 1/2 weights versus duplicating the double-weighted rows.

    For each scenario a regression, a binomial and a multinomial GBM are
    trained on both variants and their MSE / AUC must agree to a relative
    tolerance of ``1e-6``.

    :param ip: unused; kept for the test-harness signature.
    :param port: unused; kept for the test-harness signature.
    :raises AssertionError: when a pair of metrics differs.
    """
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    weighted_predictors = predictors + ["weights"]

    def check_same(data1, data2, min_rows_scale):
        """Train the three model pairs and assert their metrics agree.

        ``data1`` is the unweighted variant, ``data2`` carries a "weights"
        column, and ``min_rows_scale`` multiplies ``min_rows`` for ``data2``.
        """
        gbm1_regression = h2o.gbm(x=data1[predictors],
                                  y="economy",
                                  training_frame=data1,
                                  min_rows=5,
                                  ntrees=5,
                                  max_depth=5)
        gbm2_regression = h2o.gbm(x=data2[weighted_predictors],
                                  y=data2["economy"],
                                  min_rows=5 * min_rows_scale,
                                  weights_column=data2["weights"],
                                  ntrees=5,
                                  max_depth=5)
        gbm1_binomial = h2o.gbm(x=data1[predictors],
                                y=data1["economy_20mpg"],
                                min_rows=5,
                                distribution="bernoulli",
                                ntrees=5,
                                max_depth=5)
        gbm2_binomial = h2o.gbm(x=data2[weighted_predictors],
                                y=data2["economy_20mpg"],
                                weights_column="weights",
                                training_frame=data2,
                                min_rows=5 * min_rows_scale,
                                distribution="bernoulli",
                                ntrees=5,
                                max_depth=5)
        gbm1_multinomial = h2o.gbm(x=data1[predictors],
                                   y=data1["cylinders"],
                                   min_rows=5,
                                   distribution="multinomial",
                                   ntrees=5,
                                   max_depth=5)
        gbm2_multinomial = h2o.gbm(x=data2[weighted_predictors],
                                   y=data2["cylinders"],
                                   weights_column="weights",
                                   training_frame=data2,
                                   min_rows=5 * min_rows_scale,
                                   distribution="multinomial",
                                   ntrees=5,
                                   max_depth=5)

        reg1_mse = gbm1_regression.mse()
        reg2_mse = gbm2_regression.mse()
        bin1_auc = gbm1_binomial.auc()
        bin2_auc = gbm2_binomial.auc()
        mul1_mse = gbm1_multinomial.mse()
        mul2_mse = gbm2_multinomial.mse()

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("MSE (regresson)   no weights vs. weights: {0}, {1}".format(
            reg1_mse, reg2_mse))
        print("AUC (binomial)    no weights vs. weights: {0}, {1}".format(
            bin1_auc, bin2_auc))
        print("MSE (multinomial) no weights vs. weights: {0}, {1}".format(
            mul1_mse, mul2_mse))

        assert abs(reg1_mse - reg2_mse) < 1e-6 * reg1_mse, \
            "Expected mse's to be the same, but got {0}, and {1}".format(
                reg1_mse, reg2_mse)
        assert abs(bin1_auc - bin2_auc) < 1e-6 * bin1_auc, \
            "Expected auc's to be the same, but got {0}, and {1}".format(
                bin1_auc, bin2_auc)
        # Compare the unweighted model with the weighted one (the previous
        # code subtracted mul1_mse from itself, so this could never fail).
        assert abs(mul1_mse - mul2_mse) < 1e-6 * mul1_mse, \
            "Expected mse's to be the same, but got {0}, and {1}".format(
                mul1_mse, mul2_mse)

    h2o_cars_data = h2o.import_file(
        h2o.locate("smalldata/junit/cars_20mpg.csv"))
    h2o_cars_data["economy_20mpg"] = h2o_cars_data["economy_20mpg"].asfactor()
    h2o_cars_data["cylinders"] = h2o_cars_data["cylinders"].asfactor()

    # uniform weights same as no weights
    random.seed(2222)
    weight = random.randint(1, 10)
    # One weight per row; 406 is the hard-coded row count used throughout.
    uniform_weights = [[weight] for r in range(406)]
    h2o_uniform_weights = h2o.H2OFrame(python_obj=uniform_weights)
    h2o_uniform_weights.setNames(["weights"])
    h2o_data_uniform_weights = h2o_cars_data.cbind(h2o_uniform_weights)

    print("Checking that using uniform weights is equivalent to no weights:")
    print("")
    check_same(h2o_cars_data, h2o_data_uniform_weights, weight)

    # zero weights same as removed observations
    zero_weights = [[0] if random.randint(0, 1) else [1] for r in range(406)]
    h2o_zero_weights = h2o.H2OFrame(python_obj=zero_weights)
    h2o_zero_weights.setNames(["weights"])
    h2o_data_zero_weights = h2o_cars_data.cbind(h2o_zero_weights)
    h2o_data_zeros_removed = h2o_cars_data[h2o_zero_weights["weights"] == 1]

    print("Checking that using some zero weights is equivalent to removing those observations:")
    print("")
    check_same(h2o_data_zeros_removed, h2o_data_zero_weights, 1)

    # doubled weights same as doubled observations
    doubled_weights = [[1] if random.randint(0, 1) else [2]
                       for r in range(406)]
    h2o_doubled_weights = h2o.H2OFrame(python_obj=doubled_weights)
    h2o_doubled_weights.setNames(["weights"])
    h2o_data_doubled_weights = h2o_cars_data.cbind(h2o_doubled_weights)

    doubled_data = h2o.as_list(h2o_cars_data, use_pandas=False)
    # The first row holds the column names; the remaining rows are data.
    colnames = doubled_data.pop(0)
    # Append a second copy of every row whose weight is 2.
    for idx, w in enumerate(doubled_weights):
        if w[0] == 2:
            doubled_data.append(doubled_data[idx])
    h2o_data_doubled = h2o.H2OFrame(python_obj=doubled_data)
    h2o_data_doubled.setNames(colnames)

    h2o_data_doubled["economy_20mpg"] = h2o_data_doubled[
        "economy_20mpg"].asfactor()
    h2o_data_doubled["cylinders"] = h2o_data_doubled["cylinders"].asfactor()
    h2o_data_doubled_weights["economy_20mpg"] = h2o_data_doubled_weights[
        "economy_20mpg"].asfactor()
    h2o_data_doubled_weights["cylinders"] = h2o_data_doubled_weights[
        "cylinders"].asfactor()

    print("Checking that doubling some weights is equivalent to doubling those observations:")
    print("")
    check_same(h2o_data_doubled, h2o_data_doubled_weights, 1)
Esempio n. 10
0
def interactions_GLM_Binomial():
    """Check binomial GLM interaction terms on frames with and without NAs.

    First, two GLMs with three interaction pairs (num-by-num, num-by-enum,
    enum-by-enum) are trained, one on data containing NA entries and one on
    data without them, and their coefficient tables are compared through
    ``assert_arrays_equal_NA``.  Then each interaction type is exercised on
    its own through ``performOneTest``.
    """

    def as_frame(column_values, column_names):
        # Each inner list is one column, hence the transpose.
        return pd.DataFrame(np.array(column_values).T, columns=column_names)

    def train_glm(frame, predictors, pairs):
        # Unregularised-alpha binomial GLM without standardisation.
        glm = H2OGeneralizedLinearEstimator(family="Binomial",
                                            alpha=0,
                                            lambda_search=False,
                                            interaction_pairs=pairs,
                                            standardize=False)
        glm.train(x=predictors, y='label', training_frame=frame)
        return glm

    # test multiple interactions_GLM_Binomial enum by enum, enum by num and num by num all with NA terms
    print("******* Test interaction pairs")
    all_names = [
        'label', 'numerical_feat', 'numerical_feat2', 'categorical_feat',
        'categorical_feat2'
    ]
    pd_df_NA = as_frame(
        [[1, 0, 1, 0, 1, 0],
         [1, 2, 4.2 / 2.2, 4, 3, 1],
         [2, 3, float('NaN'), 1, 2, 3],
         ["a", "a", "a", "b", "a", "b"],
         ['Foo', 'UNKNOWN', 'Foo', 'Foo', 'Foo', 'Bar']],
        all_names)
    # "UNKNOWN" is declared as an NA marker when the frame is uploaded.
    h2o_df_NA = h2o.H2OFrame(pd_df_NA, na_strings=["UNKNOWN"])
    pd_df = as_frame(
        [[1, 0, 1, 0, 1, 0],
         [1, 2, 4.2 / 2.2, 4, 3, 1],
         [2, 3, 2.2, 1, 2, 3],
         ["a", "a", "a", "b", "a", "b"],
         ['Foo', 'Foo', 'Foo', 'Foo', 'Foo', 'Bar']],
        all_names)
    h2o_df = h2o.H2OFrame(pd_df, na_strings=["UNKNOWN"])

    interaction_pairs = [("numerical_feat", "numerical_feat2"),
                         ("numerical_feat", "categorical_feat2"),
                         ("categorical_feat", "categorical_feat2")]
    xcols = [
        'numerical_feat', 'numerical_feat2', 'categorical_feat',
        'categorical_feat2'
    ]

    # Build one model on the frame with NAs, then one on the frame without.
    modelNA = train_glm(h2o_df_NA, xcols, interaction_pairs)
    model = train_glm(h2o_df, xcols, interaction_pairs)
    assert_arrays_equal_NA(
        modelNA._model_json['output']['coefficients_table'].cell_values,
        model._model_json['output']['coefficients_table'].cell_values)

    # test interaction of num and num columns
    print("******* Test interaction with num by num")
    num_num_names = ['label', 'numerical_feat', 'numerical_feat2']
    pd_df_num_num_NA = as_frame(
        [[1, 0, 1, 0], [1, 2, 2, 4], [2, 3, float('NaN'), 1]],
        num_num_names)
    pd_df_num_num = as_frame(
        [[1, 0, 1, 0], [1, 2, 2, 4], [2, 3, 2, 1]],
        num_num_names)
    performOneTest(pd_df_num_num_NA,
                   pd_df_num_num,
                   interactionColumn=['numerical_feat', 'numerical_feat2'],
                   xcols=['numerical_feat', 'numerical_feat2'],
                   standard=False)

    # test interaction of enum and enum columns
    print("******* Test interaction with enum by enum")
    cat_cat_names = ['label', 'categorical_feat', 'categorical_feat2']
    pd_df_cat_cat_NA = as_frame(
        [[1, 0, 1, 0], ["a", "a", "b", "b"],
         ['Foo', 'UNKNOWN', 'Foo', 'Bar']],
        cat_cat_names)
    pd_df_cat_cat = as_frame(
        [[1, 0, 1, 0], ["a", "a", "b", "b"],
         ['Foo', 'Foo', 'Foo', 'Bar']],
        cat_cat_names)
    performOneTest(pd_df_cat_cat_NA,
                   pd_df_cat_cat,
                   interactionColumn=['categorical_feat', 'categorical_feat2'],
                   xcols=['categorical_feat', 'categorical_feat2'])

    # test interaction of enum and num columns
    print("******* Test interaction with enum by num")
    cat_num_names = ['label', 'numerical_feat', 'categorical_feat']
    pd_df_cat_num_NA = as_frame(
        [[1, 0, 1, 0], [1, 2, 3, 4], ['Foo', 'UNKNOWN', 'Foo', 'Bar']],
        cat_num_names)
    pd_df_cat_num = as_frame(
        [[1, 0, 1, 0], [1, 2, 3, 4], ['Foo', 'Foo', 'Foo', 'Bar']],
        cat_num_names)
    performOneTest(pd_df_cat_num_NA,
                   pd_df_cat_num,
                   interactionColumn=['numerical_feat', 'categorical_feat'],
                   xcols=['numerical_feat', 'categorical_feat'])
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        """Train an H2O model on ``X``/``y`` and record it via ``set_model_properties``.

        Features and target are uploaded as H2OFrames, the model returned by
        ``self.make_instance()`` is trained (with a validation frame when
        ``eval_set`` is given), the saved model file is read back as bytes,
        and the H2O objects and the file are removed afterwards.  Variable
        importances are reordered to match the input columns.

        :param X: feature table; converted with ``dt.Frame(X)``.
        :param y: target values for the rows of ``X``.
        :param sample_weight: accepted for interface compatibility; not used.
        :param eval_set: optional validation data; only ``eval_set[0]`` is
            used, as a ``(features, target)`` pair.
        :param sample_weight_eval_set: accepted for interface compatibility;
            not used.
        :param kwargs: accepted for interface compatibility; not used.
        """
        X = dt.Frame(X)
        h2o.init(port=config.h2o_recipes_port)
        model_path = None

        orig_cols = list(X.names)
        train_X = h2o.H2OFrame(X.to_pandas())
        train_y = h2o.H2OFrame(
            y,
            column_names=[self.target],
            column_types=[
                'categorical' if self.num_classes >= 2 else 'numeric'
            ])
        train_frame = train_X.cbind(train_y)
        # Pre-initialised so the cleanup in ``finally`` can always inspect them.
        valid_frame = None
        valid_X = None
        valid_y = None
        model = None
        if eval_set is not None:
            valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas())
            valid_y = h2o.H2OFrame(
                eval_set[0][1],
                column_names=[self.target],
                column_types=[
                    'categorical' if self.num_classes >= 2 else 'numeric'
                ])
            valid_frame = valid_X.cbind(valid_y)

        try:
            model = self.make_instance()
            model.train(x=train_X.names,
                        y=self.target,
                        training_frame=train_frame,
                        validation_frame=valid_frame)
            self.id = model.model_id
            model_path = os.path.join(temporary_files_path,
                                      "h2o_model." + str(uuid.uuid4()))
            # save_model returns the path actually written, which is read back.
            model_path = h2o.save_model(model=model, path=model_path)
            with open(model_path, "rb") as f:
                raw_model_bytes = f.read()

        finally:
            # Always delete the saved model file and the H2O objects created above.
            if model_path is not None:
                os.remove(model_path)
            for xx in [
                    train_frame, train_X, train_y, model, valid_frame, valid_X,
                    valid_y
            ]:
                if xx is not None:
                    h2o.remove(xx)

        df_varimp = model.varimp(True)
        if df_varimp is None:
            # No importances available: treat every feature as equally important.
            varimp = np.ones(len(orig_cols))
        else:
            df_varimp.index = df_varimp['variable']
            df_varimp = df_varimp.iloc[:, 1]  # relative importance
            reported = set(df_varimp.index)
            for missing in [x for x in orig_cols if x not in reported]:
                # H2O may leave some input columns out of its importance
                # table; without a zero entry the lookup below would raise
                # KeyError for them.
                df_varimp[missing] = 0
            varimp = df_varimp[orig_cols].values  # order by fitted features
            varimp = np.nan_to_num(varimp)

        self.set_model_properties(model=raw_model_bytes,
                                  features=orig_cols,
                                  importances=varimp,
                                  iterations=self.get_iterations(model))