Ejemplo n.º 1
0
def mojo_model_glm_test():
    """Round-trip a GLM through MOJO export and the generic estimator.

    Trains a cross-validated GLM on the airlines training set, downloads its
    MOJO, loads that file back as a generic model, checks the predictions and
    the model summary, and finally verifies that the MOJO re-exported from
    the generic model has the same size on disk as the original one.
    """
    data_path = pyunit_utils.locate("smalldata/testng/airlines_train.csv")
    airlines = h2o.import_file(path=data_path)

    glm = H2OGeneralizedLinearEstimator(nfolds=3)
    glm.train(x=["Origin", "Dest"], y="Distance",
              training_frame=airlines, validation_frame=airlines)

    # Export the trained GLM into a fresh temporary directory.
    original_model_filename = glm.download_mojo(tempfile.mkdtemp())

    # Import the MOJO back as a generic model and sanity-check it.
    model = H2OGenericEstimator.from_file(original_model_filename)
    assert model is not None
    print(model)

    predictions = model.predict(airlines)
    assert predictions is not None
    assert predictions.nrows == 24421

    summary = model._model_json["output"]["model_summary"]
    assert summary is not None
    assert len(summary._cell_values) > 0

    # Re-export from the generic model; the file sizes must match.
    export_dir = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = model.download_mojo(path=export_dir)
    original_size = os.path.getsize(original_model_filename)
    assert os.path.getsize(generic_mojo_filename) == original_size
Ejemplo n.º 2
0
def test(x, y, output_test, strip_part, algo_name, generic_algo_name, family):
    """Compare a GLM with the generic model re-imported from its MOJO.

    :param x: predictor columns passed to ``glm.train``.
    :param y: response column passed to ``glm.train``.
    :param output_test: callable invoked with the captured ``show()`` output of
        both models plus ``strip_part``, ``algo_name`` and ``generic_algo_name``.
    :param strip_part: forwarded unchanged to ``output_test``.
    :param algo_name: forwarded unchanged to ``output_test``.
    :param generic_algo_name: forwarded unchanged to ``output_test``.
    :param family: GLM family used to build the original model.
    """
    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/testng/airlines_train.csv"))

    glm = H2OGeneralizedLinearEstimator(nfolds=3, family=family, alpha=1, lambda_=1)
    glm.train(x=x, y=y, training_frame=frame, validation_frame=frame)
    print(glm)
    with Capturing() as original_output:
        glm.show()

    # Export the GLM as a MOJO and load it back as a generic model.
    original_model_filename = glm.download_mojo(tempfile.mkdtemp())
    generic = H2OGenericEstimator.from_file(original_model_filename)
    assert generic is not None
    print(generic)
    compare_params(glm, generic)
    with Capturing() as generic_output:
        generic.show()

    output_test(str(original_output), str(generic_output),
                strip_part, algo_name, generic_algo_name)

    predictions = generic.predict(frame)
    assert predictions is not None
    assert predictions.nrows == 24421

    summary = generic._model_json["output"]["model_summary"]
    assert summary is not None
    assert len(summary._cell_values) > 0

    # The MOJO re-exported from the generic model must match the original size.
    export_dir = tempfile.mkdtemp("zip", "genericMojo")
    generic_mojo_filename = generic.download_mojo(path=export_dir)
    original_size = os.path.getsize(original_model_filename)
    assert os.path.getsize(generic_mojo_filename) == original_size
def test_big_data_cars():
    """
    Check permutation importance with the logloss metric on a big dataset.

    A binomial GLM is trained on the lending-club loan data, then permutation
    importance is computed twice (on all rows with ``n_samples=-1`` and on a
    sample of 100 rows); every predictor must report a float in the
    "Relative Importance" column.
    """
    h2o_df = h2o.import_file(
        path=pyunit_utils.locate("bigdata/laptop/lending-club/loan.csv"))
    predictors = h2o_df.col_names
    # Column 12 is the response; the original author labels it "loan amount".
    response_col = h2o_df.col_names[12]
    predictors.remove(response_col)

    model = H2OGeneralizedLinearEstimator(family="binomial")
    model.train(y=response_col, x=predictors, training_frame=h2o_df)

    metric = "logloss"

    for sample_count in (-1, 100):
        pm_h2o_df = model.permutation_importance(
            h2o_df, use_pandas=True, n_samples=sample_count, metric=metric)
        for pred in predictors:
            if pred != "Variable":
                # Relative PFI
                assert isinstance(pm_h2o_df.loc[pred, "Relative Importance"], float)
Ejemplo n.º 4
0
 def bake(self) -> H2OGeneralizedLinearEstimator:
     """Train a default GLM on the stars frame with "distance" as response.

     The "distance" column must be of type "int"; the "name1" and "name2"
     columns are ignored during training.
     """
     frame = stars_frame()
     assert frame.type("distance") == "int"
     glm = H2OGeneralizedLinearEstimator()
     glm.train(
         y="distance",
         training_frame=frame,
         ignored_columns=["name1", "name2"],
     )
     return glm
Ejemplo n.º 5
0
 def bake(self) -> H2OGeneralizedLinearEstimator:
     """Train a default GLM predicting "sex" from the first 5000 name rows.

     The "name" column is required to have between 257 and 499 levels.
     """
     frame = names_frame()[:5000, :]
     # Round-trip through character; per the original note this trims nlevels().
     frame["name"] = frame["name"].ascharacter().asfactor()
     level_count = frame["name"].nlevels()[0]
     assert 256 < level_count < 500
     glm = H2OGeneralizedLinearEstimator()
     glm.train(y="sex", training_frame=frame)
     return glm
Ejemplo n.º 6
0
def test_glm_params():
    """Exercise the GLM estimator's constructor and parameter setters.

    Valid values (including oddly-cased family names) must be accepted, while
    an unknown ``fold_assignment`` and a non-list ``ignored_columns`` must be
    rejected with ``H2OTypeError``.
    """
    # Construction with and without keyword parameters must both succeed.
    H2OGeneralizedLinearEstimator()
    H2OGeneralizedLinearEstimator(nfolds=5, seed=1000, alpha=0.5)

    columns = {
        "response": [1, 2, 3, 4, 5],
        "a": [0, 1, 0, 1, 0],
        "b": [-1, 3, 7, 11, 20],
        "n": [0] * 5,
        "w": [1] * 5
    }
    df = h2o.H2OFrame.from_python(columns)

    model = H2OGeneralizedLinearEstimator()
    model.training_frame = df
    model.validation_frame = df
    model.nfolds = 3
    model.keep_cross_validation_predictions = True
    model.keep_cross_validation_fold_assignment = True
    model.fold_assignment = "random"
    model.fold_column = "b"
    model.response_column = "response"
    model.ignored_columns = ["x", "y"]
    model.ignore_const_cols = True
    model.score_each_iteration = True
    model.offset_column = "n"
    model.weights_column = "w"
    # Family names are assigned with unusual casing, hyphens and quotes.
    for family_name in ("MultiNomial", "GAUSSIAN", "Twee-die", "'poIssoN'"):
        model.family = family_name
    model.tweedie_variance_power = 1
    model.tweedie_link_power = 2
    model.solver = "CoordinateDescentNaive"

    # An unknown fold assignment must be rejected.
    try:
        model.fold_assignment = "pseudo-random"
    except H2OTypeError:
        pass
    else:
        assert False

    # ignored_columns given as a plain string must be rejected.
    try:
        model.ignored_columns = "c"
    except H2OTypeError:
        pass
    else:
        assert False
Ejemplo n.º 7
0
def model(train, test):
    """Fit a Gamma GLM for 'Wait_Time' and export coefficients and predictions.

    Both inputs are converted to H2O frames, every predictor column is turned
    into a factor, and a 7-fold cross-validated GLM is trained with ``test``
    as validation frame. The coefficient table and the prediction table
    (joined on index with the 'Unit' and 'Week' columns of ``test``) are
    written as timestamped CSV files.

    :param train: training data containing 'Wait_Time' and the predictors
        (presumably a pandas DataFrame -- confirm with callers).
    :param test: test data with the same columns; must expose 'Unit' and
        'Week' and support ``DataFrame.merge``.
    :returns: None; the results are the two CSV files written to disk.
    """
    # Timestamp that makes the exported file names unique per run.
    today = datetime.datetime.today().strftime('%Y-%m-%d:%H:%M')

    from h2o.estimators import H2OGeneralizedLinearEstimator

    h2o_train = h2o.H2OFrame(train)
    h2o_test = h2o.H2OFrame(test)

    # Compare with != rather than ``not in 'Unit'``: the latter is a substring
    # test and would also drop columns named e.g. 'U', 'it' or 'nit'.
    predictor_columns = [
        c for c in h2o_train.drop('Wait_Time').col_names if c != 'Unit'
    ]
    response_column = 'Wait_Time'

    h2o_train[predictor_columns] = h2o_train[predictor_columns].asfactor()
    h2o_test[predictor_columns] = h2o_test[predictor_columns].asfactor()

    glm_model = H2OGeneralizedLinearEstimator(
        family='Gamma',
        lambda_=0,
        alpha=0,
        compute_p_values=True,
        remove_collinear_columns=True,
        seed=615,
        fold_assignment="Modulo",
        keep_cross_validation_predictions=True,
        nfolds=7)

    glm_model.train(predictor_columns,
                    response_column,
                    training_frame=h2o_train,
                    validation_frame=h2o_test)

    # Results are discarded, as in the original; only the scoring itself runs.
    glm_model.model_performance(h2o_train)
    glm_model.model_performance(h2o_test)

    prediction = glm_model.predict(h2o_test).as_data_frame()
    prediction['pred_min'] = (prediction.predict / 60) * 10
    prediction['StdErr_min'] = (prediction.StdErr / 60)
    pred_table = test[['Unit', 'Week']].merge(prediction,
                                              how='outer',
                                              left_index=True,
                                              right_index=True)

    coef_table = glm_model._model_json['output'][
        'coefficients_table'].as_data_frame()

    coef_table.to_csv('/home/mark/Desktop/IB_docs/coef_table' + today + '.csv',
                      index=False)
    pred_table.to_csv('/home/mark/Desktop/IB_docs/pred_table' + today + '.csv',
                      index=False)
Ejemplo n.º 8
0
    def demo_body(go):
        """
        Demo of H2O's Generalized Linear Estimator.

        This demo uploads a dataset to h2o, parses it, and shows a description.
        Then it divides the dataset into training and test sets, builds a GLM
        from the training set, and makes predictions for the test set.
        Finally, default performance metrics are displayed.

        :param go: zero-argument callable invoked before every step of the demo
            (presumably the caller's hook for pacing the walkthrough -- confirm).
        """
        go()
        # Connect to H2O
        h2o.init()

        go()
        # Upload the prostate dataset that comes included in the h2o python package
        prostate = h2o.upload_file(data_file("h2o_data/prostate.csv"))

        go()
        # Print a description of the prostate data
        prostate.summary()

        go()
        # Randomly split the dataset into ~70/30, training/test sets
        r = prostate[0].runif()
        train = prostate[r < 0.70]
        test = prostate[r >= 0.70]

        go()
        # Convert the response columns to factors (for binary classification problems)
        train["CAPSULE"] = train["CAPSULE"].asfactor()
        test["CAPSULE"] = test["CAPSULE"].asfactor()

        go()
        # Build a (classification) GLM
        from h2o.estimators import H2OGeneralizedLinearEstimator
        prostate_glm = H2OGeneralizedLinearEstimator(family="binomial",
                                                     alpha=[0.5])
        prostate_glm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                           y="CAPSULE",
                           training_frame=train)

        go()
        # Show the model
        prostate_glm.show()

        go()
        # Predict on the test set and show the first ten predictions
        predictions = prostate_glm.predict(test)
        predictions.show()

        go()
        # Show default performance metrics
        performance = prostate_glm.model_performance(test)
        performance.show()
Ejemplo n.º 9
0
def test_GLM_throws_ArrayOutOfBoundException():
    """Train a cross-validated binomial GLM with lambda search over several alphas.

    The model is trained on the first split (ratio 0.05) of the christine
    dataset, using column 0 as response. The test passes when training
    completes and exactly ``nFold`` cross-validation models are produced.
    """
    nFold = 5
    fr = h2o.import_file(
        pyunit_utils.locate("bigdata/laptop/jira/christine.arff"))
    splitFrame = fr.split_frame(ratios=[0.05])
    glm = H2OGeneralizedLinearEstimator(family='binomial',
                                        nfolds=nFold,
                                        lambda_search=True,
                                        alpha=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
    glm.train(y=0, training_frame=splitFrame[0])

    n_cv_models = len(glm._model_json["output"]['cross_validation_models'])
    # Expected value goes first, actual second, matching the message text.
    assert n_cv_models == nFold, \
        "expected number of cross_validation_model: {0}.  Actual number of cross_validation: " \
        "{1}".format(nFold, n_cv_models)
Ejemplo n.º 10
0
def pubdev_5265():
    """Check GLM prediction on a categorical level that was not seen in training.

    A multinomial GLM with mean imputation is trained on a single categorical
    predictor whose first value is 'nan'. The test frame contains an extra
    level (4); predicting on it must emit a warning about the unseen level
    and yield the expected per-class prediction counts.
    """
    training_data = {
        'response': [
            'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'C', 'C',
            'C', 'C', 'C', 'C', 'C'
        ],
        'explanatory':
        ['nan', 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]
    }

    test_data = {
        'response': [
            'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B', 'B', 'C', 'C',
            'C', 'C', 'C', 'C', 'C'
        ],
        'explanatory':
        ['nan', 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4]
    }

    training_data = h2o.H2OFrame(training_data)
    training_data['explanatory'] = training_data['explanatory'].asfactor()

    test_data = h2o.H2OFrame(test_data)
    test_data['explanatory'] = test_data['explanatory'].asfactor()

    glm_estimator = H2OGeneralizedLinearEstimator(
        family="multinomial",
        missing_values_handling="MeanImputation",
        seed=1234,
        Lambda=0)

    glm_estimator.train(x=["explanatory"],
                        y="response",
                        training_frame=training_data)

    # Predicting must not fail when the test frame has a categorical level
    # that was absent from the training frame; a warning is expected instead.
    with warnings.catch_warnings(record=True) as w:
        grouped_occurances = glm_estimator.predict(test_data=test_data).group_by((0)).count().get_frame() \
            .as_data_frame()
        assert "Test/Validation dataset column 'explanatory' has levels not trained on: [4]" in str(
            w[-1].message)

    # The very first value corresponding to 'A' in the explanatory variable column should be replaced by the mode value, which is 3.
    # As a result, 8 occurances of type C should be predicted.
    # The original comparison was a bare expression and checked nothing; it is
    # now asserted. ``.values`` replaces ``as_matrix()``, removed in pandas 1.0.
    assert grouped_occurances.values.tolist() == [['A', 4], ['B', 6], ['C', 8]]
Ejemplo n.º 11
0
def main():
    """Train, evaluate and export a binomial GLM on the prostate dataset.

    Starts H2O, splits the prostate data 70/30, fits a GLM predicting
    "CAPSULE", shows predictions and performance on the test split, saves the
    model under ``./h2o_model`` and writes the test split to ``data.json``.
    """
    h2o.init()

    prostate = h2o.load_dataset("prostate")
    prostate.describe()

    train, test = prostate.split_frame(ratios=[0.70])
    # The response must be categorical for a binomial model.
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    def _predict_and_score(estimator):
        # Show the predictions, then the default metrics, for the test split.
        estimator.predict(test).show()
        estimator.model_performance(test).show()

    # Train model
    from h2o.estimators import H2OGeneralizedLinearEstimator
    prostate_glm = H2OGeneralizedLinearEstimator(family="binomial",
                                                 alpha=[0.5])
    prostate_glm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                       y="CAPSULE",
                       training_frame=train)
    prostate_glm.show()
    _predict_and_score(prostate_glm)

    # Export model
    print(h2o.save_model(prostate_glm, path="./h2o_model", force=True))

    # Score again through a second reference to the same in-memory model.
    model = prostate_glm
    _predict_and_score(model)

    # Export test data
    # NOTE(review): ``to_json`` already returns a JSON string, so ``json.dump``
    # writes it as one quoted JSON string -- confirm the consumer expects this.
    records = test.as_data_frame().to_json(orient='index')
    with open("data.json", "w") as f:
        json.dump(records, f)
Ejemplo n.º 12
0
def test_GLM_throws_ArrayOutOfBoundException():
    """Train a cross-validated GLM with lambda search on the prostate data.

    The test passes when training completes and exactly ``nFold``
    cross-validation models are produced.
    """
    # Everything in this test is important to cause the exception:
    # - GLEASON as a categorical
    # - lambda search enabled
    # - alphas
    # - CV enabled
    df = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    target = "CAPSULE"
    nFold = 5
    for col in [target, 'GLEASON']:
        df[col] = df[col].asfactor()
        # NOTE(review): the model is trained once per converted column; it may
        # have been meant to train once after both conversions -- confirm.
        glm = H2OGeneralizedLinearEstimator(
            lambda_search=True,
            alpha=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
            nfolds=nFold,
            seed=12345)
        glm.train(y=target, training_frame=df)

        n_cv_models = len(glm._model_json["output"]['cross_validation_models'])
        # Expected value goes first, actual second, matching the message text.
        assert n_cv_models == nFold, \
            "expected number of cross_validation_model: {0}.  Actual number of cross_validation: " \
            "{1}".format(nFold, n_cv_models)
 def construct_model(self):
     """Instantiate the estimator selected by ``self.model_type`` and ``self.index``.

     ``self.model_type == 'C'`` selects the classifier family, any other value
     the regressor family; ``self.index`` (1-8) picks the estimator within the
     family. The estimator is built with ``**self.parameters``.

     :returns: the newly constructed, untrained estimator.
     :raises ValueError: if ``self.index`` is not one of 1-8.
     """
     p_model = None
     if self.model_type == 'C':
         if self.index == 1:
             p_model = H2OGeneralizedLinearEstimator(**self.parameters)
         elif self.index == 2:
             p_model = DecisionTreeClassifier(**self.parameters)
         elif self.index == 3:
             p_model = GaussianNB(**self.parameters)
         elif self.index == 4:
             p_model = SVC(**self.parameters)
         elif self.index == 5:
             p_model = RandomForestClassifier(**self.parameters)
         elif self.index == 6:
             # Was ``self.paraemters``, which raised AttributeError.
             p_model = GradientBoostingClassifier(**self.parameters)
         elif self.index == 7:
             p_model = ExtraTreesClassifier(**self.parameters)
         elif self.index == 8:
             p_model = SGDClassifier(**self.parameters)
     else:
         if self.index == 1:
             p_model = LinearRegression(**self.parameters)
         elif self.index == 2:
             # NOTE(review): a classifier in the regression family; this looks
             # like it should be DecisionTreeRegressor -- confirm and import it.
             p_model = DecisionTreeClassifier(**self.parameters)
         elif self.index == 3:
             p_model = BayesianRidge(**self.parameters)
         elif self.index == 4:
             p_model = SVR(**self.parameters)
         elif self.index == 5:
             p_model = RandomForestRegressor(**self.parameters)
         elif self.index == 6:
             p_model = GradientBoostingRegressor(**self.parameters)
         elif self.index == 7:
             p_model = ExtraTreesRegressor(**self.parameters)
         elif self.index == 8:
             p_model = SGDRegressor(**self.parameters)

     if p_model is None:
         # Previously fell through to an UnboundLocalError on ``return``.
         raise ValueError(
             "unsupported model index {0!r} for model type {1!r}; expected 1-8"
             .format(self.index, self.model_type))
     return p_model
Ejemplo n.º 14
0
def generate_and_import_combined_pojo():
    """Build a POJO combining a GLM and a GBM, import it and verify its predictions.

    The imported model must reproduce the GLM when "Bias" is 1 and the GBM
    when "Bias" is 0. When "Bias" is NaN, it must follow the per-segment
    rules keyed on the "ChangeWindDirect" level.
    """
    if sys.version_info[0] < 3:  # Python 2
        print("This example needs Python 3.x+")
        return

    weather_orig = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/weather.csv"))
    weather = weather_orig  # second reference; rebound below by drop()

    excluded = {"Date", "RainTomorrow", "Sunshine"}
    features = sorted(set(weather.names) - excluded)
    response = "RISK_MM"

    glm_model = H2OGeneralizedLinearEstimator()
    glm_model.train(x=features, y=response, training_frame=weather)
    glm_preds = glm_model.predict(weather)

    gbm_model = H2OGradientBoostingEstimator(ntrees=5)
    gbm_model.train(x=features, y=response, training_frame=weather)
    gbm_preds = gbm_model.predict(weather)

    # Drop columns that we will calculate in POJO manually (we will recreate them in POJO to be the exact same)
    for derived_column in ("ChangeTemp", "ChangeTempDir"):
        weather = weather.drop(derived_column)

    combined_pojo_path = generate_combined_pojo(glm_model, gbm_model)
    print("Combined POJO was stored in: " + combined_pojo_path)

    # FIXME: https://h2oai.atlassian.net/browse/PUBDEV-8561 We need to make this work for upload_mojo as well
    pojo_model = h2o.import_mojo(combined_pojo_path)

    # Sanity test - Bias == 1 delegates to the GLM
    weather["Bias"] = 1
    pojo_glm_preds = pojo_model.predict(weather)
    assert_frame_equal(pojo_glm_preds.as_data_frame(),
                       glm_preds.as_data_frame())

    # Sanity test - Bias == 0 delegates to the GBM
    weather["Bias"] = 0
    pojo_gbm_preds = pojo_model.predict(weather)
    assert_frame_equal(pojo_gbm_preds.as_data_frame(),
                       gbm_preds.as_data_frame())

    # Per-segment behavior; segments are defined by ChangeWindDirect
    weather["Bias"] = float("NaN")
    for level in weather["ChangeWindDirect"].levels()[0]:
        segment = weather[weather["ChangeWindDirect"] == level]
        segment_orig = weather_orig[weather_orig["ChangeWindDirect"] == level]
        segment_preds = pojo_model.predict(segment)

        if level in ("c", "l"):
            expected = glm_model.predict(segment_orig) * 2
        elif level == "n":
            expected = (glm_model.predict(segment_orig) +
                        gbm_model.predict(segment_orig)) / 2
        elif level == "s":
            expected = gbm_model.predict(segment_orig)
        else:
            # Other levels are scored but have no expectation to compare with.
            continue

        assert_frame_equal(segment_preds.as_data_frame(),
                           expected.as_data_frame())
Ejemplo n.º 15
0
 def bake(self) -> H2OGeneralizedLinearEstimator:
     """Train a default GLM on the titanic frame with "parch" as a categorical response."""
     frame = titanic_frame()
     frame["parch"] = frame["parch"].asfactor()
     skipped = ["name", "ticket", "boat", "home.dest"]
     glm = H2OGeneralizedLinearEstimator()
     glm.train(y="parch", training_frame=frame, ignored_columns=skipped)
     return glm
Ejemplo n.º 16
0
 def bake(self) -> H2OGeneralizedLinearEstimator:
     """Train a default GLM predicting "mpg" on the cars frame, ignoring "name"."""
     glm = H2OGeneralizedLinearEstimator()
     glm.train(
         y="mpg",
         training_frame=cars_frame(),
         ignored_columns=["name"],
     )
     return glm
Ejemplo n.º 17
0
 def bake(self) -> H2OGeneralizedLinearEstimator:
     """Train a default GLM predicting "Species" on the iris frame."""
     glm = H2OGeneralizedLinearEstimator()
     glm.train(y="Species", training_frame=iris_frame())
     return glm
Ejemplo n.º 18
0
 def bake(self) -> H2OGeneralizedLinearEstimator:
     """Train a default GLM on the missing-values frame without naming a response."""
     glm = H2OGeneralizedLinearEstimator()
     glm.train(training_frame=missing_frame())
     return glm
Ejemplo n.º 19
0
    def train(self, x=None, y=None, training_frame=None):
        """
        Train the rulefit model.

        One tree model is trained per depth in ``[min_rule_len, max_rule_len]``;
        the leaf-node assignments of all of them become the predictors of an
        L1-regularized GLM from which the intercept and rules are extracted.
        The results are stored on ``self.intercept``, ``self.rule_importance``,
        ``self.glm`` and ``self.tree_models``.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :raises H2OValueError: if the response is categorical with more than two levels.
        :examples:
        >>> rulefit = H2ORuleFit()
        >>> training_data = h2o.import_file("smalldata/gbm_test/titanic.csv", 
        ...                                  col_types = {'pclass': "enum", 'survived': "enum"})
        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
        >>> rulefit.train(x=x,y="survived",training_frame=training_data)
        >>> rulefit
        """
        # Work on a copy: popping "family" from self.glm_params directly lost
        # the user's choice after the first call to train(). Removing the key
        # here also prevents passing ``family`` twice to the GLM constructor.
        glm_params = dict(self.glm_params)
        user_family = glm_params.pop("family", None)

        if training_frame.type(y) == "enum":
            if training_frame[y].unique().nrow > 2:
                raise H2OValueError("multinomial use cases not yet supported")
            family = "binomial"
        elif user_family is not None:
            family = user_family
        else:
            family = "gaussian"

        # Get paths from tree models, one model per depth
        paths_frame = training_frame[y]
        depths = range(self.min_rule_len, self.max_rule_len + 1)
        tree_models = dict()
        for model_idx, depth in enumerate(depths):

            # Train tree models
            tree_model = _tree_model(self.algorithm, depth, self.seed,
                                     model_idx, self.tree_params)
            tree_model.train(y=y, x=x, training_frame=training_frame)
            tree_models[model_idx] = tree_model

            paths = tree_model.predict_leaf_node_assignment(training_frame)
            # Prefix the columns so paths from different models stay distinct;
            # the loop variable is not called ``x`` to avoid shadowing the argument.
            paths.col_names = [
                "tree_{0}.{1}".format(model_idx, col_name)
                for col_name in paths.col_names
            ]
            paths_frame = paths_frame.cbind(paths)

        if self.max_num_rules:
            # Limit the number of rules through max_active_predictors
            glm = H2OGeneralizedLinearEstimator(
                model_id="glm.hex",
                seed=self.seed,
                family=family,
                alpha=1,
                max_active_predictors=self.max_num_rules + 1,
                **glm_params)
            glm.train(y=y, training_frame=paths_frame)

        else:
            # Get optimal lambda
            glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                                nfolds=self.nfolds,
                                                seed=self.seed,
                                                family=family,
                                                alpha=1,
                                                lambda_search=True,
                                                **glm_params)
            glm.train(y=y, training_frame=paths_frame)

            lambda_ = _get_glm_lambda(glm)

            # Train GLM with chosen lambda
            glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                                seed=self.seed,
                                                family=family,
                                                alpha=1,
                                                lambda_=lambda_,
                                                solver="COORDINATE_DESCENT",
                                                **glm_params)
            glm.train(y=y, training_frame=paths_frame)

        # Get Intercept
        intercept = _get_intercept(glm)

        # Get Rules
        rule_importance = _get_rules(glm, tree_models, self.algorithm)

        self.intercept = intercept
        self.rule_importance = rule_importance
        self.glm = glm
        self.tree_models = tree_models
Ejemplo n.º 20
0
    def train(self,
              x=None,
              y=None,
              training_frame=None,
              offset_column=None,
              fold_column=None,
              weights_column=None,
              validation_frame=None,
              **params):
        """
        Train the rulefit model.

        One random forest is trained per depth in ``[min_depth, max_depth]``;
        the leaf-node assignments of all forests become the predictors of an
        L1-regularized GLM. A lambda is chosen from a lambda-search GLM, a
        final GLM is refit with it, and the intercept and rules are extracted.
        The results are stored on ``self.intercept``, ``self.rule_importance``,
        ``self.glm`` and ``self.rf_models``.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: accepted but not used by this implementation.
        :param fold_column: accepted but not used by this implementation.
        :param weights_column: accepted but not used by this implementation.
        :param validation_frame: accepted but not used by this implementation.
        :param params: extra keyword arguments; accepted but not used.
        :examples:
        >>> rulefit = H2ORuleFit()
        >>> training_data = h2o.import_file("smalldata/gbm_test/titanic.csv", 
        ...                                  col_types = {'pclass': "enum", 'survived': "enum"})
        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
        >>> rulefit.train(x=x,y="survived",training_frame=training_data)
        >>> rulefit
        """
        # The GLM family follows the response type.
        family = "gaussian"
        if training_frame.type(y) == "enum":
            if training_frame[y].unique().nrow > 2:
                family = "multinomial"
            else:
                family = "binomial"

        # Get paths from random forest models, one forest per depth
        paths_frame = training_frame[y]
        depths = range(self.min_depth, self.max_depth + 1)
        rf_models = dict()
        for model_idx, depth in enumerate(depths):

            # Train random forest models
            rf_model = H2ORandomForestEstimator(
                seed=self.seed,
                model_id="rf_{}.hex".format(model_idx),
                max_depth=depth)
            rf_model.train(y=y, x=x, training_frame=training_frame)
            rf_models[model_idx] = rf_model

            paths = rf_model.predict_leaf_node_assignment(training_frame)
            # The loop variable must not be called ``x``: on Python 2 a list
            # comprehension leaks its variable and would overwrite the
            # predictor list used by the next forest.
            paths.col_names = [
                "rf_{0}.{1}".format(model_idx, col_name)
                for col_name in paths.col_names
            ]
            paths_frame = paths_frame.cbind(paths)

        # Extract important paths
        glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                            nfolds=self.nfolds,
                                            seed=self.seed,
                                            family=family,
                                            alpha=1,
                                            remove_collinear_columns=True,
                                            lambda_search=True)
        glm.train(y=y, training_frame=paths_frame)

        lambda_ = _get_glm_lambda(glm, self.num_rules)

        # Train GLM with chosen lambda
        glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                            seed=self.seed,
                                            family=family,
                                            alpha=1,
                                            remove_collinear_columns=True,
                                            lambda_=lambda_,
                                            solver="COORDINATE_DESCENT")
        glm.train(y=y, training_frame=paths_frame)

        # Get Intercept
        intercept = _get_intercept(glm)

        # Get Rules
        rule_importance = _get_rules(glm, rf_models)

        self.intercept = intercept
        self.rule_importance = rule_importance
        self.glm = glm
        self.rf_models = rf_models
Ejemplo n.º 21
0
 def bake(self) -> H2OGeneralizedLinearEstimator:
     """Train a default GLM predicting "eyeDetection" on the eyestate frame."""
     glm = H2OGeneralizedLinearEstimator()
     glm.train(y="eyeDetection", training_frame=eyestate_frame())
     return glm
Ejemplo n.º 22
0
    def train(self, x=None, y=None, training_frame=None, offset_column=None, fold_column=None, weights_column=None,
              validation_frame=None, **params):
        """
        Train the rulefit model.

        One random forest is trained per depth in ``[min_depth, max_depth]``;
        the leaf-node assignments of all forests become the predictors of an
        L1-regularized GLM with lambda search. Each GLM coefficient is mapped
        back to the tree path it stands for, and the rules are stored, ordered
        by absolute coefficient, on ``self.rule_importance`` together with
        ``self.intercept``.

        :param x: A list of column names or indices indicating the predictor columns.
        :param y: An index or a column name indicating the response column.
        :param training_frame: The H2OFrame having the columns indicated by x and y (as well as any
            additional columns specified by fold, offset, and weights).
        :param offset_column: accepted but not used by this implementation.
        :param fold_column: accepted but not used by this implementation.
        :param weights_column: accepted but not used by this implementation.
        :param validation_frame: accepted but not used by this implementation.
        :param params: extra keyword arguments; accepted but not used.
        :raises H2OValueError: if the response is categorical with more than two levels.
        :examples:
        >>> rulefit = H2ORuleFit()
        >>> training_data = h2o.import_file("smalldata/gbm_test/titanic.csv", 
        ...                                  col_types = {'pclass': "enum", 'survived': "enum"})
        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
        >>> rulefit.train(x=x,y="survived",training_frame=training_data)
        >>> rulefit
        """
        family = "gaussian"
        if training_frame.type(y) == "enum":
            if training_frame[y].unique().nrow > 2:
                raise H2OValueError("Multinomial not supported")
            family = "binomial"

        # Get paths from random forest models, one forest per depth
        paths_frame = training_frame[y]
        depths = range(self.min_depth, self.max_depth + 1)
        rf_models = []
        for model_idx, depth in enumerate(depths):

            # Train random forest models. Each forest needs its own model_id:
            # with a shared id every training run replaces the previous model,
            # so the rule extraction below would read the wrong trees.
            rf_model = H2ORandomForestEstimator(seed=self.seed,
                                                model_id="rf_{}.hex".format(model_idx),
                                                max_depth=depth)
            rf_model.train(y=y, x=x, training_frame=training_frame)
            rf_models.append(rf_model)

            paths = rf_model.predict_leaf_node_assignment(training_frame)
            # Not named ``x``, to avoid shadowing the predictor argument.
            paths.col_names = ["rf_" + str(model_idx) + "." + col_name for col_name in paths.col_names]
            paths_frame = paths_frame.cbind(paths)

        # Extract important paths
        glm = H2OGeneralizedLinearEstimator(model_id="glm.hex",
                                            nfolds=self.nfolds,
                                            seed=self.seed,
                                            family=family,
                                            alpha=1,
                                            remove_collinear_columns=True,
                                            lambda_search=True)
        glm.train(y=y, training_frame=paths_frame)

        intercept, rule_importance = _get_glm_coeffs(glm)
        rule_importance = pd.DataFrame.from_dict(rule_importance, orient="index").reset_index()
        rule_importance.columns = ["variable", "coefficient"]

        # Convert paths to rules. Stripping "rf_" and "T" (and "C1." for
        # binomial models) must leave "<model>.<tree>.<path>".
        rules = []
        for variable in rule_importance.variable:
            encoded = variable.replace("rf_", "").replace("T", "")
            if family == "binomial":
                encoded = encoded.replace("C1.", "")
            model_num, tree_num, path = encoded.split(".")
            # Tree numbers in the column names are 1-based.
            tree = H2OTree(rf_models[int(model_num)], int(tree_num) - 1)
            rules.append(_tree_traverser(tree.root_node, path))

        # Add rules, keep the strongest entry per rule and order by absolute coefficient
        rule_importance["rule"] = rules
        rule_importance["abs_coefficient"] = rule_importance["coefficient"].abs()
        rule_importance = rule_importance.loc[rule_importance.groupby(["rule"])["abs_coefficient"].idxmax()]
        rule_importance = rule_importance.sort_values(by="abs_coefficient", ascending=False)
        rule_importance = rule_importance.drop("abs_coefficient", axis=1)

        self.intercept = intercept
        self.rule_importance = rule_importance
Ejemplo n.º 23
0
# Load the test data with the same derived features enabled
testing_frame = ProcessData.testData(moving_average=True, standard_deviation=True, probability_from_file=True)

# create h2o frames
train = h2o.H2OFrame(training_frame)
test = h2o.H2OFrame(testing_frame)
train.set_names(list(training_frame.columns))
test.set_names(list(testing_frame.columns))

# Feature selection: every column except the response and the two
# identifier columns is used as a predictor
training_columns = list(training_frame.columns)
training_columns.remove(response_column)
training_columns.remove("UnitNumber")
training_columns.remove("Time")

# Build model
model4 = H2OGeneralizedLinearEstimator()

# Train model
model4.train(x=training_columns, y=response_column, training_frame=train)

# End : Generalized Linear Modeling
# ----------------------------------------------------------------------------------------------------------------------

# Prediction
# ----------------------------------------------------------------------------------------------------------------------
# Single-argument print(...) gives the same output on Python 2 and Python 3;
# the former print statements were a syntax error on Python 3.
print("Begin Prdiction")
print("---------------")

# ground truth
tY = np.array(testing_frame['RUL'])
Ejemplo n.º 24
0
response_column = Dataset.RESPONSE_COLUMN
input_columns.remove('city')

# Start h2o server
h2o.init()

# Create h2o frame
training_frame = h2o.H2OFrame(pd_train)
training_frame.set_names(list(pd_train.columns))

# Measurements collected over the repeated training runs
mae = []   # Mean Absolute Errors for model
rmse = []  # Root Mean Squared Errors for model

# Train the same 10-fold GLM n_iterations times and record its metrics
for _ in range(n_iterations):
    model = H2OGeneralizedLinearEstimator(nfolds=10)
    model.train(x=input_columns, y=response_column, training_frame=training_frame)

    mae.append(model.mae())
    rmse.append(model.rmse())

# Summary report
print("Model : Single")
print("--------------")
report_rows = (
    ("Average MAE       : ", numpy.average(mae)),
    ("Average RMSE      : ", numpy.average(rmse)),
    ("MAE Standard Dev  : ", numpy.std(mae)),
    ("RMSE Standard Dev : ", numpy.std(rmse)),
)
for label, value in report_rows:
    print(label + str(value))