コード例 #1
0
def stackedensemble_metalearner_seed_test():
    """Check how the Stacked Ensemble ``seed`` affects its GBM metalearner.

    Two ensembles built with the same seed must report the same metalearner
    training RMSE, while two ensembles built with different seeds must not.
    """
    # Load the Higgs training and test frames
    higgs_train = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_train_5k.csv"),
                                  destination_frame="higgs_train_5k")
    higgs_test = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"),
                                 destination_frame="higgs_test_5k")

    # Every column except the response is used as a predictor
    response = "response"
    predictors = higgs_train.columns
    predictors.remove(response)

    # The response has to be categorical
    higgs_train[response] = higgs_train[response].asfactor()
    higgs_test[response] = higgs_test[response].asfactor()

    # Number of cross-validation folds for the base learners
    fold_count = 3

    # Parameters handed to the GBM metalearner
    metalearner_gbm_params = {"sample_rate": 0.3, "col_sample_rate": 0.3}

    # Base learner 1: cross-validated GBM
    base_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                            ntrees=10,
                                            nfolds=fold_count,
                                            keep_cross_validation_predictions=True,
                                            seed=1)
    base_gbm.train(x=predictors, y=response, training_frame=higgs_train)

    # Base learner 2: cross-validated random forest
    base_rf = H2ORandomForestEstimator(ntrees=10,
                                       nfolds=fold_count,
                                       keep_cross_validation_predictions=True,
                                       seed=1)
    base_rf.train(x=predictors, y=response, training_frame=higgs_train)

    def trained_metalearners(seeds):
        # Build every ensemble first, then train them, then fetch the metalearners.
        ensembles = [H2OStackedEnsembleEstimator(base_models=[base_gbm, base_rf],
                                                 metalearner_algorithm="gbm",
                                                 metalearner_params=metalearner_gbm_params,
                                                 seed=seed)
                     for seed in seeds]
        for ensemble in ensembles:
            ensemble.train(x=predictors, y=response, training_frame=higgs_train)
        return [h2o.get_model(ensemble.metalearner()['name']) for ensemble in ensembles]

    # Identical seeds -> identical metalearner RMSE
    first_meta, second_meta = trained_metalearners([55555, 55555])
    assert first_meta.rmse(train=True) == second_meta.rmse(train=True), "RMSE should match if same seed"

    # Different seeds -> different metalearner RMSE
    first_meta, second_meta = trained_metalearners([55555, 98765])
    assert first_meta.rmse(train=True) != second_meta.rmse(train=True), "RMSE should NOT match if diff seed"
コード例 #2
0
def covtype_get_model():
    """Train three binomial GLMs on covtype and re-fetch each with ``h2o.get_model``.

    The response (column 54) is replaced by an indicator of one randomly chosen
    class. For each regularisation setting the model is trained, shown, fetched
    again by its ``model_id`` and shown a second time.
    """
    covtype = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))

    Y = 54
    # On Python 3 range objects do not support "+", so build real lists before joining.
    X = list(range(0, 20)) + list(range(29, 54))

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    covtype[Y] = (covtype[Y] == res_class)

    # (alpha, lambda) pairs: L2, elastic net, L1
    regularisations = (
        (0, 0),
        (0.5, 1e-4),
        (1, 1e-4),
    )
    for alpha, lambda_ in regularisations:
        model = H2OGeneralizedLinearEstimator(family="binomial", alpha=alpha, Lambda=lambda_)
        model.train(x=X, y=Y, training_frame=covtype)
        model.show()
        # Round-trip through the cluster and show the retrieved copy as well.
        h2o.get_model(model.model_id).show()
コード例 #3
0
def covtype_get_model(ip,port):
    """Build three binomial GLMs on covtype and re-fetch each with ``h2o.get_model``.

    ``ip`` and ``port`` are accepted but not used by this body. The response
    (column 54) is replaced by an indicator of one randomly chosen class. For
    each regularisation setting the model is built, shown, fetched again by its
    ``_id`` and shown a second time.
    """
    covtype = h2o.import_file(path=h2o.locate("smalldata/covtype/covtype.20k.data"))

    Y = 54
    # On Python 3 range objects do not support "+", so build real lists before joining.
    X = list(range(0, 20)) + list(range(29, 54))

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    covtype[Y] = (covtype[Y] == res_class)

    # (alpha, lambda) pairs: L2, elastic net, L1
    regularisations = (
        (0, 0),
        (0.5, 1e-4),
        (1, 1e-4),
    )
    for alpha, lambda_ in regularisations:
        model = h2o.glm(y=covtype[Y], x=covtype[X], family="binomial", alpha=[alpha], Lambda=[lambda_])
        model.show()
        # Round-trip through the cluster and show the retrieved copy as well.
        h2o.get_model(model._id).show()
コード例 #4
0
def get_model_test():
    """Check that models fetched with ``h2o.get_model`` match the originals.

    For regression, binomial and multinomial models the retrieved model must
    report the expected ``model_category`` and give the same first-column
    predictions as the original, row by row. The clustering model is only
    checked for its category.
    """

    def category_of(model):
        return model._model_json['output']['model_category']

    def assert_same_predictions(original, retrieved, template):
        # Compare the first prediction column one row at a time.
        for row in range(original.nrow):
            lhs = original[row, 0]
            rhs = retrieved[row, 0]
            assert lhs == rhs, template.format(row, lhs, rhs)

    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    split = prostate[0].runif()
    train = prostate[split < 0.70]
    test = prostate[split >= 0.70]

    # Regression
    gaussian_gbm = H2OGradientBoostingEstimator(distribution="gaussian")
    gaussian_gbm.train(x=list(range(2, 9)), y=1, training_frame=train)
    original_preds = gaussian_gbm.predict(test)

    gaussian_copy = h2o.get_model(gaussian_gbm._id)
    assert category_of(gaussian_copy) == "Regression"
    assert_same_predictions(
        original_preds, gaussian_copy.predict(test),
        "expected regression predictions to be the same for row {}, but got {} and {}")

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm = H2OGradientBoostingEstimator(distribution="bernoulli")
    bernoulli_gbm.train(x=list(range(2, 9)), y=1, training_frame=train)
    original_preds = bernoulli_gbm.predict(test)

    bernoulli_copy = h2o.get_model(bernoulli_gbm._id)
    assert category_of(bernoulli_copy) == "Binomial"
    assert_same_predictions(
        original_preds, bernoulli_copy.predict(test),
        "expected binomial predictions to be the same for row {}, but got {} and {}")

    # Clustering
    benign = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))
    kmeans = H2OKMeansEstimator(k=3)
    kmeans.train(x=list(range(benign.ncol)), training_frame=benign)
    assert category_of(h2o.get_model(kmeans._id)) == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl = H2ODeepLearningEstimator(loss="CrossEntropy")
    multinomial_dl.train(x=[0, 1], y=4, training_frame=train)
    original_preds = multinomial_dl.predict(test)

    multinomial_copy = h2o.get_model(multinomial_dl._id)
    assert category_of(multinomial_copy) == "Multinomial"
    assert_same_predictions(
        original_preds, multinomial_copy.predict(test),
        "expected multinomial predictions to be the same for row {0}, but got {1} and {2}")
コード例 #5
0
ファイル: pyunit_get_model.py プロジェクト: cursedninja/h2o-3
def get_model_test(ip,port):
    """Check that models fetched with ``h2o.get_model`` match the originals.

    Connects to the cluster at ``ip``/``port``, then for regression, binomial
    and multinomial models asserts that the retrieved model has the expected
    ``model_category`` and gives the same first-column predictions as the
    original. The clustering model is only checked for its category.
    """

    def category_of(model):
        return model._model_json['output']['model_category']

    def assert_same_predictions(original, retrieved, label):
        # Compare the first prediction column one row at a time.
        for row in range(original.nrow()):
            p1 = original[row, 0]
            p2 = retrieved[row, 0]
            assert p1 == p2, "expected {0} predictions to be the same for row {1}, but got {2} and {3}" \
                             "".format(label, row, p1, p2)

    # Connect to h2o
    h2o.init(ip,port)

    prostate = h2o.import_frame(path=h2o.locate("smalldata/logreg/prostate.csv"))

    r = prostate[0].runif()
    train = prostate[r < 0.70]
    # Use the complement of the training mask: the previous ">= 0.30" threshold
    # put every row with 0.30 <= r < 0.70 into both train and test.
    test = prostate[r >= 0.70]

    # Regression
    regression_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="gaussian")
    predictions1 = regression_gbm1.predict(test)

    regression_gbm2 = h2o.get_model(regression_gbm1._key)
    assert category_of(regression_gbm2) == "Regression"
    assert_same_predictions(predictions1, regression_gbm2.predict(test), "regression")

    # Binomial
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = h2o.gbm(y=train[1], x=train[2:9], distribution="bernoulli")
    predictions1 = bernoulli_gbm1.predict(test)

    bernoulli_gbm2 = h2o.get_model(bernoulli_gbm1._key)
    assert category_of(bernoulli_gbm2) == "Binomial"
    assert_same_predictions(predictions1, bernoulli_gbm2.predict(test), "binomial")

    # Clustering
    benign_h2o = h2o.import_frame(path=h2o.locate("smalldata/logreg/benign.csv"))
    km_h2o = h2o.kmeans(x=benign_h2o, k=3)
    benign_km = h2o.get_model(km_h2o._key)
    assert category_of(benign_km) == "Clustering"

    # Multinomial
    train[4] = train[4].asfactor()
    multinomial_dl1 = h2o.deeplearning(x=train[0:2], y=train[4], loss='CrossEntropy')
    predictions1 = multinomial_dl1.predict(test)

    multinomial_dl2 = h2o.get_model(multinomial_dl1._key)
    assert category_of(multinomial_dl2) == "Multinomial"
    assert_same_predictions(predictions1, multinomial_dl2.predict(test), "multinomial")
コード例 #6
0
    def get_xval_models(self, key=None):
        """
        Fetch cross-validation model(s) through ``h2o.get_model``.

        :param key: Key of a single model to fetch. When None, every key in
            ``self._xval_keys`` is fetched instead.
        :return: One model when ``key`` is given, otherwise a list of models.
        """
        if key is None:
            return [h2o.get_model(xval_key) for xval_key in self._xval_keys]
        return h2o.get_model(key)
コード例 #7
0
 def test_param_disabled():
     """Train via ``setup_and_train(False)`` and check that no CV predictions remain.

     No ``cv_predictions`` keys may be left in memory, non-ensemble models are
     checked with ``assert_cv_predictions_on_model(..., False)``, and the
     metalearner of every ensemble must report no cross-validation predictions.
     """
     print("\n=== disabling "+kcvp+" ===")
     automl = setup_and_train(False)
     _, base_names, ensemble_names = get_partitioned_model_names(automl.leaderboard)
     memory_keys = list_keys_in_memory()
     leftover = len(memory_keys['cv_predictions'])
     assert leftover == 0, "{preds} CV predictions were not cleaned from memory".format(preds=leftover)
     for name in base_names:
         assert_cv_predictions_on_model(name, False)
     for name in ensemble_names:
         ensemble = h2o.get_model(name)
         metalearner = h2o.get_model(ensemble.metalearner()['name'])
         assert not metalearner.cross_validation_predictions()
コード例 #8
0
    def get_grid(self, sort_by=None, decreasing=None):
        """
        Retrieve an H2OGridSearch instance, optionally sorted by a metric.

        If neither cross-validation nor a validation frame is used in the grid search, the training
        metrics are displayed in the "get grid" output. If a validation frame is passed to the grid and
        ``nfolds = 0``, the validation metrics are displayed. If ``nfolds`` > 1, cross-validation metrics
        are displayed even when a validation frame is provided.

        Parameters
        ----------
        sort_by : str, optional
          Metric used to order the models in the grid space, e.g. "logloss", "residual_deviance", "mse",
          "auc", "r2", "accuracy", "precision", "recall", "f1", etc.
        decreasing : bool, optional
          Sort in decreasing order of the metric if true, otherwise in increasing order (default).

        Returns
        -------
          ``self`` when both arguments are None; otherwise a new H2OGridSearch instance whose models follow
          the order returned by the backend.

        """
        # Nothing to sort on: hand back the current grid untouched.
        if sort_by is None and decreasing is None:
            return self

        query = {"sort_by": sort_by, "decreasing": decreasing}
        grid_json = h2o.api("GET /99/Grids/%s" % self._id, data=query)

        sorted_grid = H2OGridSearch(self.model, self.hyper_params, self._id)
        # Model ids come back from the backend already reordered.
        sorted_grid.models = [h2o.get_model(entry['name']) for entry in grid_json['model_ids']]

        first_model_name = grid_json['model_ids'][0]['name']
        first_model_json = h2o.api("GET /99/Models/%s" % first_model_name)['models'][0]
        metrics_cls = H2OGridSearch._metrics_class(first_model_json)

        # Populate a throwaway instance of the metrics class, then copy its state onto the grid.
        carrier = metrics_cls()
        carrier._id = self._id
        carrier._grid_json = grid_json
        carrier._parms = sorted_grid._parms
        H2OEstimator.mixin(sorted_grid, metrics_cls)
        sorted_grid.__dict__.update(carrier.__dict__.copy())
        return sorted_grid
コード例 #9
0
    def predict(self, test_data):
        """
        Predict on a dataset using the leader model.

        When ``self._fetch()`` is falsy, a notice is printed and None is returned.

        :param H2OFrame test_data: Data on which to make predictions.

        :returns: A new H2OFrame of predictions.

        :examples:
        >>> # Set up an H2OAutoML object
        >>> build_control = {
        >>>              'stopping_criteria': {
        >>>              'stopping_rounds': 3,
        >>>              'stopping_tolerance': 0.001
        >>>            }
        >>>        }
        >>> aml = H2OAutoML(max_runtime_secs=30, build_control=build_control)
        >>> # Launch H2OAutoML
        >>> aml.train(y=y, training_frame=training_frame)
        >>> # Predict with the #1 model from the H2OAutoML leaderboard
        >>> aml.predict(test_data)

        """
        if not self._fetch():
            print("No model built yet...")
            return None
        # Cache the leader model on the instance before delegating to it.
        self._model = h2o.get_model(self._leader_id)
        return self._model.predict(test_data)
コード例 #10
0
def test_workaround_for_distribution():
    """Run AutoML with a poisson ``distribution``/``family`` and print what each model used.

    The ``sys.ai.h2o.automl.algo_parameters.all.enabled`` property is switched
    on for the duration of the run and always switched back off afterwards.
    Failures propagate to the caller instead of being swallowed.
    """
    property_name = "sys.ai.h2o.automl.algo_parameters.all.enabled"

    def set_property(value):
        h2o.rapids("(setproperty \"{}\" \"{}\")".format(property_name, value))

    try:
        set_property("true")
        ds = import_dataset('regression')
        aml = H2OAutoML(project_name="py_test",
                        algo_parameters=dict(
                            distribution='poisson',
                            family='poisson',
                        ),
                        exclude_algos=['StackedEnsemble'],
                        max_runtime_secs=60,
                        seed=1)
        aml.train(y=ds.target, training_frame=ds.train)
        model_names = [
            aml.leaderboard[i, 0] for i in range(0, (aml.leaderboard.nrows))
        ]
        for mn in model_names:
            params = h2o.get_model(mn).params
            # Prefer 'distribution'; fall back to 'family' when it is absent.
            if 'distribution' in params:
                dist = params['distribution']
            elif 'family' in params:
                dist = params['family']
            else:
                dist = None
            print("{}: distribution = {}".format(mn, dist))
    finally:
        # Restore the property on success as well as on failure. The previous
        # bare "except:" reset it only on failure and hid the failure itself.
        set_property("false")
def grid_resume():
    """Save a GBM grid, wipe the cluster, reload the grid and extend it.

    After reloading, the grid must hold the same number of models as before.
    Retraining under the same ``grid_id`` with new ``ntrees`` values must
    double the model count, and every model id must resolve via ``h2o.get_model``.
    """

    def load_iris():
        return h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    iris = load_iris()

    # Run GBM Grid Search
    search_space = OrderedDict([
        ("learn_rate", [0.1, .05]),
        ("ntrees", [1, 3]),
    ])
    print("GBM grid with the following hyper_parameters:", search_space)

    export_dir = pyunit_utils.locate("results")
    baseline = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=search_space)
    baseline.train(x=list(range(4)), y=4, training_frame=iris)
    baseline_id = baseline.grid_id
    baseline_count = len(baseline.model_ids)
    print("Baseline grid has %d models" % baseline_count)
    saved_path = h2o.save_grid(export_dir, baseline_id)
    h2o.remove_all()

    # The cluster was wiped, so the data has to be imported again.
    iris = load_iris()
    reloaded = h2o.load_grid(saved_path)
    assert reloaded is not None
    assert len(reloaded.model_ids) == baseline_count

    # Modify the hyperspace - should add new models to the grid
    search_space["ntrees"] = [2, 5]
    resumed = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=search_space,
                            grid_id=reloaded.grid_id)
    resumed.train(x=list(range(4)), y=4, training_frame=iris)
    print("Newly grained grid has %d models" % len(resumed.model_ids))
    assert len(resumed.model_ids) == 2 * baseline_count

    for model_id in resumed.model_ids:
        assert h2o.get_model(model_id) is not None
コード例 #12
0
def test_algo_parameter_can_be_applied_only_to_a_specific_algo():
    """Check that a ``GBM__``-prefixed algo parameter reaches GBM models only.

    Among models whose name matches ``GBM|XGBoost``, GBM models must carry an
    ``AGE`` monotone constraint of 1.0, and the others must have no
    ``monotone_constraints`` value at all.
    """
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_specific_algo_param",
                    algo_parameters={"GBM__monotone_constraints": {"AGE": 1}},
                    max_models=6,
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)

    all_names, _, _ = get_partitioned_model_names(aml.leaderboard)
    constrainable = [name for name in all_names if re.match(r"GBM|XGBoost", name)]
    assert any(name.startswith('GBM') for name in constrainable), \
        "There should be at least one GBM model"

    for name in constrainable:
        fetched = h2o.get_model(name)
        constraints = next(entry['actual'] for param, entry in fetched.params.items()
                           if param == 'monotone_constraints')
        if not name.startswith('GBM'):
            # The parameter was scoped to GBM, so other algos must not receive it.
            assert constraints is None
            continue
        assert isinstance(constraints, list)
        age = next((item for item in constraints if item['key'] == 'AGE'), None)
        assert age is not None
        assert age['value'] == 1.0
コード例 #13
0
def get_modelKmeans(ip, port):
    """Build K-Means models for k = 2..6 and re-fetch each with ``h2o.get_model``.

    ``ip`` and ``port`` are accepted but not used by this body. For every k
    the H2O model is shown, fetched again by its ``_id`` and shown once more.
    A scikit-learn KMeans is then fitted on the mean-imputed data and its
    cluster centers are printed.
    """
    benign_h2o = h2o.import_file(
        path=h2o.locate("smalldata/logreg/benign.csv"))

    benign_sci = np.genfromtxt(h2o.locate("smalldata/logreg/benign.csv"),
                               delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        km_h2o = h2o.kmeans(x=benign_h2o, k=i)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init='k-means++', n_init=1)
        km_sci.fit(benign_sci)
        # Single-argument print(...) behaves the same on Python 2 and 3;
        # the bare print statement is a SyntaxError on Python 3.
        print("sckit centers")
        print(km_sci.cluster_centers_)
コード例 #14
0
def test_automl_creates_interpretable_SE_with_only_monotonic_models():
    """Check that AutoML with monotone constraints yields a "Monotonic" ensemble.

    The leaderboard must contain a model whose id includes "Monotonic" and a
    model whose id includes 'GLM'. Every base model of the first "Monotonic"
    entry must be a GBM or an XGBoost model.
    """
    ds = import_dataset()
    constraints = dict(AGE=1, DPROS=1, DCAPS=1, PSA=1, VOL=1, GLEASON=1)
    aml_mono = H2OAutoML(project_name="test_automl_creates_interpretable_se",
                         max_models=5,
                         include_algos=["GBM", "GLM", "XGBoost", "StackedEnsemble"],
                         monotone_constraints=constraints,
                         seed=1234)
    aml_mono.train(y=ds.target, training_frame=ds.train)

    model_ids = aml_mono.leaderboard.as_data_frame()["model_id"]

    is_monotonic = model_ids.apply(lambda model_name: "Monotonic" in model_name)
    assert is_monotonic.any()

    monotonic_ids = model_ids[is_monotonic]
    se_mono = h2o.get_model(monotonic_ids.iloc[0])

    has_glm = model_ids.apply(lambda model_name: 'GLM' in model_name)
    assert has_glm.any()

    tree_based = ['GBM' in bm or 'XGBoost' in bm for bm in se_mono.base_models]
    assert all(tree_based)
コード例 #15
0
 def test_param_disabled():
     """Train via ``setup_and_train(False)`` and check that no CV models remain.

     Some models must still be in memory while no ``cv_models`` keys are left.
     Neither the non-ensemble models nor the ensembles' metalearners may
     report cross-validation models.
     """
     print("\n=== disabling "+kcvm+" ===")
     automl = setup_and_train(False)
     _all_names, base_names, ensemble_names = get_partitioned_model_names(automl.leaderboard)
     check_model_property(ensemble_names, kcvm, False)
     check_model_property(base_names, kcvm, True, False, True)

     memory_keys = list_keys_in_memory()
     total_count = len(memory_keys['models'])
     cv_count = len(memory_keys['cv_models'])
     print("total models in memory = {tot}, among which {cv} CV models".format(tot=total_count, cv=cv_count))
     assert total_count > 0, "no models left in memory"
     assert cv_count == 0, "{cv} CV models were not cleaned from memory".format(cv=cv_count)

     for name in base_names:
         base_model = h2o.get_model(name)
         assert not base_model.cross_validation_models(), "unexpected cv models for model "+name
     for name in ensemble_names:
         ensemble = h2o.get_model(name)
         metalearner = h2o.get_model(ensemble.metalearner()['name'])
         assert not metalearner.cross_validation_models(), "unexpected cv models for metalearner of model "+name
コード例 #16
0
def test_monotone_constraints_can_be_passed_as_algo_parameter():
    """Check that ``monotone_constraints`` given as an algo parameter reach the models.

    Every model whose name matches ``GBM|XGBoost`` must carry exactly two
    constraints: AGE = 1.0 and VOL = -1.0. The leaderboard must also contain
    at least one model outside that group.
    """
    ds = import_dataset()
    # Constraints chosen only for the sake of testing
    constraints = {"AGE": 1, "VOL": -1}
    aml = H2OAutoML(project_name="py_monotone_constraints",
                    algo_parameters={"monotone_constraints": constraints},
                    max_models=6,
                    seed=1)
    aml.train(y=ds.target, training_frame=ds.train)

    all_names, _, _ = get_partitioned_model_names(aml.leaderboard)
    constrainable = [name for name in all_names if re.match(r"GBM|XGBoost", name)]
    assert len(constrainable) < len(all_names), \
        "models not supporting the constraint should not have been skipped"

    def constraint_for(entries, column):
        return next((entry for entry in entries if entry['key'] == column), None)

    for name in constrainable:
        fetched = h2o.get_model(name)
        actual = next(entry['actual'] for param, entry in fetched.params.items()
                      if param == 'monotone_constraints')
        assert isinstance(actual, list)
        assert len(actual) == 2

        age = constraint_for(actual, 'AGE')
        assert age is not None
        assert age['value'] == 1.0

        vol = constraint_for(actual, 'VOL')
        assert vol is not None
        assert vol['value'] == -1.0
コード例 #17
0
ファイル: autoh2o.py プロジェクト: veronicaalagia/h2o-3
    def _fetch_state(aml_id, properties=None):
        """
        Query the backend for the state of an AutoML run.

        :param aml_id: Id used in the ``GET /99/AutoML/<id>`` request.
        :param properties: Optional collection naming which of 'leader', 'leaderboard' and
            'event_log' to fetch. When None, all three are fetched.
        :returns: A dict with keys ``project_name``, ``json``, ``leader_id``, ``leader``,
            ``leaderboard`` and ``event_log``. Entries that were not requested are None.
        :raises H2OValueError: If the returned state has no project name.
        """
        state_json = h2o.api("GET /99/AutoML/%s" % aml_id)
        project_name = state_json["project_name"]
        if project_name is None:
            raise H2OValueError("No AutoML instance with id {}.".format(aml_id))

        ranked_ids = [entry["name"] for entry in state_json['leaderboard']['models']]
        leader_id = ranked_ids[0] if ranked_ids else None

        def wanted(prop):
            return properties is None or prop in properties

        def fetch_frame(table_key, suffix):
            frame_key = project_name + suffix
            table = H2OAutoML._fetch_table(state_json[table_key], key=frame_key, progress_bar=False)
            # removing index and reassign id to ensure persistence on backend
            return h2o.assign(table[1:], frame_key)

        leader = None
        if wanted('leader') and leader_id is not None:
            leader = h2o.get_model(leader_id)

        leaderboard = fetch_frame('leaderboard_table', "_leaderboard") if wanted('leaderboard') else None
        event_log = fetch_frame('event_log_table', "_eventlog") if wanted('event_log') else None

        return {
            "project_name": project_name,
            "json": state_json,
            "leader_id": leader_id,
            "leader": leader,
            "leaderboard": leaderboard,
            "event_log": event_log,
        }
コード例 #18
0
 def test_param_disabled():
     """Train via ``setup_and_train(False)`` and check that no CV models remain.

     Some ``models_all`` keys must still be in memory while no ``cv_models``
     keys are left. Neither the non-ensemble models nor the ensembles'
     metalearners may report cross-validation models.
     """
     print("\n=== disabling "+kcvm+" ===")
     automl = setup_and_train(False)
     _all_names, base_names, ensemble_names = get_partitioned_model_names(automl.leaderboard)
     check_model_property(ensemble_names, kcvm, False)
     check_model_property(base_names, kcvm, True, False, True)

     memory_keys = list_keys_in_memory()
     total_count = len(memory_keys['models_all'])
     cv_count = len(memory_keys['cv_models'])
     print("total models in memory = {tot}, among which {cv} CV models".format(tot=total_count, cv=cv_count))
     assert total_count > 0, "no models left in memory"
     assert cv_count == 0, "{cv} CV models were not cleaned from memory".format(cv=cv_count)

     for name in base_names:
         base_model = h2o.get_model(name)
         assert not base_model.cross_validation_models(), "unexpected cv models for model "+name
     for name in ensemble_names:
         ensemble = h2o.get_model(name)
         metalearner = h2o.get_model(ensemble.metalearner()['name'])
         assert not metalearner.cross_validation_models(), "unexpected cv models for metalearner of model "+name
コード例 #19
0
def get_modelKmeans():
    """Train K-Means models for k = 2..6 and re-fetch each with ``h2o.get_model``.

    For every k the H2O model is shown, fetched again by its ``_id`` and shown
    once more. A scikit-learn KMeans is then fitted on the mean-imputed data
    and its cluster centers are printed.
    """
    benign_h2o = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/benign.csv"))

    benign_sci = np.genfromtxt(pyunit_utils.locate("smalldata/logreg/benign.csv"), delimiter=",")
    # Impute missing values with column mean
    imp = Imputer(missing_values="NaN", strategy="mean", axis=0)
    benign_sci = imp.fit_transform(benign_sci)

    for i in range(2, 7):
        km_h2o = H2OKMeansEstimator(k=i)
        # Pass a concrete list of column indices rather than a lazy range object.
        km_h2o.train(x=list(range(benign_h2o.ncol)), training_frame=benign_h2o)
        km_h2o.show()
        model = h2o.get_model(km_h2o._id)
        model.show()

        km_sci = KMeans(n_clusters=i, init="k-means++", n_init=1)
        km_sci.fit(benign_sci)
        # Single-argument print(...) behaves the same on Python 2 and 3;
        # the bare print statement is a SyntaxError on Python 3.
        print("sckit centers")
        print(km_sci.cluster_centers_)
コード例 #20
0
ファイル: grid_search.py プロジェクト: ryanallen82/h2o-3
  def get_grid(self, sort_by=None, decreasing=None):
    """
    Retrieve an H2OGridSearch instance, optionally sorted by a metric.

    Parameters
    ----------
    sort_by : str, optional
      Metric used to order the models in the grid space, e.g. "logloss", "residual_deviance", "mse",
      "auc", "r2", "accuracy", "precision", "recall", "f1", etc.
    decreasing : bool, optional
      Sort in decreasing order of the metric if true, otherwise in increasing order (default).

    Returns
    -------
      ``self`` when both arguments are None; otherwise a new H2OGridSearch instance whose models follow
      the order returned by the backend.

    """
    # Nothing to sort on: hand back the current grid untouched.
    if sort_by is None and decreasing is None:
      return self

    grid_json = H2OConnection.get_json("Grids/"+self._id, sort_by=sort_by, decreasing=decreasing, _rest_version=99)

    sorted_grid = H2OGridSearch(self.model, self.hyper_params, self._id)
    # Model ids come back from the backend already reordered.
    sorted_grid.models = [h2o.get_model(entry['name']) for entry in grid_json['model_ids']]

    first_model_name = grid_json['model_ids'][0]['name']
    first_model_json = H2OConnection.get_json("Models/"+first_model_name, _rest_version=99)['models'][0]
    metrics_cls = H2OGridSearch._metrics_class(first_model_json)

    # Populate a throwaway instance of the metrics class, then copy its state onto the grid.
    carrier = metrics_cls()
    carrier._id = self._id
    carrier._grid_json = grid_json
    carrier._parms = sorted_grid._parms
    H2OEstimator.mixin(sorted_grid, metrics_cls)
    sorted_grid.__dict__.update(carrier.__dict__.copy())
    return sorted_grid
コード例 #21
0
def test_base_models_are_populated():
    """Check that ``base_models`` is populated on a trained and on a retrieved ensemble.

    Both the trained Stacked Ensemble and the copy fetched with ``get_model``
    must expose the same two base models, given as strings (model ids).
    """
    iris_train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"))
    iris_test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"))
    response = "species"
    predictors = iris_train.columns
    predictors.remove(response)

    # Cross-validation settings shared by both base learners
    cv_settings = dict(nfolds=2,
                       fold_assignment="Modulo",
                       keep_cross_validation_predictions=True)

    gbm = H2OGradientBoostingEstimator(**cv_settings)
    gbm.train(x=predictors, y=response, training_frame=iris_train)
    rf = H2ORandomForestEstimator(**cv_settings)
    rf.train(x=predictors, y=response, training_frame=iris_train)

    se = H2OStackedEnsembleEstimator(training_frame=iris_train,
                                     validation_frame=iris_test,
                                     base_models=[gbm.model_id, rf.model_id])
    se.train(x=predictors, y=response, training_frame=iris_train)
    retrieved_se = get_model(se.model_id)

    for ensemble in (se, retrieved_se):
        assert len(ensemble.base_models) == 2
    assert se.base_models == retrieved_se.base_models
    # ensure that we are getting the model_ids
    for ensemble in (se, retrieved_se):
        assert pu.is_type(ensemble.base_models, [str])
コード例 #22
0
def save_model(model_id, dest_dir='.', mformat='mojo'):
    """Fetch a model by id and save it under ``dest_dir``.

    :param model_id: id handed to ``h2o.get_model``.
    :param dest_dir: path passed to the save call (default ``'.'``).
    :param mformat: ``'mojo'`` saves through ``save_mojo``; any other value
        saves through ``save_model_details``.
    """
    model = h2o.get_model(model_id)
    if mformat != 'mojo':
        model.save_model_details(path=dest_dir)
        return
    model.save_mojo(path=dest_dir)
コード例 #23
0
ファイル: grid_search.py プロジェクト: shyamkg/h2o-3
    def get_hyperparams_dict(self, id, display=True):
        """
        Return the hyper-parameter values used to train one model of this grid.

        Parameters
        ----------
        id: str or int
          Model id of the model of interest; a value accepted by ``is_int`` is used directly as the
          position of the model in the grid.
        display: boolean
          Flag to indicate whether to print the hyperparameter names.

        Returns
        -------
          A dict mapping every name in ``self.hyper_names`` to the model's actual value for that parameter.
        """
        position = id if is_int(id) else self.model_ids.index(id)
        model = self[position]

        # With cross-validation turned on, the values are read from the first fold model instead of the
        # main model (per the original note, max_runtime_secs is only found on the fold models).
        if model._is_xvalidated:
            model = h2o.get_model(model._xval_keys[0])

        model_params = {}
        for name in self.hyper_names:
            actual = model.params[name]['actual']
            # a list-valued parameter is reduced to its first element
            model_params[name] = actual[0] if isinstance(actual, list) else actual

        if display:
            names = ', '.join(list(self.hyper_params.keys()))
            print('Hyperparameters: [' + names + ']')
        return model_params
コード例 #24
0
ファイル: grid_search.py プロジェクト: madmax983/h2o-3
  def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Submit the grid-search build and, unless ``self._future`` is set, wait for it and load the models.

    Parameters
    ----------
    x : column name/index, or list/tuple of them
      Predictors; integer indices are resolved through ``tframe.names``.
    y : str, int or None
      Response column name or index.
    tframe : training frame.
    vframe : validation frame, or None.
    kwargs : dict
      Build parameters; must contain the keys "offset_column", "fold_column" and "weights_column".
      Entries are added to the caller's dict before it is rebound to a filtered copy.

    Returns
    -------
      None. With ``self._future`` set, the job is stored in ``self._job`` and the method returns right
      after submission.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None: kwargs["validation_frame"] = vframe
    if isinstance(y, int): y = tframe.names[y]
    if y is not None: kwargs['response_column'] = y
    if not isinstance(x, (list,tuple)): x=[x]
    if isinstance(x[0], int):
      x = [tframe.names[i] for i in x]
    offset = kwargs["offset_column"]
    folds  = kwargs["fold_column"]
    weights= kwargs["weights_column"]
    # list(x): a tuple of names passes the check above, but tuple + list raises TypeError
    used_columns = set(list(x) + [y,offset,folds,weights])
    ignored_columns = list(set(tframe.names) - used_columns)
    kwargs["ignored_columns"] = None if ignored_columns==[] else [h2o.h2o._quoted(col) for col in ignored_columns]
    # drop unset parameters and pass frames by their frame id
    kwargs = {k: (v.frame_id if isinstance(v, H2OFrame) else v) for k, v in kwargs.items() if v is not None}
    algo = self.model._compute_algo()  #unique to grid search
    rest_version = 99  #unique to grid search
    kwargs["_rest_version"] = rest_version

    grid = H2OJob(H2OConnection.post_json("Grid/"+algo, **kwargs), job_type=(algo+" Grid Build"))

    if self._future:
      self._job = grid
      return

    grid.poll()
    # "_rest_version" is always present at this point, so a fallback without it is never needed
    grid_json = H2OConnection.get_json("Grids/"+grid.dest_key, _rest_version=rest_version)

    self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]
    #get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
    first_model_json = H2OConnection.get_json("Models/"+grid_json['model_ids'][0]['name'], _rest_version=rest_version)['models'][0]

    self._resolve_grid(grid.dest_key, grid_json, first_model_json)
コード例 #25
0
ファイル: grid_search.py プロジェクト: shyamkg/h2o-3
    def get_grid(self, sort_by=None, decreasing=None):
        """
        Return this grid, or a freshly built, re-sorted H2OGridSearch when a sort option is given.

        Parameters
        ----------
        sort_by : str, optional
          A metric by which to sort the models in the grid space. Choices are "logloss", "residual_deviance", "mse",
          "auc", "r2", "accuracy", "precision", "recall", "f1", etc.
        decreasing : bool, optional
          Sort the models in decreasing order of metric if true, otherwise sort in increasing order (default).

        Returns
        -------
          ``self`` when both arguments are None; otherwise a new H2OGridSearch instance whose models follow
          the order returned by the backend.
        """
        if sort_by is None and decreasing is None:
            return self

        sort_args = {"sort_by": sort_by, "decreasing": decreasing}
        grid_json = h2o.api("GET /99/Grids/%s" % self._id, data=sort_args)

        sorted_grid = H2OGridSearch(self.model, self.hyper_params, self._id)
        # models are fetched in the order given by grid_json['model_ids']
        sorted_grid.models = [h2o.get_model(entry['name']) for entry in grid_json['model_ids']]

        # the first model's JSON selects the metrics class
        first_name = grid_json['model_ids'][0]['name']
        first_model_json = h2o.api("GET /99/Models/%s" % first_name)['models'][0]
        model_class = H2OGridSearch._metrics_class(first_model_json)

        # fill a throw-away instance of that class, then copy its attributes onto the new grid
        template = model_class()
        template._id = self._id
        template._grid_json = grid_json
        template._parms = sorted_grid._parms
        H2OEstimator.mixin(sorted_grid, model_class)
        sorted_grid.__dict__.update(template.__dict__.copy())
        return sorted_grid
コード例 #26
0
ファイル: grid_search.py プロジェクト: ntabgoba/h2o-3
    def get_hyperparams_dict(self, id, display=True):
        """
        Return the hyper-parameter values used to train one model of this grid.

        Parameters
        ----------
        id: str or int
          Model id of the model of interest; an int is used directly as the model's position in the grid.
        display: boolean
          Flag to indicate whether to print the hyperparameter names.

        Returns
        -------
          A dict mapping every name in ``self.hyper_names`` to the model's actual value for that parameter.
        """
        if isinstance(id, int):
            position = id
        else:
            position = self.model_ids.index(id)
        model = self[position]

        # With cross-validation turned on, the values are read from the first fold model instead of the
        # main model (per the original note, max_runtime_secs is only found on the fold models).
        if model._is_xvalidated:
            model = h2o.get_model(model._xval_keys[0])

        model_params = {}
        for name in self.hyper_names:
            actual = model.params[name]['actual']
            if isinstance(actual, list):
                # a list-valued parameter is reduced to its first element
                actual = actual[0]
            model_params[name] = actual

        if display:
            names = ', '.join(list(self.hyper_params.keys()))
            print('Hyperparameters: [' + names + ']')
        return model_params
コード例 #27
0
ファイル: grid_search.py プロジェクト: ntabgoba/h2o-3
    def get_hyperparams(self, id, display=True):
        """
        Get the hyperparameter values of a model explored by grid search.

        Parameters
        ----------
        id: str or int
          Model id of the model of interest; an int is used directly as the model's position in the grid.
        display: boolean
          Flag to indicate whether to print the hyperparameter names.

        Returns
        -------
          A list with one value per entry of ``self.hyper_params``, in iteration order.
        """
        if isinstance(id, int):
            position = id
        else:
            position = self.model_ids.index(id)
        model = self[position]

        # With cross-validation turned on, the values are read from the first fold model instead of the
        # main model (per the original note, max_runtime_secs is only found on the fold models).
        if model._is_xvalidated:
            model = h2o.get_model(model._xval_keys[0])

        res = []
        for name in self.hyper_params:
            actual = model.params[name]['actual']
            # a list-valued parameter is reduced to its first element
            res.append(actual[0] if isinstance(actual, list) else actual)

        if display:
            names = ', '.join(list(self.hyper_params.keys()))
            print('Hyperparameters: [' + names + ']')
        return res
コード例 #28
0
def test_stacked_ensembles_are_trained_with_blending_frame_even_if_nfolds_eq_0():
    """Run AutoML with ``nfolds=0`` plus a blending frame and inspect the stacked ensembles.

    More than three stacked ensembles must appear on the leaderboard, and each
    must report the blending frame and the 'blending' stacking strategy.
    """
    print(
        "Check that we can disable cross-validation when passing a blending frame and that Stacked Ensembles are trained using this frame."
    )
    ds = import_dataset()
    aml = H2OAutoML(project_name="py_aml_blending_frame", seed=1, max_models=5, nfolds=0)
    aml.train(y=ds.target,
              training_frame=ds.train,
              blending_frame=ds.valid,
              leaderboard_frame=ds.test)

    se_ids = get_partitioned_model_names(aml.leaderboard).se
    assert len(se_ids) > 3, "In blending mode, StackedEnsemble should still be trained in spite of nfolds=0."
    for se_id in se_ids:
        se_model = h2o.get_model(se_id)
        blending_frame = se_model.params['blending_frame']['actual']
        assert blending_frame['name'] == ds.valid.frame_id
        assert se_model._model_json['output']['stacking_strategy'] == 'blending'
コード例 #29
0
def test_stackedensemble_propagates_the_max_runtime_secs():
    """Check that ``max_runtime_secs`` set on a stacked ensemble reaches its metalearner.

    The base models come from a GBM grid over ``ntrees``; the ensemble is then
    built with a 5 second limit.
    """
    max_runtime_secs = 5
    hyper_parameters = {"ntrees": [1, 3, 5]}
    params = {
        "fold_assignment": "modulo",
        "nfolds": 3,
        "keep_cross_validation_predictions": True,
    }

    data = prepare_data()

    gs1 = H2OGridSearch(H2OGradientBoostingEstimator(**params), hyper_params=hyper_parameters)
    gs1.train(data.x, data.y, data.train, validation_frame=data.train)

    se = H2OStackedEnsembleEstimator(base_models=[gs1], max_runtime_secs=max_runtime_secs)
    se.train(data.x, data.y, data.train)
    metalearner = h2o.get_model(se.metalearner()["name"])

    # the metalearner received a positive limit no larger than the one requested
    metalearner_limit = metalearner.actual_params['max_runtime_secs']
    assert metalearner_limit <= max_runtime_secs
    assert metalearner_limit > 0

    # the stacked ensemble itself keeps the requested max_runtime_secs
    assert se.max_runtime_secs == max_runtime_secs
コード例 #30
0
ファイル: autoh2o.py プロジェクト: StevenLOL/h2o-3
def get_automl(project_name):
    """
    Retrieve information about an AutoML instance.

    :param str project_name:  A string indicating the project_name of the automl instance to retrieve.
    :returns: A dictionary containing the project_name, leader model, and leaderboard. ``"leader"`` is None
        when the leaderboard contains no models.
    """
    automl_json = h2o.api("GET /99/AutoML/%s" % project_name)
    project_name = automl_json["project_name"]
    leaderboard_list = [key["name"] for key in automl_json['leaderboard']['models']]

    # An empty leaderboard has no leader: do not ask the backend for a model with id None.
    leader = h2o.get_model(leaderboard_list[0]) if leaderboard_list else None

    # Intentionally mask the progress bar here since showing multiple progress bars is confusing to users.
    # Whatever happens, the user's original progress setting is restored afterwards.
    is_progress = H2OJob.__PROGRESS_BAR__
    h2o.no_progress()
    try:
        # Parse leaderboard H2OTwoDimTable & return as an H2OFrame
        leaderboard = h2o.H2OFrame(
            automl_json["leaderboard_table"].cell_values,
            column_names=automl_json["leaderboard_table"].col_header)
    finally:
        if is_progress is True:
            h2o.show_progress()

    # NOTE(review): presumably drops the table's leading index column - confirm against H2OFrame slicing.
    leaderboard = leaderboard[1:]
    return {'project_name': project_name, "leader": leader, "leaderboard": leaderboard}
def test_api_timestamp():
    """Check the timestamp and timing attributes of a freshly trained GBM.

    The model's timestamp must equal that of the first entry returned by
    ``GET /3/Models``, the start/end/run times must be positive, and the run
    time must equal end time minus start time.
    """
    prostate_train = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate_train.csv"))
    prostate_train["CAPSULE"] = prostate_train["CAPSULE"].asfactor()

    # Build H2O GBM classification model:
    gbm_args = dict(ntrees=1,
                    learn_rate=0.1,
                    max_depth=5,
                    min_rows=10,
                    distribution="bernoulli",
                    model_id="test_timestamp")
    gbm_h2o = H2OGradientBoostingEstimator(**gbm_args)
    predictors = list(range(1, prostate_train.ncol))
    gbm_h2o.train(x=predictors, y="CAPSULE", training_frame=prostate_train)

    model = h2o.get_model(model_id="test_timestamp")
    models = h2o.api("GET /3/Models")
    assert model._model_json['timestamp'] == models["models"][0]["timestamp"], "Timestamp should be the same."

    for attr in ("start_time", "end_time", "run_time"):
        assert getattr(gbm_h2o, attr) is not None and getattr(gbm_h2o, attr) > 0

    assert gbm_h2o.end_time - gbm_h2o.start_time == gbm_h2o.run_time
コード例 #32
0
def test_maxrglm_gaussian_coefs():
    """Check that maxrglm coefficients agree across the three ways of reading them.

    For every predictor-subset size, the coefficients from ``coef()`` /
    ``coef_norm()`` (indexed), from ``coef(size)`` / ``coef_norm(size)`` and
    from the underlying model fetched by id must match within 1e-6.
    """
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    my_y = "GLEASON"
    maxrglm_model = maxrglm(seed=12345, max_predictor_number=7)
    maxrglm_model.train(training_frame=d, x=my_x, y=my_y)

    all_coefs = maxrglm_model.coef()
    all_coefs_norm = maxrglm_model.coef_norm()
    tol = 1e-6
    for ind in range(len(all_coefs)):
        indexed_coef = all_coefs[ind]
        indexed_coef_norm = all_coefs_norm[ind]

        # coefficients read from the model that the best_model_ids entry points to
        best_id = maxrglm_model._model_json["output"]["best_model_ids"][ind]['name']
        one_model = h2o.get_model(best_id)
        model_coef = one_model.coef()
        model_coef_norm = one_model.coef_norm()

        # coefficients requested by predictor subset size
        subset_size = ind + 1
        sized_coef = maxrglm_model.coef(subset_size)
        sized_coef_norm = maxrglm_model.coef_norm(subset_size)

        # check coefficient dicts are equal
        pyunit_utils.assertCoefDictEqual(indexed_coef, model_coef, tol)
        pyunit_utils.assertCoefDictEqual(indexed_coef_norm, model_coef_norm, tol)
        pyunit_utils.assertCoefDictEqual(sized_coef, model_coef, tol)
        pyunit_utils.assertCoefDictEqual(sized_coef_norm, model_coef_norm, tol)
コード例 #33
0
ファイル: grid_search.py プロジェクト: madmax983/h2o-3
  def get_grid(model, hyper_params, grid_id):
    """
    Rebuild an H2OGridSearch instance for an already trained grid from its model, hyper_params and grid_id.

    Parameters
    ----------
    model : H2O Estimator model
      The type of model explored that is initialized with optional parameters which are unchanged across explored models.
    hyper_params: dict
      A dictionary of string parameters (keys) and a list of values explored by grid search (values).
    grid_id : str
      The unique id assigned to the grid object.

    Returns
    -------
      A new H2OGridSearch instance populated from the backend's data for the grid with the specified grid_id.
    """
    rest_version = 99
    grid_json = H2OConnection.get_json("Grids/"+grid_id, _rest_version=rest_version)

    rebuilt = H2OGridSearch(model, hyper_params, grid_id)
    rebuilt.models = [h2o.get_model(entry['name']) for entry in grid_json['model_ids']]

    # the first model's JSON selects the metrics class
    first_name = grid_json['model_ids'][0]['name']
    first_model_json = H2OConnection.get_json("Models/"+first_name, _rest_version=rest_version)['models'][0]
    model_class = H2OGridSearch._metrics_class(first_model_json)

    # fill a throw-away instance of that class, then copy its attributes onto the rebuilt grid
    template = model_class()
    template._id = grid_id
    template._grid_json = grid_json
    template._parms = rebuilt._parms
    H2OEstimator.mixin(rebuilt,model_class)
    rebuilt.__dict__.update(template.__dict__.copy())
    return rebuilt
コード例 #34
0
ファイル: grid_search.py プロジェクト: michalkurka/h2o-3
    def get_grid(self, sort_by=None, decreasing=None):
        """
        Retrieve an H2OGridSearch instance, optionally re-sorted.

        When neither argument is given, this grid itself is returned. Otherwise the grid is requested again
        with the sort options and a new instance is built from the reply.
        Note that if neither cross-validation nor a validation frame is used in the grid search, then the
        training metrics will display in the "get grid" output. If a validation frame is passed to the grid, and
        ``nfolds = 0``, then the validation metrics will display. However, if ``nfolds`` > 1, then cross-validation
        metrics will display even if a validation frame is provided.

        :param str sort_by: A metric by which to sort the models in the grid space. Choices are: ``"logloss"``,
            ``"residual_deviance"``, ``"mse"``, ``"auc"``, ``"r2"``, ``"accuracy"``, ``"precision"``, ``"recall"``,
            ``"f1"``, etc.
        :param bool decreasing: Sort the models in decreasing order of metric if true, otherwise sort in increasing
            order (default).

        :returns: ``self`` when both arguments are None, otherwise a new H2OGridSearch instance.
        """
        if sort_by is None and decreasing is None:
            return self

        query = {"sort_by": sort_by, "decreasing": decreasing}
        grid_json = h2o.api("GET /99/Grids/%s" % self._id, data=query)

        new_grid = H2OGridSearch(self.model, self.hyper_params, self._id)
        # models are fetched in the order given by grid_json['model_ids']
        new_grid.models = [h2o.get_model(entry['name']) for entry in grid_json['model_ids']]

        # the first model's JSON selects the metrics class
        leading_name = grid_json['model_ids'][0]['name']
        first_model_json = h2o.api("GET /99/Models/%s" % leading_name)['models'][0]
        model_class = H2OGridSearch._metrics_class(first_model_json)

        # fill a throw-away instance of that class, then copy its attributes onto the new grid
        carrier = model_class()
        carrier._id = self._id
        carrier._grid_json = grid_json
        carrier._parms = new_grid._parms
        H2OEstimator.mixin(new_grid, model_class)
        new_grid.__dict__.update(carrier.__dict__.copy())
        return new_grid
コード例 #35
0
ファイル: grid_search.py プロジェクト: shyamkg/h2o-3
    def get_hyperparams(self, id, display=True):
        """
        Get the hyperparameter values of a model explored by grid search.

        Parameters
        ----------
        id: str or int
          Model id of the model of interest; a value accepted by ``is_int`` is used directly as the
          position of the model in the grid.
        display: boolean
          Flag to indicate whether to print the hyperparameter names.

        Returns
        -------
          A list with one value per entry of ``self.hyper_params``, in iteration order.
        """
        position = id if is_int(id) else self.model_ids.index(id)
        model = self[position]

        # With cross-validation turned on, the values are read from the first fold model instead of the
        # main model (per the original note, max_runtime_secs is only found on the fold models).
        if model._is_xvalidated:
            model = h2o.get_model(model._xval_keys[0])

        res = []
        for name in self.hyper_params:
            actual = model.params[name]['actual']
            # a list-valued parameter is reduced to its first element
            res.append(actual[0] if isinstance(actual, list) else actual)

        if display:
            names = ', '.join(list(self.hyper_params.keys()))
            print('Hyperparameters: [' + names + ']')
        return res
コード例 #36
0
def test_maxrglm_gaussian():
    """Check that the best R2 values of a maxrglm model agree across sources.

    For every row of the result frame, the R2 from ``get_best_R2_values()``,
    from the result frame and from the referenced model must match within
    1e-6, and that model must predict one row per input row.
    """
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    my_y = "GLEASON"
    maxrglm_model = maxrglm(seed=12345, max_predictor_number=7)
    maxrglm_model.train(training_frame=d, x=my_x, y=my_y)

    result_frame = maxrglm_model.result()
    row_count = result_frame.nrows
    best_r2_values = maxrglm_model.get_best_R2_values()
    tol = 1e-6
    for row in range(row_count):
        # r2 from attributes
        best_r2 = best_r2_values[row]
        one_model = h2o.get_model(result_frame["model_id"][row, 0])
        pred = one_model.predict(d)
        last_cell = pred[pred.nrows - 1, pred.ncols - 1]
        print("last element of predictor frame: {0}".format(last_cell))
        assert pred.nrows == d.nrows, \
            "expected dataset row: {0}, actual dataset row: {1}".format(pred.nrows, d.nrows)
        # r2 from result frame
        frame_r2 = result_frame["best_r2_value"][row, 0]
        # r2 from model
        model_r2 = one_model.r2()
        # make sure all r2 are equal
        assert abs(best_r2 - frame_r2) < tol, \
            "expected best r2: {0}, actual best r2: {1}".format(best_r2, frame_r2)
        assert abs(frame_r2 - model_r2) < tol, \
            "expected best r2: {0}, actual best r2: {1}".format(model_r2, frame_r2)
コード例 #37
0
def benign_grid():
  """Exercise GLM grid search on the benign data set, including a bad hyper-parameter value.

  The first grid is a plain search; the second passes ``search_criteria`` and must
  end up with exactly 3 models.
  """
  training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))

  response = 3
  # every column index from 0 to 10 except the response
  predictors = list(range(3)) + list(range(4,11))

  # NOTE: this tests bad parameter value handling; 'a' is not a float:
  hyper_parameters = {'alpha': [0.01,0.5,'a'], 'lambda': [1e-5,1e-6]}
  glm_grid = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters)
  glm_grid.train(x=predictors,y=response, training_frame=training_data)
  for grid_model in glm_grid:
    assert isinstance(grid_model, H2OGeneralizedLinearEstimator)
  glm_grid.show()
  print(glm_grid.sort_by('F1', False))
  best_model_id = glm_grid.sort_by('F1', False)['Model Id'][0]
  best_model = h2o.get_model(best_model_id)
  best_model.predict(training_data)
  glm_grid.predict(training_data)
  print(glm_grid.get_hyperparams(best_model_id))
  print(glm_grid.grid_id)

  assert best_model.params['family']['actual'] == 'binomial'

  # test search_criteria plumbing
  search_criteria = { 'strategy': "RandomDiscrete", 'max_models': 3 }
  max_models_g = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
                               hyper_parameters,
                               search_criteria=search_criteria)
  max_models_g.train(x=predictors,y=response, training_frame=training_data)

  max_models_g.show()
  print(max_models_g.grid_id)
  print(max_models_g.sort_by('F1', False))

  model_count = len(max_models_g.models)
  assert model_count == 3, "expected 3 models, got: {}".format(len(max_models_g.models))
  print(max_models_g.sorted_metric_table())
  print(max_models_g.get_grid("r2"))
コード例 #38
0
def benign_grid():
  """Exercise GLM grid search on the benign data set and re-fetch the grid by its id.

  Written so that it runs on both Python 2 and Python 3: ``print`` is called with a
  single parenthesised argument and the ``range`` objects are turned into lists
  before being concatenated.
  """
  training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))

  Y = 3
  # every column index from 0 to 10 except the response; range objects cannot be added on Python 3
  X = list(range(3)) + list(range(4,11))

  # 'a' is not a numeric alpha value - presumably included to exercise bad-value handling
  hyper_parameters = {'alpha': [0.01,0.5,'a'], 'lambda': [1e-5,1e-6]}
  gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters)
  gs.train(x=X,y=Y, training_frame=training_data)
  gs.show()
  print(gs.sort_by('F1', False))
  best_model_id = gs.sort_by('F1', False)['Model Id'][0]
  best_model = h2o.get_model(best_model_id)
  best_model.predict(training_data)
  gs.predict(training_data)
  print(gs.get_hyperparams(best_model_id))
  print(gs.grid_id)

  # rebuild the grid object from its id and show it again
  new_g = H2OGridSearch.get_grid(H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters, gs.grid_id)
  new_g.show()
  print(new_g.grid_id)
  print(new_g.sort_by('F1', False))

  assert best_model.params['family']['actual'] == 'binomial'
コード例 #39
0
def covtype_get_model(ip, port):
    """Fit three binomial GLMs on covtype and re-fetch each one through ``h2o.get_model``.

    The response (column 54) is replaced by an indicator of one randomly chosen
    class between 1 and 4. Each model is shown once as returned by ``h2o.glm`` and
    once after being fetched back by its id.

    :param ip: address handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    covtype = h2o.import_frame(
        path=h2o.locate("smalldata/covtype/covtype.20k.data"))

    Y = 54
    # range objects cannot be concatenated on Python 3, so build lists first
    X = list(range(0, 20)) + list(range(29, 54))

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    covtype[Y] = (covtype[Y] == res_class)

    # (alpha, lambda) settings, fitted in this order:
    #   L2:      alpha = 0,   lambda = 0
    #   Elastic: alpha = 0.5, lambda = 1e-4
    #   L1:      alpha = 1,   lambda = 1e-4
    settings = ((0, 0), (0.5, 1e-4), (1, 1e-4))
    for alpha, lambda_ in settings:
        covtype_mod = h2o.glm(y=covtype[Y],
                              x=covtype[X],
                              family="binomial",
                              alpha=[alpha],
                              Lambda=[lambda_])
        covtype_mod.show()
        # fetch the same model back by id and show it again
        covtype_mod = h2o.get_model(covtype_mod._id)
        covtype_mod.show()
def test_gaussian_result_frame_model_id():
    """Compare modelSelection results between ``mode="maxr"`` and ``mode="allsubsets"``.

    For every row of the allsubsets result frame, the predictions of the two
    modes' models must match, and the R2 values from the attributes, the result
    frame and the model itself must agree within 1e-6.
    """
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    my_y = "GLEASON"

    maxr_model = modelSelection(seed=12345, max_predictor_number=7, mode="maxr")
    maxr_model.train(training_frame=d, x=my_x, y=my_y)
    allsubsets_model = modelSelection(seed=12345, max_predictor_number=7, mode="allsubsets")
    allsubsets_model.train(training_frame=d, x=my_x, y=my_y)

    frame_all = allsubsets_model.result()
    row_count = frame_all.nrows
    r2_all = allsubsets_model.get_best_R2_values()
    frame_maxr = maxr_model.result()
    r2_maxr = maxr_model.get_best_R2_values()

    tol = 1e-6
    for row in range(row_count):
        # r2 from attributes
        attr_r2_all = r2_all[row]
        model_all = h2o.get_model(frame_all["model_id"][row, 0])
        pred_all = model_all.predict(d)
        last_cell = pred_all[pred_all.nrows - 1, pred_all.ncols - 1]
        print("last element of predictor frame: {0}".format(last_cell))
        assert pred_all.nrows == d.nrows, \
            "expected dataset row: {0}, actual dataset row: {1}".format(pred_all.nrows, d.nrows)

        attr_r2_maxr = r2_maxr[row]
        model_maxr = h2o.get_model(frame_maxr["model_id"][row, 0])
        pred_maxr = model_maxr.predict(d)
        # compare allsubsets and maxr results
        pyunit_utils.compare_frames_local(pred_maxr, pred_all, prob=1, tol=1e-6)

        # r2 from result frame
        frame_r2_all = frame_all["best_r2_value"][row, 0]
        # r2 from model
        model_r2_all = model_all.r2()

        # make sure all r2 are equal
        assert abs(attr_r2_all - frame_r2_all) < tol, \
            "expected best r2: {0}, actual best r2: {1}".format(attr_r2_all, frame_r2_all)
        assert abs(frame_r2_all - model_r2_all) < tol, \
            "expected best r2: {0}, actual best r2: {1}".format(model_r2_all, frame_r2_all)
        assert abs(attr_r2_maxr - model_r2_all) < tol, \
            "expected best r2: {0}, maxr best r2: {1}".format(attr_r2_maxr, model_r2_all)
コード例 #41
0
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """
        Submit the grid-search build and, unless ``self._future`` is set, wait for it and load the models.

        :param x: predictor column name/index, or a list/tuple of them; integer indices are resolved through
            ``tframe.names``.
        :param y: response column name or index, or None.
        :param tframe: training frame.
        :param vframe: validation frame, or None.
        :param kwargs: dict of build parameters; must contain the keys "offset_column", "fold_column" and
            "weights_column". Entries are added to the caller's dict before it is rebound to a filtered copy.
        :raises ValueError: if the finished grid contains no model; the message carries the collected failure
            details when there are any.
        """
        kwargs['training_frame'] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        if is_type(y, int): y = tframe.names[y]
        if y is not None: kwargs['response_column'] = y
        if not is_type(x, list, tuple): x = [x]
        if is_type(x[0], int):
            x = [tframe.names[i] for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        # list(x): a tuple of names passes the check above, but tuple + list raises TypeError
        used_columns = set(list(x) + [y, offset, folds, weights])
        ignored_columns = list(set(tframe.names) - used_columns)
        kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
        # drop unset parameters and pass frames by their frame id
        kwargs = {k: (v.frame_id if isinstance(v, H2OFrame) else v) for k, v in kwargs.items() if v is not None}
        algo = self.model._compute_algo()  # unique to grid search
        if self.grid_id is not None: kwargs["grid_id"] = self.grid_id
        rest_ver = kwargs.pop("_rest_version", None)

        grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))

        if self._future:
            self._job = grid
            return

        grid.poll()

        grid_json = h2o.api("GET /99/Grids/%s" % (grid.dest_key))
        # failure details are kept so they can be raised if no grid model is returned
        failure_messages_stacks = ""
        if len(grid_json["failure_details"]) > 0:
            print("Errors/Warnings building gridsearch model\n")

            for error_index, error_message in enumerate(grid_json["failure_details"]):
                if isinstance(grid_json["failed_params"][error_index], dict):
                    for h_name in grid_json['hyper_names']:
                        print("Hyper-parameter: {0}, {1}".format(h_name,
                                                                 grid_json['failed_params'][error_index][h_name]))

                # a message is only recorded when a stack trace exists for the same index
                if len(grid_json["failure_stack_traces"]) > error_index:
                    print("failure_details: {0}\nfailure_stack_traces: "
                          "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))
                    failure_messages_stacks += error_message+'\n'

        self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

        # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
        # sometimes no model is returned due to bad parameter values provided by the user.
        if len(grid_json['model_ids']) > 0:
            first_model_json = h2o.api("GET /%d/Models/%s" %
                                       (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
            self._resolve_grid(grid.dest_key, grid_json, first_model_json)
        else:
            if len(failure_messages_stacks)>0:
                raise ValueError(failure_messages_stacks)
            else:
                raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
コード例 #42
0
ファイル: grid_search.py プロジェクト: StevenLOL/h2o-3
    def _model_build(self, x, y, tframe, vframe, kwargs):
        """
        Submit the grid-search build and, unless ``self._future`` is set, wait for it and load the models.

        :param x: predictor column name/index, or a list/tuple of them; integer indices are resolved through
            ``tframe.names``.
        :param y: response column name or index, or None.
        :param tframe: training frame.
        :param vframe: validation frame, or None.
        :param kwargs: dict of build parameters; must contain the keys "offset_column", "fold_column" and
            "weights_column". Entries are added to the caller's dict before it is rebound to a filtered copy.
        :raises ValueError: if the finished grid contains no model; the message carries the collected failure
            details when there are any.
        """
        kwargs['training_frame'] = tframe
        if vframe is not None: kwargs["validation_frame"] = vframe
        if is_type(y, int): y = tframe.names[y]
        if y is not None: kwargs['response_column'] = y
        if not is_type(x, list, tuple): x = [x]
        if is_type(x[0], int):
            x = [tframe.names[i] for i in x]
        offset = kwargs["offset_column"]
        folds = kwargs["fold_column"]
        weights = kwargs["weights_column"]
        # list(x): a tuple of names passes the check above, but tuple + list raises TypeError
        used_columns = set(list(x) + [y, offset, folds, weights])
        ignored_columns = list(set(tframe.names) - used_columns)
        kwargs["ignored_columns"] = None if not ignored_columns else [quoted(col) for col in ignored_columns]
        # drop unset parameters and pass frames by their frame id
        kwargs = {k: (v.frame_id if isinstance(v, H2OFrame) else v) for k, v in kwargs.items() if v is not None}
        algo = self.model._compute_algo()  # unique to grid search
        if self.grid_id is not None: kwargs["grid_id"] = self.grid_id
        rest_ver = kwargs.pop("_rest_version", None)

        grid = H2OJob(h2o.api("POST /99/Grid/%s" % algo, data=kwargs), job_type=(algo + " Grid Build"))

        if self._future:
            self._job = grid
            return

        grid.poll()

        grid_json = h2o.api("GET /99/Grids/%s" % (grid.dest_key))
        # failure details are kept so they can be raised if no grid model is returned
        failure_messages_stacks = ""
        if len(grid_json["failure_details"]) > 0:
            print("Errors/Warnings building gridsearch model\n")

            for error_index, error_message in enumerate(grid_json["failure_details"]):
                if isinstance(grid_json["failed_params"][error_index], dict):
                    for h_name in grid_json['hyper_names']:
                        print("Hyper-parameter: {0}, {1}".format(h_name,
                                                                 grid_json['failed_params'][error_index][h_name]))

                # a message is only recorded when a stack trace exists for the same index
                if len(grid_json["failure_stack_traces"]) > error_index:
                    print("failure_details: {0}\nfailure_stack_traces: "
                          "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))
                    failure_messages_stacks += error_message+'\n'

        self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

        # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
        # sometimes no model is returned due to bad parameter values provided by the user.
        if len(grid_json['model_ids']) > 0:
            first_model_json = h2o.api("GET /%d/Models/%s" %
                                       (rest_ver or 3, grid_json['model_ids'][0]['name']))['models'][0]
            self._resolve_grid(grid.dest_key, grid_json, first_model_json)
        else:
            if len(failure_messages_stacks)>0:
                raise ValueError(failure_messages_stacks)
            else:
                raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
コード例 #43
0
def rename_things():
    """Exercise re-assigning the ids of a frame, a frame slice and a model.

    Imports the prostate dataset, assigns new ``frame_id`` values to the frame
    and to a slice of it, trains a small GBM, assigns it a new ``model_id`` and
    fetches the model back under that id. The output of ``h2o.ls()`` is printed
    between the steps so the effect of each assignment can be inspected.
    """
    from h2o.estimators.gbm import H2OGradientBoostingEstimator

    fr = h2o.import_file(tests.locate("smalldata/logreg/prostate.csv"))
    fr.frame_id = "mooochooo"
    # Single-argument print(...) produces the same output on Python 2 and 3,
    # whereas the bare print statement is a syntax error on Python 3.
    print(h2o.ls())

    zz = fr[1:2]
    zz.show()
    zz.frame_id = "black_sheep_LLC"
    print(h2o.ls())

    m = H2OGradientBoostingEstimator(ntrees=5, max_depth=2)
    m.train(x=fr.names[2:], y=fr.names[1], training_frame=fr)
    print(m.model_id)
    m.model_id = "my_gbm_model_wwwww"
    print(h2o.ls())
    # Look the model up again through the id that was just assigned.
    print(h2o.get_model("my_gbm_model_wwwww"))
    print(h2o.ls())
コード例 #44
0
def get_model_by_algo(algo, models_dict):
    """Fetch the model whose key begins with the given algorithm tag.

    Only the first three characters of each key in ``models_dict`` are compared
    with ``algo``. Every matching key is fetched with ``h2o.get_model``; when
    several keys match, the last one visited is the one returned.

    :param algo: tag compared against the first three characters of each key.
    :param models_dict: mapping whose keys are model ids.
    :return: tuple ``(model, model_id)``; ``(None, None)`` when no key matches.
    """
    found_model, found_id = None, None
    for key in tuple(models_dict.keys()):
        if key[:3] != algo:
            continue
        found_id = key
        found_model = h2o.get_model(key)
    return found_model, found_id
コード例 #45
0
def stackedensemble(mod):
    coef_norm=None
    try:
      metalearner = h2o.get_model(mod.metalearner()['name'])
      coef_norm=metalearner.coef_norm()
    except:
      pass        
    return coef_norm
コード例 #46
0
def test_modelselection_gaussian_model_id():
    """Check model ids reported by modelSelection in allsubsets/maxr/maxrsweep modes.

    Trains the three modes on the prostate data with GLEASON as response, then
    verifies that:
      * columns 2:4 of the maxr and maxrsweep result frames agree (tol 1e-6);
      * for every row of the allsubsets result frame, the model named in the
        frame's "model_id" column and the one listed in "best_model_ids"
        produce the same predictions;
      * the maxr model of the same row predicts the same as the allsubsets
        model (tol 1e-6).
    """
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    response = "GLEASON"
    predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    def predict_with(model_key):
        # Fetch a model by key and score the full training frame with it.
        return h2o.get_model(model_key).predict(prostate)

    allsubsets_model = modelSelection(seed=12345, max_predictor_number=7, mode="allsubsets")
    allsubsets_model.train(training_frame=prostate, x=predictors, y=response)
    allsubsets_frame = allsubsets_model.result()
    row_count = allsubsets_frame.nrows
    allsubsets_ids = allsubsets_model._model_json["output"]["best_model_ids"]

    maxr_model = modelSelection(seed=12345, max_predictor_number=7, mode="maxr")
    maxr_model.train(training_frame=prostate, x=predictors, y=response)
    maxr_frame = maxr_model.result()

    maxrsweep_model = modelSelection(seed=12345, max_predictor_number=7, mode="maxrsweep")
    maxrsweep_model.train(training_frame=prostate, x=predictors, y=response)

    # maxr and maxrsweep must report the same results
    pyunit_utils.compare_frames_local(
        maxr_model.result()[2:4], maxrsweep_model.result()[2:4], prob=1.0, tol=1e-6)

    for row in range(row_count):
        preds_from_frame = predict_with(allsubsets_frame["model_id"][row, 0])
        preds_from_ids = predict_with(allsubsets_ids[row]['name'])
        pyunit_utils.compare_frames_local(preds_from_frame, preds_from_ids, prob=1)

        preds_from_maxr = predict_with(maxr_frame["model_id"][row, 0])
        pyunit_utils.compare_frames_local(preds_from_frame, preds_from_maxr, prob=1, tol=1e-6)
コード例 #47
0
def h2o_print_leaderboard(lb_frame, top_n=999999):
    """Print leaderboard rows together with each model's non-default parameters.

    For each of the first ``top_n`` rows (or all rows, if fewer) the row is
    printed without its index, the model named in its ``model_id`` column is
    fetched, and the value of ``h2o_not_default_params_str`` for that model is
    pretty-printed, followed by an empty line.

    :param lb_frame: object whose ``as_data_frame()`` yields the leaderboard table.
    :param top_n: maximum number of rows to print.
    """
    board = lb_frame.as_data_frame()
    rows_to_show = min(top_n, board.shape[0])
    for row in range(rows_to_show):
        current_id = board['model_id'][row]
        row_text = board[row:row + 1].to_string(index=False)
        print(row_text)
        fetched = h2o.get_model(current_id)
        pprint(h2o_not_default_params_str(fetched))
        print()
コード例 #48
0
def test_nfolds_eq_0():
    """Train AutoML with ``nfolds=0`` and check a resulting model reports nfolds 0.

    The model inspected is the first name in the second group returned by
    ``get_partitioned_model_names`` (presumably the non-stacked-ensemble
    models -- confirm against that helper).
    """
    print("Check nfolds = 0 works properly")
    dataset = import_dataset()
    automl = H2OAutoML(project_name="py_aml_nfolds0", nfolds=0, max_models=3, seed=1)
    automl.train(y=dataset['target'], training_frame=dataset['train'])
    _all_names, base_names, _ensemble_names = get_partitioned_model_names(automl.leaderboard)
    first_base_model = h2o.get_model(base_names[0])
    assert first_base_model.params['nfolds']['actual'] == 0
コード例 #49
0
def check_ignore_cols_automl(models, names, x, y):
    """Assert that every non-ensemble model ignores exactly the unused columns.

    :param models: frame-like object; all cells of ``models.as_data_frame()``
        are treated as model ids.
    :param names: all column names of the data.
    :param x: predictor column names.
    :param y: response column name.
    :raises AssertionError: if a model's actual ``ignored_columns`` differ from
        ``names`` minus ``y`` minus ``x``. Ids containing "StackedEnsemble"
        are not checked.
    """
    model_ids = [cell for row in models.as_data_frame().values.tolist() for cell in row]
    for model_id in model_ids:
        if "StackedEnsemble" not in model_id:
            assert set(h2o.get_model(model_id).params["ignored_columns"]["actual"]) == set(names) - {y} - set(x), \
                "ignored columns are not honored for model " + model_id
コード例 #50
0
 def test_param_enabled():
     """Train with the ``kcvm`` option enabled and verify CV models are retained.

     Checks, in order: the ``kcvm`` parameter is absent from the ensemble
     models and present on the other models with actual value True and default
     False (argument meaning per ``check_model_property`` -- confirm); at least
     one model is in memory; the number of CV models in memory equals the
     number of leaderboard models times ``nfolds``; every non-ensemble model
     and every ensemble's metalearner reports cross-validation models.
     """
     print("\n=== enabling "+kcvm+" ===")
     aml = setup_and_train(True)
     all_names, base_names, ensemble_names = get_partitioned_model_names(aml.leaderboard)
     check_model_property(ensemble_names, kcvm, False)
     check_model_property(base_names, kcvm, True, True, False)

     in_memory = list_keys_in_memory()
     model_count = len(in_memory['models'])
     cv_count = len(in_memory['cv_models'])
     print("total models in memory = {tot}, among which {cv} CV models".format(tot=model_count, cv=cv_count))
     assert model_count > 0, "no models left in memory"
     cv_expected = len(all_names) * nfolds
     assert cv_count == cv_expected, "missing CV models in memory, got {actual}, expected {expected}".format(actual=cv_count, expected=cv_expected)

     for name in base_names:
         assert h2o.get_model(name).cross_validation_models(), "missing cv models for model "+name
     for name in ensemble_names:
         ensemble = h2o.get_model(name)
         metalearner = h2o.get_model(ensemble.metalearner()['name'])
         assert metalearner.cross_validation_models(), "missing cv models for metalearner of model "+name
コード例 #51
0
 def test_param_enabled():
     """Train with the ``kcvm`` option enabled and verify CV models are retained.

     Checks, in order: the ``kcvm`` parameter is absent from the ensemble
     models and present on the other models with actual value True and default
     True (argument meaning per ``check_model_property`` -- confirm); at least
     one model is in memory; the number of CV models in memory equals the
     number of leaderboard models times ``nfolds``; every non-ensemble model
     and every ensemble's metalearner reports cross-validation models.
     """
     print("\n=== enabling "+kcvm+" ===")
     aml = setup_and_train(True)
     all_names, base_names, ensemble_names = get_partitioned_model_names(aml.leaderboard)
     check_model_property(ensemble_names, kcvm, False)
     check_model_property(base_names, kcvm, True, True, True)

     in_memory = list_keys_in_memory()
     model_count = len(in_memory['models'])
     cv_count = len(in_memory['cv_models'])
     print("total models in memory = {tot}, among which {cv} CV models".format(tot=model_count, cv=cv_count))
     assert model_count > 0, "no models left in memory"
     cv_expected = len(all_names) * nfolds
     assert cv_count == cv_expected, "missing CV models in memory, got {actual}, expected {expected}".format(actual=cv_count, expected=cv_expected)

     for name in base_names:
         assert h2o.get_model(name).cross_validation_models(), "missing cv models for model "+name
     for name in ensemble_names:
         ensemble = h2o.get_model(name)
         metalearner = h2o.get_model(ensemble.metalearner()['name'])
         assert metalearner.cross_validation_models(), "missing cv models for metalearner of model "+name
コード例 #52
0
ファイル: pyunit_explain.py プロジェクト: wwjiang007/h2o-3
def test_explanation_list_of_models_binomial_classification():
    """Run the multi-model explanation helpers on a binomial problem.

    Trains AutoML (5 models) plus one explicitly named GBM on the prostate data
    with CAPSULE as a factor response, then checks that the variable-importance
    heatmap, model-correlation heatmap, partial-dependence multi-plots and
    learning-curve plots return matplotlib figures and that ``h2o.explain`` /
    ``h2o.explain_row`` return ``H2OExplanation`` objects.
    """
    train = h2o.upload_file(pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "CAPSULE"
    train[y] = train[y].asfactor()

    # Keep at most one column per type. A column is skipped when its type is
    # already represented or it is the response; the very first column is
    # always kept because there is nothing yet to compare it against.
    cols_to_test = []
    for col, typ in train.types.items():
        already_covered = any(typ == train.types[seen] or col == y for seen in cols_to_test)
        if not already_covered:
            cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    leaderboard_rows = aml.leaderboard["model_id"].as_data_frame(use_pandas=False, header=False)
    models = [h2o.get_model(row[0]) for row in leaderboard_rows]

    # A model with a user-chosen id is included as well
    gbm = H2OGradientBoostingEstimator(model_id="my_awesome_model")
    gbm.train(y=y, training_frame=train)
    models.append(gbm)

    pyplot = matplotlib.pyplot

    # variable importance heatmap
    varimp_figure = h2o.varimp_heatmap(models).figure()
    assert isinstance(varimp_figure, pyplot.Figure)
    pyplot.close()

    # model correlation heatmap
    correlation_figure = h2o.model_correlation_heatmap(models, train).figure()
    assert isinstance(correlation_figure, pyplot.Figure)
    pyplot.close()

    # partial dependence, one plot per selected column
    for col in cols_to_test:
        pd_figure = h2o.pd_multi_plot(models, train, col).figure()
        assert isinstance(pd_figure, pyplot.Figure)
        pyplot.close()

    # learning curves; figures are closed together once all models are done
    for model in models:
        assert isinstance(model.learning_curve_plot().figure(), pyplot.Figure)
    pyplot.close("all")

    # explain
    explanation = h2o.explain(models, train, render=False)
    assert isinstance(explanation, H2OExplanation)

    # explain a single row
    row_explanation = h2o.explain_row(models, train, 1, render=False)
    assert isinstance(row_explanation, H2OExplanation)
コード例 #53
0
def iris_get_model():
    """Train a 50-tree random forest on iris and fetch it again by its id.

    Both the freshly trained model and the one returned by ``h2o.get_model``
    are displayed with ``show()``.
    """
    iris_path = tests.locate("smalldata/iris/iris.csv")
    iris = h2o.import_file(path=iris_path)

    trained = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50)
    trained.show()

    # Round-trip through the model's id
    fetched = h2o.get_model(trained._id)
    fetched.show()
コード例 #54
0
def iris_get_model():
    """Train a 50-tree random forest estimator on iris and fetch it again by id.

    Column 4 is the response and columns 0-3 are the predictors. Both the
    trained estimator and the model returned by ``h2o.get_model`` are displayed
    with ``show()``.
    """
    iris_path = pyunit_utils.locate("smalldata/iris/iris.csv")
    iris = h2o.import_file(path=iris_path)

    forest = H2ORandomForestEstimator(ntrees=50)
    forest.train(y=4, x=list(range(4)), training_frame=iris)
    forest.show()

    # Round-trip through the model's id
    fetched = h2o.get_model(forest._id)
    fetched.show()
コード例 #55
0
ファイル: model_base.py プロジェクト: BalamuraliN/h2o-3
 def cross_validation_models(self):
   """
   Fetch the cross-validation models listed in this model's output.

   :return: list of the models returned by ``h2o.get_model`` for each listed
     entry, or None when the output's "cross_validation_models" is None
   """
   cv_entries = self._model_json["output"]["cross_validation_models"]
   if cv_entries is None:
     return None
   return [h2o.get_model(entry["name"]) for entry in cv_entries]
コード例 #56
0
def iris_get_model(ip, port):
    """Connect to H2O, train a 50-tree random forest on iris and re-fetch it.

    :param ip: address passed to ``h2o.init``.
    :param port: port passed to ``h2o.init``.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris_path = h2o.locate("smalldata/iris/iris.csv")
    iris = h2o.import_frame(path=iris_path)

    trained = h2o.random_forest(y=iris[4], x=iris[0:4], ntrees=50)
    trained.show()

    # Round-trip through the model's id
    fetched = h2o.get_model(trained._id)
    fetched.show()
コード例 #57
0
ファイル: grid_search.py プロジェクト: Avighan/h2o-3
  def _model_build(self, x, y, tframe, vframe, kwargs):
    """
    Submit a grid build for this grid's model and collect the resulting models.

    :param x: predictor column name(s) or index(es): a single value, a list or
      a tuple. Indexes are resolved through ``tframe.names`` when the first
      element is an int.
    :param y: response column name or index, or None.
    :param tframe: training frame.
    :param vframe: validation frame, or None.
    :param kwargs: build parameters. Must contain the keys "offset_column",
      "fold_column" and "weights_column" (values may be None). The dict passed
      in is updated in place with the frame, response and ignored-column
      entries before a filtered copy is sent to the server.
    :return: None. When ``self._future`` is truthy the job is stored in
      ``self._job`` and the method returns without waiting; otherwise
      ``self.models`` is filled and ``self._resolve_grid`` is called.
    :raises ValueError: if the grid produced no model. The message carries the
      failure details reported by the server when there are any.
    """
    kwargs['training_frame'] = tframe
    if vframe is not None: kwargs["validation_frame"] = vframe
    if isinstance(y, int): y = tframe.names[y]
    if y is not None: kwargs['response_column'] = y
    if not isinstance(x, (list, tuple)): x = [x]
    if x and isinstance(x[0], int):
      x = [tframe.names[i] for i in x]
    else:
      # Copy into a list: a tuple of names cannot be concatenated with the
      # list of special columns below.
      x = list(x)
    special_columns = [y, kwargs["offset_column"], kwargs["fold_column"], kwargs["weights_column"]]
    ignored_columns = list(set(tframe.names) - set(x + special_columns))
    kwargs["ignored_columns"] = None if ignored_columns == [] else [h2o.h2o._quoted(col) for col in ignored_columns]
    # Drop unset parameters and send frames by their id.
    kwargs = {k: (v.frame_id if isinstance(v, H2OFrame) else v) for k, v in kwargs.items() if v is not None}
    algo = self.model._compute_algo()  # unique to grid search
    rest_version = 99  # unique to grid search
    kwargs["_rest_version"] = rest_version
    if self.grid_id is not None: kwargs["grid_id"] = self.grid_id

    grid = H2OJob(H2OConnection.post_json("Grid/"+algo, **kwargs), job_type=(algo+" Grid Build"))

    if self._future:
      self._job = grid
      return

    grid.poll()
    # "_rest_version" is always set above, so the grid is always fetched with it.
    grid_json = H2OConnection.get_json("Grids/"+grid.dest_key, _rest_version=rest_version)

    # Failure details are kept so they can be raised if no model was built.
    failure_messages = []
    if len(grid_json["failure_details"]) > 0:
      print("Errors/Warnings building gridsearch model\n")

      for error_index, error_message in enumerate(grid_json["failure_details"]):
        failed_params = grid_json["failed_params"][error_index]
        if isinstance(failed_params, dict):
          for h_name in grid_json['hyper_names']:
            print("Hyper-parameter: {0}, {1}".format(h_name, failed_params[h_name]))

        if len(grid_json["failure_stack_traces"]) > error_index:
          print("failure_details: {0}\nfailure_stack_traces: "
                "{1}\n".format(error_message, grid_json['failure_stack_traces'][error_index]))
        failure_messages.append("{0}".format(error_message))

    self.models = [h2o.get_model(key['name']) for key in grid_json['model_ids']]

    # get first model returned in list of models from grid search to get model class (binomial, multinomial, etc)
    # sometimes no model is returned due to bad parameter values provided by the user.
    if len(grid_json['model_ids']) > 0:
      first_model_json = H2OConnection.get_json("Models/"+grid_json['model_ids'][0]['name'],
                                                _rest_version=rest_version)['models'][0]
      self._resolve_grid(grid.dest_key, grid_json, first_model_json)
    elif failure_messages:
      raise ValueError("\n".join(failure_messages))
    else:
      raise ValueError("Gridsearch returns no model due to bad parameter values or other reasons....")
コード例 #58
0
def get_modelGBM(ip, port):
    """Train a bernoulli GBM on prostate, predict with it and re-fetch it by id.

    :param ip: not used by this function.
    :param port: not used by this function.
    """
    prostate_path = h2o.locate("smalldata/logreg/prostate.csv")
    prostate = h2o.import_file(path=prostate_path)
    prostate.describe()
    # Column 1 is used as the response and is converted to a factor first
    prostate[1] = prostate[1].asfactor()

    prostate_gbm = h2o.gbm(y=prostate[1], x=prostate[2:9], distribution="bernoulli")
    prostate_gbm.show()
    prostate_gbm.predict(prostate)

    # Round-trip through the model's id
    fetched = h2o.get_model(prostate_gbm._id)
    fetched.show()
コード例 #59
0
def get_model_gbm():
    """Train a bernoulli GBM estimator on prostate and re-fetch it by model id.

    Column 1 (converted to a factor) is the response and columns 2-8 are the
    predictors. The trained estimator is shown, used for prediction on the
    training frame, then fetched via ``h2o.get_model`` and shown again.
    """
    prostate_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
    prostate = h2o.import_file(path=prostate_path)
    prostate.describe()
    prostate[1] = prostate[1].asfactor()

    estimator = H2OGradientBoostingEstimator(distribution="bernoulli")
    estimator.train(x=range(2,9), y=1, training_frame=prostate)
    estimator.show()
    estimator.predict(prostate)

    # Round-trip through the model's id
    fetched = h2o.get_model(estimator.model_id)
    fetched.show()
コード例 #60
0
def check_model_property(model_names, prop_name, present=True, actual_value=None, default_value=None):
    """Assert presence (or absence) and values of a parameter on several models.

    :param model_names: ids of the models to fetch and inspect.
    :param prop_name: name of the parameter looked up in each model's ``params``.
    :param present: when True the parameter must exist; when False it must not.
    :param actual_value: expected ``'actual'`` value; not checked when None.
    :param default_value: expected ``'default'`` value; not checked when None.
    :raises AssertionError: on the first model violating an expectation.
    """
    for name in model_names:
        model = h2o.get_model(name)
        if not present:
            assert prop_name not in model.params, "unexpected {prop} in model {model}".format(prop=prop_name, model=name)
            continue

        assert prop_name in model.params, \
            "missing {prop} in model {model}".format(prop=prop_name, model=name)
        # Value checks apply only when an expected value was supplied
        assert actual_value is None or model.params[prop_name]['actual'] == actual_value, \
            "actual value for {prop} in model {model} is {val}, expected {exp}".format(prop=prop_name, model=name, val=model.params[prop_name]['actual'], exp=actual_value)
        assert default_value is None or model.params[prop_name]['default'] == default_value, \
            "default value for {prop} in model {model} is {val}, expected {exp}".format(prop=prop_name, model=name, val=model.params[prop_name]['default'], exp=default_value)