Python H2OGeneralizedLinearEstimator Examples, h2o.estimators.glm.H2OGeneralizedLinearEstimator Python Examples

Example #1

0

Show file

File: pyunit_glm_regularization_path.py Project: Jason0812/h2o-3

def reg_path_glm():
    """Verify that a GLM regularization path matches models trained one lambda at a time.

    Trains a binomial lambda-search GLM on the prostate data, extracts its
    regularization path and checks that:

    * a model rebuilt from the coefficients of submodel 10 reproduces the
      explained deviance stored in the path for that submodel, and
    * for every lambda on the path, a model trained with only that lambda has
      nearly the same raw and standardized coefficients and the same
      explained deviance as the path entry.
    """
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    search_model = glm(family='binomial', lambda_search=True, solver='COORDINATE_DESCENT')
    search_model.train(training_frame=prostate, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    path = glm.getGLMRegularizationPath(search_model)

    # Rebuild submodel 10 from its path coefficients and compare deviances.
    rebuilt = glm.makeGLMModel(model=search_model, coefs=path['coefficients'][10])
    recorded_dev = path['explained_deviance_train'][10]
    perf = rebuilt.model_performance(prostate)
    rebuilt_dev = 1 - perf.residual_deviance() / perf.null_deviance()
    assert abs(recorded_dev - rebuilt_dev) < 1e-6

    for idx, lambda_value in enumerate(path['lambdas']):
        single = glm(family='binomial', lambda_search=False, Lambda=lambda_value,
                     solver='COORDINATE_DESCENT')
        single.train(training_frame=prostate, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        path_coefs = path['coefficients'][idx]
        path_coefs_std = path['coefficients_std'][idx]

        # Largest absolute coefficient gap; the leading 0 is the starting value.
        raw_gap = max([0] + [abs((path_coefs[name] - single.coef()[name]))
                             for name in path_coefs.keys()])
        std_gap = max([0] + [abs((path_coefs_std[name] - single.coef_norm()[name]))
                             for name in path_coefs.keys()])
        print(raw_gap)
        print(std_gap)
        assert raw_gap < 1e-2
        assert std_gap < 1e-2

        perf = single.model_performance(prostate)
        single_dev = 1 - perf.residual_deviance() / perf.null_deviance()
        path_dev = path['explained_deviance_train'][idx]
        print(single_dev)
        print(path_dev)
        assert abs(single_dev - path_dev) < 1e-4

Example #2

0

Show file

File: pyunit_glm_regularization_path.py Project: yifanxie/h2o-3

def reg_path_glm():
    """Cross-check a lambda-search regularization path against per-lambda models.

    The prostate data is used to fit a binomial GLM with lambda search.  The
    test then (1) rebuilds submodel 10 from its coefficients and compares its
    explained deviance with the path, and (2) refits one model per lambda and
    compares coefficients, standardized coefficients and explained deviance
    with the corresponding path entry.
    """
    data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    full_model = glm(family='binomial', lambda_search=True,
                     solver='COORDINATE_DESCENT')
    full_model.train(training_frame=data, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    reg_path = glm.getGLMRegularizationPath(full_model)

    from_coefs = glm.makeGLMModel(model=full_model,
                                  coefs=reg_path['coefficients'][10])
    expected_dev = reg_path['explained_deviance_train'][10]
    metrics = from_coefs.model_performance(data)
    actual_dev = 1 - metrics.residual_deviance() / metrics.null_deviance()
    assert abs(expected_dev - actual_dev) < 1e-6

    for pos, lam in enumerate(reg_path['lambdas']):
        refit = glm(family='binomial',
                    lambda_search=False,
                    Lambda=lam,
                    solver='COORDINATE_DESCENT')
        refit.train(training_frame=data, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        expected = reg_path['coefficients'][pos]
        expected_std = reg_path['coefficients_std'][pos]

        # Track the worst absolute deviation over all coefficient names.
        worst = 0
        worst_std = 0
        for name in expected.keys():
            gap = abs((expected[name] - refit.coef()[name]))
            if gap > worst:
                worst = gap
            gap_std = abs((expected_std[name] - refit.coef_norm()[name]))
            if gap_std > worst_std:
                worst_std = gap_std
        print(worst)
        print(worst_std)
        assert worst < 1e-2
        assert worst_std < 1e-2

        metrics = refit.model_performance(data)
        refit_dev = 1 - metrics.residual_deviance() / metrics.null_deviance()
        path_dev = reg_path['explained_deviance_train'][pos]
        print(refit_dev)
        print(path_dev)
        assert abs(refit_dev - path_dev) < 1e-4

Example #3

0

Show file

def _assert_same_coefficients(table_a, table_b, tol=5e-10):
  """Assert that two coefficient tables agree row by row.

  :param table_a: rows of the first coefficients table (``cell_values``).
  :param table_b: rows of the second coefficients table.
  :param tol: strict upper bound on the allowed absolute difference.

  Columns 1 and 2 of every row are compared (presumably the raw and the
  standardized coefficient -- confirm against the table header).  The row
  counts are checked first because ``zip`` would otherwise silently ignore
  extra rows in the longer table.
  """
  assert len(table_a) == len(table_b), "number of coefficients should be equal"
  for row_a, row_b in zip(table_a, table_b):
    assert (type(row_a[1]) is type(row_b[1])) and (type(row_a[2]) is type(row_b[2])), \
      "coefficients should be the same type"
    for col in (1, 2):
      # Only float cells are compared numerically.
      if isinstance(row_a[col], float):
        assert abs(row_a[col] - row_b[col]) < tol, "coefficients should be equal"


def shuffling_large():
  """Check that GLM coefficients are reproducible and invariant to row shuffling.

  Trains the same binomial lambda-search GLM twice on the Arcene data and once
  on a shuffled copy, then asserts that the coefficient tables of the repeated
  run and of the shuffled run both match the first model.
  """
  print("Reading in Arcene training data for binomial modeling.")
  train_data = h2o.upload_file(path=pyunit_utils.locate("smalldata/arcene/shuffle_test_version/arcene.csv"))
  train_data_shuffled = h2o.upload_file(path=pyunit_utils.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv"))

  print("Create model on original Arcene dataset.")
  h2o_model = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
  h2o_model.train(x=list(range(1000)), y=1000, training_frame=train_data)

  print("Create second model on original Arcene dataset.")
  h2o_model_2 = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
  h2o_model_2.train(x=list(range(1000)), y=1000, training_frame=train_data)

  print("Create model on shuffled Arcene dataset.")
  h2o_model_s = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
  h2o_model_s.train(x=list(range(1000)), y=1000, training_frame=train_data_shuffled)

  print("Assert that number of predictors remaining and their respective coefficients are equal.")

  reference = h2o_model._model_json['output']['coefficients_table'].cell_values
  # Same data, same settings: the second run must reproduce the first.
  _assert_same_coefficients(reference, h2o_model_2._model_json['output']['coefficients_table'].cell_values)
  # Shuffled rows must not change the fitted coefficients.
  _assert_same_coefficients(reference, h2o_model_s._model_json['output']['coefficients_table'].cell_values)

Example #4

0

Show file

File: pyunit_PUBDEV_8683_gamma_dispersion_maxIter.py Project: timgates42/h2o-3

def test_max_iterations_dispersion():
    """Check that more dispersion iterations give a better gamma dispersion estimate.

    Two gamma GLMs are fitted with maximum-likelihood dispersion estimation,
    one limited to a single dispersion iteration and one allowed 1000000.
    The model with more iterations must end up strictly closer to the
    reference dispersion value 9 used by this test.
    """
    frame = h2o.import_file(
        "http://h2o-public-test-data.s3.amazonaws.com/smalldata/glm_test/gamma_dispersion_factor_9_10kRows.csv"
    )
    response = 'resp'
    predictors = ['abs.C1.', 'abs.C2.', 'abs.C3.', 'abs.C4.', 'abs.C5.']

    # Settings shared by both models; only max_iterations_dispersion differs.
    common = dict(family='gamma',
                  lambda_=0,
                  compute_p_values=True,
                  dispersion_factor_method="ml")

    model_short = H2OGeneralizedLinearEstimator(max_iterations_dispersion=1, **common)
    model_short.train(training_frame=frame, x=predictors, y=response)

    model_long = H2OGeneralizedLinearEstimator(max_iterations_dispersion=1000000, **common)
    model_long.train(training_frame=frame, x=predictors, y=response)

    true_dispersion = 9
    short_error = abs(model_short._model_json["output"]["dispersion"] - true_dispersion)
    long_error = abs(model_long._model_json["output"]["dispersion"] - true_dispersion)
    assert short_error > long_error, \
        " Model with more iterations should generate better dispersion parameter estimate but did not."

Example #5

0

Show file

File: pyunit_glm_iteraction_MOJO_fail.py Project: zoudongyang/h2o-3

def interactions_airlines():
    """Check that MOJO/POJO export is rejected for GLMs with interaction pairs.

    A binomial GLM with interaction pairs is trained on the airlines data;
    both ``download_mojo`` and ``download_pojo`` must raise ``H2OValueError``
    with the expected message.  A second model without interaction pairs must
    export both formats without error.
    """
    flights = h2o.import_file(
        pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"))
    pairs = [("CRSDepTime", "UniqueCarrier"),
             ("CRSDepTime", "Origin"),
             ("UniqueCarrier", "Origin")]
    response = 'IsDepDelayed'

    model = H2OGeneralizedLinearEstimator(family="Binomial",
                                          interaction_pairs=pairs)
    model.train(y=response, training_frame=flights)

    mojo_name = pyunit_utils.getMojoName(model._id)
    # NOTE: '__file__' is a quoted literal here, so realpath resolves it
    # against the current working directory.
    base_dir = os.path.dirname(os.path.realpath('__file__'))
    out_dir = os.path.normpath(os.path.join(base_dir, "..", "results", mojo_name))
    os.mkdir(out_dir)

    try:
        model.download_mojo(path=out_dir)
        assert False, "Download MOJO should fail."
    except H2OValueError as err:
        assert "Export to MOJO not supported" in err.args[0]

    try:
        model.download_pojo(path=out_dir)
        assert False, "Download POJO should fail."
    except H2OValueError as err:
        assert 'Export to POJO not supported' in err.args[0]

    # Without interaction pairs both exports are expected to succeed.
    model = H2OGeneralizedLinearEstimator(family="Binomial")
    model.train(y=response, training_frame=flights)
    model.download_mojo(path=out_dir)
    model.download_pojo(path=out_dir)

Example #6

0

Show file

File: pyunit_PUBDEV_8683_gamma_dispersion_epsilon.py Project: timgates42/h2o-3

def test_dispersion_epsilon():
    """Check that a tighter ``dispersion_epsilon`` gives a dispersion estimate at least as good.

    Two gamma GLMs with maximum-likelihood dispersion estimation are fitted,
    one with ``dispersion_epsilon=1e-1`` and one with ``dispersion_epsilon=1e-4``.
    The estimate from the tighter tolerance must be no further from the
    reference dispersion value 9 than the estimate from the looser one.
    """
    training_data = h2o.import_file(
        "http://h2o-public-test-data.s3.amazonaws.com/smalldata/glm_test/gamma_dispersion_factor_9_10kRows.csv"
    )
    Y = 'resp'
    x = ['abs.C1.', 'abs.C2.', 'abs.C3.', 'abs.C4.', 'abs.C5.']

    model_short = H2OGeneralizedLinearEstimator(family='gamma',
                                                lambda_=0,
                                                compute_p_values=True,
                                                dispersion_factor_method="ml",
                                                dispersion_epsilon=1e-1)
    model_short.train(training_frame=training_data, x=x, y=Y)

    model_long = H2OGeneralizedLinearEstimator(family='gamma',
                                               lambda_=0,
                                               compute_p_values=True,
                                               dispersion_factor_method="ml",
                                               dispersion_epsilon=1e-4)
    model_long.train(training_frame=training_data, x=x, y=Y)

    true_dispersion_factor = 9
    dispersion_long = model_long._model_json["output"]["dispersion"]
    dispersion_short = model_short._model_json["output"]["dispersion"]
    assert abs(true_dispersion_factor - dispersion_long) <= abs(dispersion_short - true_dispersion_factor), \
        "H2O dispersion parameter estimate with epsilon 1e-4 {0} is worse than that of dispersion_epsilon 0.1 {1}.  " \
        "True dispersion parameter is {2}".format(dispersion_long, dispersion_short, true_dispersion_factor)

Example #7

0

Show file

File: pyunit_PUBDEV_6424_ordinal_makeGLMModel.py Project: zoudongyang/h2o-3

def testOrdinalLogit():
    """Exercise ``makeGLMModel`` for an ordinal GLM.

    Fits an ordinal model, rebuilds it from the coefficients of the first
    regularization-path entry and checks that both models predict the same
    frame.  It then adds a bogus coefficient and expects ``makeGLMModel`` to
    fail with a coefficient-length error from the server.
    """
    Dtrain = h2o.import_file(
        pyunit_utils.locate(
            "bigdata/laptop/glm_ordinal_logit/ordinal_ordinal_20_training_set.csv"
        ))
    Dtrain["C21"] = Dtrain["C21"].asfactor()

    print("Fit model on dataset")
    model = glm(family="ordinal",
                alpha=[0.5],
                lambda_=[0.001],
                max_iterations=1000,
                beta_epsilon=1e-8,
                objective_epsilon=1e-8)
    model.train(x=list(range(0, 20)), y="C21", training_frame=Dtrain)
    predH2O = model.predict(Dtrain)
    r = glm.getGLMRegularizationPath(model)
    # model generated from setting coefficients to model
    m2 = glm.makeGLMModel(model=model, coefs=r['coefficients'][0])
    f2 = m2.predict(Dtrain)
    pyunit_utils.compare_frames_local(predH2O, f2, prob=1)

    # Add a coefficient the model does not know about.
    coefs = r['coefficients'][0]
    coefs['h2o_dream'] = 3.1415

    try:
        glm.makeGLMModel(model=model, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 189 is different from coefficient provided by user ") in temp, \
            "Wrong exception was received."
        print("coefficient test passed!")
    else:
        # Raised outside the try body so the broad handler above cannot
        # swallow it and misreport it as a wrong exception.
        raise AssertionError("Should have thrown an exception!")

Example #8

0

Show file

File: pyunit_glm_makeGLMModel_PUBDEV_5442_gaussian.py Project: zoudongyang/h2o-3

def test_makeGLMModel():
    """Exercise ``makeGLMModel`` for a gaussian GLM on the prostate data.

    The model rebuilt from the first regularization-path coefficients must
    predict the same frame as the original model, and passing a coefficient
    dictionary with an extra entry must fail with a coefficient-length error
    from the server.
    """
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    m = glm(family='gaussian', Lambda=[0.001], alpha=[0.5])
    m.train(training_frame=d, x=myX, y=myY)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)  # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1, f2, prob=1)

    # Add a coefficient the model does not know about.
    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8

    try:
        glm.makeGLMModel(model=m, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 9 is different from coefficient provided by user ") in temp, \
            "Wrong exception was received."
        print("coefficient test passed!")
    else:
        # Raised outside the try body so the broad handler above cannot
        # swallow it and misreport it as a wrong exception.
        raise AssertionError("Should have throw exception of bad coefficient length")

Example #9

0

Show file

File: pyunit_glm_makeGLMModel_PUBDEV_5442.py Project: zoudongyang/h2o-3

def test_makeGLMModel():
    """Exercise ``makeGLMModel`` for a binomial GLM on the prostate data.

    The model rebuilt from the first regularization-path coefficients must
    produce the same values in prediction column 1 as the original model, and
    passing a coefficient dictionary with an extra entry must fail with a
    coefficient-length error from the server.
    """
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    m = glm(family='binomial',
            Lambda=[0.001],
            alpha=[0.5],
            solver='COORDINATE_DESCENT')
    m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)  # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1[1], f2[1], prob=1)

    # Add a coefficient the model does not know about.
    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8

    try:
        glm.makeGLMModel(model=m, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 8 is different from coefficient provided by user ") in temp,\
            "Wrong exception was received."
        print("makeGLMModel test passed!")
    else:
        # Raised outside the try body so the broad handler above cannot
        # swallow it and misreport it as a wrong exception.
        raise AssertionError("Test failed: should have throw exception of bad coefficient length!")

Example #10

0

Show file

File: pyunit_PUBDEV_7890_glm_checkpoint_IRLSM_gaussian_lambda_search.py Project: zoudongyang/h2o-3

def buildModelCheckpointing(training_frame, x_indices, y_index, family, solver,
                            cold_start):
    """Check that resuming a lambda-search GLM from a checkpoint matches an uninterrupted run.

    :param training_frame: frame that is split 90/10 into training and validation parts.
    :param x_indices: predictor columns passed to ``train`` as ``x``.
    :param y_index: response column passed to ``train`` as ``y``.
    :param family: GLM family passed to every estimator.
    :param solver: GLM solver passed to every estimator.
    :param cold_start: ``cold_start`` flag passed to every estimator.

    A model limited to 3 iterations is trained first, a second model continues
    from it via ``checkpoint``, and a third model is trained without an
    iteration limit.  The coefficients of the resumed and the uninterrupted
    model must agree within 1e-6.
    """
    train_part, valid_part = training_frame.split_frame(ratios=[0.9], seed=12345)

    # Arguments common to all three estimators and all three train calls.
    estimator_args = dict(family=family,
                          solver=solver,
                          lambda_search=True,
                          cold_start=cold_start)
    fit_args = dict(training_frame=train_part,
                    x=x_indices,
                    y=y_index,
                    validation_frame=valid_part)

    truncated = H2OGeneralizedLinearEstimator(max_iterations=3, **estimator_args)
    truncated.train(**fit_args)

    resumed = H2OGeneralizedLinearEstimator(checkpoint=truncated.model_id,
                                            **estimator_args)
    resumed.train(**fit_args)

    uninterrupted = H2OGeneralizedLinearEstimator(**estimator_args)
    uninterrupted.train(**fit_args)

    pyunit_utils.assertEqualCoeffDicts(resumed.coef(),
                                       uninterrupted.coef(),
                                       tol=1e-6)

Example #11

0

Show file

def glm_alpha_lambda_arrays_cv():
    """Check that cross-validated GLM submodels do not depend on a validation frame.

    Two gaussian GLMs with identical alpha/lambda arrays and 3-fold
    cross-validation are trained on the same training split, one with and one
    without a validation frame.  Every submodel on their regularization paths
    must have matching raw and standardized coefficients.
    """
    print("Testing glm cross-validation with alpha array, lambda array for binomial models.")
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    # NOTE(review): this reassigns each column to itself; presumably
    # `.asfactor()` was intended.  Left unchanged because converting the
    # columns would alter the fitted models -- confirm before changing.
    for cname in enum_columns:
        h2o_data[cname] = h2o_data[cname]
    myY = "C21"
    # list.remove() returns None, so build the predictor list explicitly.
    myX = [name for name in h2o_data.names if name != myY]
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # choices made in model_all and model_xval should be the same since they should be using xval metrics
    model_all = glm(family="gaussian", Lambda=[0.1, 0.5, 0.9], alpha=[0.1, 0.5, 0.9], nfolds=3, cold_start=True)
    model_all.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    model_all_rpath = glm.getGLMRegularizationPath(model_all)
    model_xval = glm(family="gaussian", Lambda=[0.1, 0.5, 0.9], alpha=[0.1, 0.5, 0.9], nfolds=3, cold_start=True)
    model_xval.train(x=myX, y=myY, training_frame=training_data)
    model_xval_rpath = glm.getGLMRegularizationPath(model_xval)

    for l in range(0, len(model_all_rpath['lambdas'])):
        print("comparing coefficients for submodel {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(model_all_rpath['coefficients'][l], model_xval_rpath['coefficients'][l], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(model_all_rpath['coefficients_std'][l], model_xval_rpath['coefficients_std'][l], tol=1e-6)

Example #12

0

Show file

def buildModelCheckpointing(training_frame, x_indices, y_index, family,
                            solver):
    """Check that resuming a GLM from a checkpoint approximates an uninterrupted run.

    :param training_frame: frame that is split 90/10 into training and validation parts.
    :param x_indices: predictor columns passed to ``train`` as ``x``.
    :param y_index: response column passed to ``train`` as ``y``.
    :param family: GLM family passed to every estimator.
    :param solver: GLM solver passed to every estimator.

    A model limited to 7 iterations is trained first, a second model continues
    from it via ``checkpoint``, and a third model runs without an iteration
    limit.  The coefficients of the resumed and the uninterrupted model must
    agree within 5e-2.
    """
    train_part, valid_part = training_frame.split_frame(ratios=[0.9], seed=12345)
    fit_args = dict(training_frame=train_part,
                    x=x_indices,
                    y=y_index,
                    validation_frame=valid_part)

    truncated = H2OGeneralizedLinearEstimator(family=family,
                                              max_iterations=7,
                                              solver=solver)
    truncated.train(**fit_args)

    resumed = H2OGeneralizedLinearEstimator(family=family,
                                            checkpoint=truncated.model_id,
                                            solver=solver)
    resumed.train(**fit_args)

    # No iteration limit: allowed to run to completion.
    uninterrupted = H2OGeneralizedLinearEstimator(family=family, solver=solver)
    uninterrupted.train(**fit_args)

    pyunit_utils.assertEqualCoeffDicts(resumed.coef(),
                                       uninterrupted.coef(),
                                       tol=5e-2)

Example #13

0

Show file

File: pyunit_PUBDEV_8075_core_infogram_iris_x_att.py Project: wwjiang007/h2o-3

def test_infogram_iris_x_attributes():
    """
    Check that an infogram model can be passed directly as the predictor list.

    A multinomial infogram is built on the iris data.  One GLM is trained with
    the predictors extracted from the infogram, a second with the infogram
    model itself given as ``x``; their per-class coefficients must agree
    within 1e-6.
    """
    iris = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/irisROriginal.csv"))
    response = "Species"
    iris[response] = iris[response].asfactor()
    predictors = iris.names
    predictors.remove(response)

    # build infogram model with default settings
    infogram = H2OInfogram(seed=12345, distribution='multinomial')
    infogram.train(x=predictors, y=response, training_frame=iris)

    glm_from_names = H2OGeneralizedLinearEstimator(family='multinomial')
    glm_from_names.train(x=infogram._extract_x_from_model(),
                         y=response,
                         training_frame=iris)
    coefs_from_names = glm_from_names.coef()

    glm_from_model = H2OGeneralizedLinearEstimator(family='multinomial')
    glm_from_model.train(x=infogram, y=response, training_frame=iris)
    coefs_from_model = glm_from_model.coef()

    for class_key in coefs_from_names.keys():
        pyunit_utils.assertCoefDictEqual(coefs_from_names[class_key],
                                         coefs_from_model[class_key],
                                         tol=1e-6)

Example #14

0

Show file

def interactions_GLM_Binomial():
    """Check the number of GLM coefficients produced with categorical interactions.

    A small frame with one label column and two categorical columns is built
    from pandas.  Two Gaussian GLMs with an interaction between the two
    categorical columns are trained, one with ``Lambda=0`` and one with
    ``Lambda=0.001``, and the length of each coefficient dictionary is
    compared with the expected count.
    """
    labels = [
        0.1, 0.2, 0.3, 0.15, 0.25, 0.35, 0.12, 0.22, 0.32, 0.2, 0.3, 0.15,
        0.05
    ]
    feat1 = ["a", "a", "a", "b", "b", "b", "c", "c", "c", "a", "a", "a", "b"]
    feat2 = [
        "Red", "Blue", "Green", "Red", "Blue", "Green",
        "Red", "Blue", "Green", "Blue", "Green", "Red", "Blue"
    ]
    # Stack the three lists as rows and transpose so each becomes a column.
    pandas_frame = pd.DataFrame(
        np.array([labels, feat1, feat2]).T,
        columns=['label', 'categorical_feat', 'categorical_feat2'])
    frame = h2o.H2OFrame(pandas_frame, na_strings=["UNKNOWN"])

    interacting = ["categorical_feat", "categorical_feat2"]

    model0 = H2OGeneralizedLinearEstimator(family="Gaussian",
                                           Lambda=0,
                                           interactions=interacting)
    model0.train(x=["categorical_feat", "categorical_feat2"],
                 y='label',
                 training_frame=frame)

    model1 = H2OGeneralizedLinearEstimator(family="Gaussian",
                                           Lambda=0.001,
                                           interactions=interacting)
    model1.train(x=["categorical_feat", "categorical_feat2"],
                 y='label',
                 training_frame=frame)

    # interaction 4 levels, 2 enums 2 levels each plus intercept due to use_all_factor_level=F
    expected_len0 = 4 + 2 + 2 + 1
    # interaction 9 levels, 2 enums 3 levels each plus intercept
    expected_len1 = 9 + 3 + 3 + 1

    actual_len0 = len(model0.coef())
    assert actual_len0 == expected_len0, "Lambda=0, Expected coefficient length: {0}, Actual: " \
                                         "{1}".format(expected_len0, len(model0.coef()))
    actual_len1 = len(model1.coef())
    assert actual_len1 == expected_len1, "Lambda=0.001, Expected coefficient length: {0}, Actual: " \
                                         "{1}".format(expected_len1, len(model1.coef()))

Example #15

0

Show file

File: pyunit_PUBDEV_7835_verify_cold_start_lambda_search.py Project: zoudongyang/h2o-3

def grab_lambda_min():
    """Check that cold-start lambda-search submodels match models trained per lambda.

    A lambda-search GLM with ``cold_start=True`` is trained on the Boston
    housing data.  For every entry on its regularization path a separate
    model is trained with that entry's alpha and lambda, and its raw and
    standardized coefficients must match the path entry within 1e-6.
    """
    housing = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/BostonHousing.csv"))

    # every column except the last is a predictor
    feature_names = housing.columns[:-1]
    # the response is "medv", the median value of owner-occupied homes in $1000's
    target = "medv"

    # chas is the Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
    housing['chas'] = housing['chas'].asfactor()

    train_part, valid_part = housing.split_frame(ratios=[.8], seed=1234)
    search_model = H2OGeneralizedLinearEstimator(lambda_search=True, seed=1234, cold_start=True)
    search_model.train(x=feature_names, y=target, training_frame=train_part,
                       validation_frame=valid_part)
    path = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(search_model)

    for idx, lambda_value in enumerate(path['lambdas']):
        single = H2OGeneralizedLinearEstimator(alpha=[path['alphas'][idx]],
                                               Lambda=lambda_value,
                                               solver='COORDINATE_DESCENT')
        single.train(x=feature_names, y=target, training_frame=train_part,
                     validation_frame=valid_part)
        expected = path['coefficients'][idx]
        expected_std = path['coefficients_std'][idx]
        print("comparing coefficients for submodel {0}".format(idx))
        pyunit_utils.assertEqualCoeffDicts(expected, single.coef(), tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(expected_std, single.coef_norm(), tol=1e-6)

Example #16

0

Show file

File: pyunit_covtype_get_future_model.py Project: sudarshan4455/h2o-3

def test_get_future_model():
  """Start three binomial GLMs on covtype without blocking, then wait for each.

  The response (column 54) is turned into a factor indicating one randomly
  chosen class.  Three estimators with different alpha/Lambda settings are
  launched with ``start`` and afterwards joined and printed in launch order.
  """
  frame = h2o.upload_file(pyunit_utils.locate("smalldata/covtype/covtype.altered.gz"))

  response = 54
  # Cols 21 and 29 are constant, so must be explicitly ignored
  predictors = list(set(range(54)) - {20, 28})

  # Set response to be indicator of a particular class
  chosen_class = random.sample(range(1, 5), 1)[0]
  frame[response] = frame[response] == chosen_class
  frame[response] = frame[response].asfactor()

  # (alpha, Lambda) pairs -- L2: (0, 0); Elastic: (0.5, 1e-4); L1: (1, 1e-4)
  settings = [(0, 0), (0.5, 1e-4), (1, 1e-4)]

  started = []
  for alpha_value, lambda_value in settings:
    estimator = H2OGeneralizedLinearEstimator(family="binomial", alpha=alpha_value, Lambda=lambda_value)
    estimator.start(x=predictors, y=response, training_frame=frame)
    started.append(estimator)

  # Wait for each job only after all of them have been launched.
  for estimator in started:
    estimator.join()
    print(estimator)

Example #17

0

Show file

File: pyunit_PUBDEV_6424_multinomial_makeGLMModel.py Project: vishalbelsare/h2o-3

def test_glm_multinomial_makeGLMModel():
    """Exercise ``makeGLMModel`` for a multinomial GLM on the covtype data.

    Checks that the rank computed from the first regularization-path
    coefficients equals the rank reported by the model, that a model rebuilt
    from those coefficients predicts the same frame, and that a coefficient
    dictionary with an extra entry is rejected with a coefficient-length
    error from the server.
    """
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    mL = glm(family='multinomial', alpha=[0.1], Lambda=[0.9])
    d[54] = d[54].asfactor()
    mL.train(training_frame=d, x=list(range(0, 54)), y=54)
    r = glm.getGLMRegularizationPath(mL)
    rank = check_nonzero_coefs(r['coefficients'][0])
    assert rank == mL._model_json["output"]["rank"], "expected rank: {0}, actual rank: {1}." \
                                                     "".format(rank, mL._model_json["output"]["rank"])
    # model generated from setting coefficients to model
    m2 = glm.makeGLMModel(model=mL, coefs=r['coefficients'][0])
    f1 = mL.predict(d)
    f2 = m2.predict(d)
    pyunit_utils.compare_frames_local(f1, f2, prob=1)

    # add extra coefficients to model coefficient
    coefs = r['coefficients'][0]
    coefs["wendy_dreams"] = 0.123

    try:
        glm.makeGLMModel(model=mL, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
           ("model coefficient length 371 is different from coefficient provided by user") in temp, \
            "Wrong exception was received."
        print("glm Multinomial makeGLMModel test completed!")
    else:
        # Raised outside the try body so the broad handler above cannot
        # swallow it and misreport it as a wrong exception.
        raise AssertionError("Should have thrown an exception!")

Example #18

0

Show file

def test_relevel():
    """Check that releveling a factor changes which level GLM drops from its coefficients.

    With the data as imported, the binomial GLM has a ``DPROS.None``
    coefficient and no ``DPROS.Both`` coefficient.  After releveling DPROS
    with "None", the situation must be reversed.  The releveled model's
    coefficients are finally compared with reference values taken from R.
    """
    # First, compare against itself
    print("Importing prostate_cat.csv data...\n")
    d = h2o.import_file(path = pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"), na_strings=["NA","NA","NA","NA","NA","NA","NA","NA"])

    mh2o1 = H2OGeneralizedLinearEstimator(family = "binomial", Lambda=0, missing_values_handling = "Skip")
    mh2o1.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    ns = mh2o1.coef().keys()
    print(ns)
    assert "DPROS.None" in ns, "None level IS NOT expected to be skipped by default"
    assert "DPROS.Both" not in ns, "Both level IS expected to be skipped by default"
    x = d["DPROS"].relevel("None")
    print(x)
    d["DPROS"] = x[0]

    mh2o2 = H2OGeneralizedLinearEstimator(family = "binomial", Lambda=0, missing_values_handling = "Skip")
    mh2o2.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    ns2 = mh2o2.coef().keys()
    print(ns2)
    # After relevel the expectations flip: None is dropped, Both is kept.
    assert "DPROS.None" not in ns2, "None level IS expected to be skipped after relevel"
    assert "DPROS.Both" in ns2, "Both level IS NOT expected to be skipped after relevel"

    # Second, compare against R input (taken from runit_relevel.R)
    # NOTE(review): dr is prepared here but the comparison below uses mh2o2,
    # which was trained on d -- confirm whether a model on dr was intended.
    dr = h2o.import_file(path = pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    dr["DPROS"] = d["DPROS"].relevel("None")
    # Results are from R but manually reordered and renamed to match h2o naming and order
    exp_coefs = {"Intercept": -7.63245 , "DPROS.Both": 1.39185, "DPROS.Left": 0.73482, "DPROS.Right": 1.51437, "RACE.White": 0.65160, "DCAPS.Yes": 0.49233,
                 "AGE":-0.01189 , "PSA": 0.02990, "VOL": -0.01141, "GLEASON": 0.96466927}
    # A coefficient missing from the model counts as 0.
    coeff_diff = {key: abs(exp_coefs[key] - mh2o2.coef().get(key, 0)) for key in exp_coefs.keys()}
    assert max(coeff_diff.values()) < 1e-4, \
        "coefficients differ from the R reference values: {0}".format(coeff_diff)

Example #19

0

Show file

File: pyunit_PUBDEV_8075_safe_infogram_personal_loan_x_att.py Project: wwjiang007/h2o-3

def test_infogram_personal_loan():
    """
    Check that an infogram model with protected columns can be used as the predictor list.

    An infogram with protected columns "Age" and "ZIP Code" is trained on the
    personal-loan data.  One GLM is trained with the predictors extracted from
    the infogram, another with the infogram model itself given as ``x``; their
    coefficients must agree within 1e-6.
    """
    loans = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/admissibleml_test/Bank_Personal_Loan_Modelling.csv"))
    response = "Personal Loan"
    loans[response] = loans[response].asfactor()
    predictors = [
        "Experience", "Income", "Family", "CCAvg", "Education", "Mortgage",
        "Securities Account", "CD Account", "Online", "CreditCard"
    ]

    infogram = H2OInfogram(seed=12345,
                           protected_columns=["Age", "ZIP Code"])
    infogram.train(x=predictors, y=response, training_frame=loans)

    glm_from_names = H2OGeneralizedLinearEstimator()
    glm_from_names.train(x=infogram._extract_x_from_model(),
                         y=response,
                         training_frame=loans)
    coefs_from_names = glm_from_names.coef()

    glm_from_model = H2OGeneralizedLinearEstimator()
    glm_from_model.train(x=infogram, y=response, training_frame=loans)
    coefs_from_model = glm_from_model.coef()

    pyunit_utils.assertCoefDictEqual(coefs_from_names, coefs_from_model, tol=1e-6)

Example #20

0

Show file

File: pyunit_covtype_glm.py Project: sudarshan4455/h2o-3

def covtype():
  """Train and display three binomial GLMs on covtype with different penalties.

  Column 54 is replaced by an indicator for one randomly chosen class.  Models
  are then trained and shown one after another for three alpha/Lambda pairs.
  """
  frame = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))

  response = 54
  # columns 20 and 28 are left out of the predictors
  predictors = [col for col in range(0, 54) if col not in [20, 28]]

  # Set response to be indicator of a particular class
  chosen_class = random.randint(1, 4)
  frame[54] = (frame[54] == chosen_class)

  # (alpha, Lambda) pairs -- L2: (0, 0); Elastic: (0.5, 1e-4); L1: (1, 1e-4)
  settings = [(0, 0), (0.5, 1e-4), (1, 1e-4)]

  for alpha_value, lambda_value in settings:
    model = H2OGeneralizedLinearEstimator(family="binomial", alpha=alpha_value, Lambda=lambda_value)
    model.train(x=predictors, y=response, training_frame=frame)
    model.show()

Example #21

0

Show file

File: pyunit_PUBDEV_6424_negbinomial_makeGLMModel.py Project: zoudongyang/h2o-3

def test_negBinomial_makeGLMModel():
    """Exercise ``makeGLMModel`` for negative-binomial GLMs over several theta values.

    For each theta a negative-binomial model with log link is trained on the
    prostate data, rebuilt from the coefficients of its first
    regularization-path entry, and the predictions of both models are
    compared.
    """
    print("Read in prostate data.")
    prostate = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/prostate/prostate_complete.csv.zip"))
    print("Testing for family: Negative Binomial")
    print("Set variables for h2o.")
    response = "GLEASON"
    predictors = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    for theta_value in [0.000000001, 0.01, 0.1, 0.5, 1]:
        trained = H2OGeneralizedLinearEstimator(family="negativebinomial",
                                                link="log",
                                                alpha=0.5,
                                                Lambda=0.0001,
                                                theta=theta_value)
        trained.train(x=predictors, y=response, training_frame=prostate)
        trained_pred = trained.predict(prostate)

        path = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(trained)
        # model generated from setting coefficients to model
        rebuilt = H2OGeneralizedLinearEstimator.makeGLMModel(
            model=trained, coefs=path['coefficients'][0])
        rebuilt_pred = rebuilt.predict(prostate)
        pyunit_utils.compare_frames_local(trained_pred, rebuilt_pred, prob=1)

Example #22

0

Show file

def glm_alpha_array_lambda_null():
    """Check a binomial GLM trained with an alpha array against per-submodel fits.

    A model is trained with several alpha values, then for every entry of its
    regularization path a single-alpha/single-lambda model is trained and its
    coefficients, explained deviance and regularization path are compared
    with the corresponding entry of the multi-alpha model.
    """
    predictor_idx = (2, 3, 4, 5, 6, 7, 8)
    path_keys = ["alphas", "lambdas", "explained_deviance_valid", "explained_deviance_train"]

    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial', alpha=[0.1, 0.5, 0.9], solver='COORDINATE_DESCENT')
    mL.train(training_frame=d, x=list(predictor_idx), y=1)
    full_path = glm.getGLMRegularizationPath(mL)
    best_idx = mL._model_json["output"]["best_submodel_index"]

    # A model rebuilt from the best submodel's coefficients must explain the
    # same share of deviance that the regularization path reports.
    rebuilt = glm.makeGLMModel(model=mL, coefs=full_path['coefficients'][best_idx])
    path_dev = full_path['explained_deviance_train'][best_idx]
    best_perf = rebuilt.model_performance(d)
    rebuilt_dev = 1 - best_perf.residual_deviance() / best_perf.null_deviance()
    print(path_dev, " =?= ", rebuilt_dev)
    assert abs(path_dev - rebuilt_dev) < 1e-6

    for idx in range(len(full_path['lambdas'])):
        single = glm(family='binomial',
                     alpha=[full_path['alphas'][idx]],
                     Lambda=[full_path['lambdas'][idx]],
                     solver='COORDINATE_DESCENT')
        single.train(training_frame=d, x=list(predictor_idx), y=1)
        single_path = glm.getGLMRegularizationPath(single)

        pyunit_utils.assertEqualCoeffDicts(full_path['coefficients'][idx], single.coef())
        pyunit_utils.assertEqualCoeffDicts(full_path['coefficients_std'][idx], single.coef_norm())

        perf = single.model_performance(d)
        model_dev = 1 - perf.residual_deviance() / perf.null_deviance()
        expected_dev = full_path['explained_deviance_train'][idx]
        assert abs(model_dev - expected_dev) < 1e-4
        pyunit_utils.assertEqualRegPaths(path_keys, full_path, idx, single_path)

        if idx != best_idx:
            # Any other submodel must not beat the best submodel's residual deviance.
            assert perf.residual_deviance() >= best_perf.residual_deviance(), \
                "Best submodel does not have lowerest residual_deviance()!"
        else:
            # The best submodel must reproduce the training metrics of the full model.
            pyunit_utils.assertEqualModelMetrics(single._model_json["output"]["training_metrics"],
                                                 mL._model_json["output"]["training_metrics"])

Example #23

0

Show file

def offset_init_train_glm():
    """Check that ``offset_column`` works the same in the constructor and in ``train``.

    Three binomial GLMs are trained on the cars data: one receives the offset
    column through ``train``, one through the estimator constructor, and one
    through both (with a different name in the constructor). Their prediction
    frames are then compared.
    """
    response = "economy_20mpg"
    predictor_idx = range(2, 8)

    # Load the data, keep rows with a response, and append a constant offset column.
    cars = h2o.upload_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    cars = cars[cars[response].isna() == 0]
    cars[response] = cars[response].asfactor()
    offset = h2o.H2OFrame([[.5]] * 398)
    offset.set_names(["x1"])
    cars = cars.cbind(offset)

    # Variant 1: offset column given only to train().
    glm_train = H2OGeneralizedLinearEstimator(family="binomial")
    glm_train.train(x=list(predictor_idx),
                    y=response,
                    training_frame=cars,
                    offset_column="x1")
    predictions_train = glm_train.predict(cars)

    # Variant 2: offset column given only to the estimator constructor.
    glm_init = H2OGeneralizedLinearEstimator(offset_column="x1",
                                             family="binomial")
    glm_init.train(x=list(predictor_idx), y=response, training_frame=cars)
    predictions_init = glm_init.predict(cars)

    # Variant 3: both are set; the test expects the train() value to be used.
    glm_init_train = H2OGeneralizedLinearEstimator(offset_column="x1-test",
                                                   family="binomial")
    glm_init_train.train(x=list(predictor_idx),
                         y=response,
                         training_frame=cars,
                         offset_column="x1")
    predictions_init_train = glm_init_train.predict(cars)

    # NOTE(review): these asserts rely on the truthiness of `==` between two
    # prediction frames -- confirm that this checks every row as intended.
    assert predictions_train == predictions_init, "Expected predictions of a model with offset_column in train method has to be same as predictions of a model with offset_column in constructor."
    assert predictions_train == predictions_init_train, "Expected predictions of a model with offset_column in train method has to be same as predictions of a model with offset_column in both constructor and init."

Example #24

0

Show file

File: pyunit_benign_glm_grid.py Project: sudarshan4455/h2o-3

def benign_grid():
    """Run a GLM grid search on the benign data set and inspect the grid.

    Trains a binomial GLM grid over ``alpha`` and ``lambda``, prints the grid
    sorted by F1, predicts with the best model and with the grid, then fetches
    the same grid again by id and checks the best model's family.
    """
    training_data = h2o.import_file(
        pyunit_utils.locate("smalldata/logreg/benign.csv"))

    Y = 3
    # All column indices 0..10 except the response (3). The ranges are turned
    # into lists first because Python 3 range objects cannot be added with `+`.
    X = list(range(3)) + list(range(4, 11))

    # NOTE(review): 'a' is not a numeric alpha -- presumably a deliberate
    # invalid entry to exercise the grid's handling of failures; confirm.
    hyper_parameters = {'alpha': [0.01, 0.5, 'a'], 'lambda': [1e-5, 1e-6]}
    gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
                       hyper_parameters)
    gs.train(x=X, y=Y, training_frame=training_data)
    gs.show()
    print(gs.sort_by('F1', False))
    best_model_id = gs.sort_by('F1', False)['Model Id'][0]
    best_model = h2o.get_model(best_model_id)
    best_model.predict(training_data)
    gs.predict(training_data)
    print(gs.get_hyperparams(best_model_id))
    print(gs.grid_id)

    # Retrieve the same grid again through its id.
    new_g = H2OGridSearch.get_grid(
        H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters,
        gs.grid_id)
    new_g.show()
    print(new_g.grid_id)
    print(new_g.sort_by('F1', False))

    assert best_model.params['family']['actual'] == 'binomial'

Example #25

0

Show file

File: pyunit_PUBDEV_7481_lambda_search_alpha_array_multinomial_cv.py Project: bhorkar/h2o-3

def glm_alpha_array_with_lambda_search_cv():
    """Compare multinomial CV GLMs trained with and without a validation frame.

    Two cross-validated multinomial models (alpha array, lambda search) are
    trained on the same training split, one of them with an additional
    validation frame. The coefficients of every submodel on their
    regularization paths are then required to match.
    """
    # read in the dataset and construct training set (and validation set)
    print("Testing glm cross-validation with alpha array, lambda_search for multiomial models.")
    h2o_data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5"]
    for cname in enum_columns:
        # NOTE(review): this assigns each column to itself; possibly
        # `.asfactor()` was intended -- left unchanged, confirm with the author.
        h2o_data[cname] = h2o_data[cname]
    myY = "C11"
    h2o_data["C11"] = h2o_data["C11"].asfactor()
    # list.remove() returns None, so the former `names.remove(myY)` always
    # passed x=None to train(); build the predictor list explicitly instead.
    myX = [name for name in h2o_data.names if name != myY]
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]
    # build model with CV but no validation dataset
    cv_model = glm(family='multinomial', alpha=[0.1, 0.5, 0.9], lambda_search=True, nfolds=3)
    cv_model.train(training_frame=training_data, x=myX, y=myY)
    cv_r = glm.getGLMRegularizationPath(cv_model)
    # build model with CV and with validation dataset
    cv_model_valid = glm(family='multinomial', alpha=[0.1, 0.5, 0.9], lambda_search=True, nfolds=3)
    cv_model_valid.train(training_frame=training_data, validation_frame=test_data, x=myX, y=myY)
    cv_r_valid = glm.getGLMRegularizationPath(cv_model_valid)

    for l in range(len(cv_r['lambdas'])):
        print("comparing coefficients for submodel {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients'][l], cv_r_valid['coefficients'][l], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients_std'][l], cv_r_valid['coefficients_std'][l], tol=1e-6)

Example #26

0

Show file

File: pyunit_NOPASS_weightsGLM.py Project: kordikp/AutoMLprediction

    def check_same(data1, data2):
        """Assert that weighted and unweighted GLMs give the same fit statistics.

        :param data1: frame used for the models trained without a weights
            column (predictor indices up to 19 are used).
        :param data2: frame used for the models trained with
            ``weights_column="weights"`` (predictor indices up to 20 are used).
        :raises AssertionError: if the MSE of the regression pair, or the null
            or residual deviance of the binomial pair, differs by 1e-6 or more.
        """
        # Regression pair: response is column 1.
        glm1_regression = H2OGeneralizedLinearEstimator()
        glm1_regression.train(x=list(range(2, 20)), y=1, training_frame=data1)

        glm2_regression = H2OGeneralizedLinearEstimator(
            weights_column="weights")
        glm2_regression.train(x=list(range(2, 21)), y=1, training_frame=data2)

        # Binomial pair: response is column 0. Both models must use the same
        # family; the unweighted one previously omitted family="binomial",
        # although the weighted model it is compared with specifies it.
        glm1_binomial = H2OGeneralizedLinearEstimator(family="binomial")
        glm1_binomial.train(x=list(range(1, 20)), y=0, training_frame=data1)

        glm2_binomial = H2OGeneralizedLinearEstimator(weights_column="weights",
                                                      family="binomial")
        glm2_binomial.train(x=list(range(1, 21)), y=0, training_frame=data2)

        assert abs(glm1_regression.mse() - glm2_regression.mse()) < 1e-6, "Expected mse's to be the same, but got {0}, " \
                                                                          "and {1}".format(glm1_regression.mse(),
                                                                                           glm2_regression.mse())
        assert abs(glm1_binomial.null_deviance() - glm2_binomial.null_deviance()) < 1e-6, \
            "Expected null deviances to be the same, but got {0}, and {1}".format(glm1_binomial.null_deviance(),
                                                                                  glm2_binomial.null_deviance())
        assert abs(glm1_binomial.residual_deviance() - glm2_binomial.residual_deviance()) < 1e-6, \
            "Expected residual deviances to be the same, but got {0}, and {1}".format(glm1_binomial.residual_deviance(),
                                                                                      glm2_binomial.residual_deviance())

Example #27

0

Show file

File: pyunit_pubdev_8683_gamma_dispersion_factor.py Project: timgates42/h2o-3

def test_gamma_dispersion_factor():
    """Check the ML dispersion estimate of a gamma GLM against two references.

    Two gamma GLMs are trained, one with ``dispersion_factor_method="ml"`` and
    one with ``"pearson"``. The ML estimate must be at least as close to the
    true dispersion value used in this test (9) as both the hard-coded R
    estimate (9.3) and the pearson estimate.
    """
    training_data = h2o.import_file(
        "http://h2o-public-test-data.s3.amazonaws.com/smalldata/glm_test/gamma_dispersion_factor_9_10kRows.csv"
    )
    response = 'resp'
    predictors = ['abs.C1.', 'abs.C2.', 'abs.C3.', 'abs.C4.', 'abs.C5.']

    # Train one model per dispersion estimation method, "ml" first.
    fitted = {}
    for method in ("ml", "pearson"):
        estimator = H2OGeneralizedLinearEstimator(
            family='gamma',
            lambda_=0,
            compute_p_values=True,
            dispersion_factor_method=method)
        estimator.train(training_frame=training_data, x=predictors, y=response)
        fitted[method] = estimator

    true_value = 9
    r_value = 9.3
    ml_value = fitted["ml"]._model_json["output"]["dispersion"]
    pearson_value = fitted["pearson"]._model_json["output"]["dispersion"]
    print(
        "True dispersion parameter {0}.  Estiamted ml dispersion parameter {1}.  Estimated pearson dispersion "
        "parameter {2}.".format(true_value, ml_value, pearson_value))

    ml_error = abs(true_value - ml_value)
    r_error = abs(r_value - true_value)
    pearson_error = abs(pearson_value - true_value)
    assert ml_error <= r_error, \
        "H2O dispersion parameter estimate {0} is worse than that of R {1}.  True dispersion parameter is " \
        "{2}".format(ml_value, r_value, true_value)
    assert ml_error <= pearson_error, \
        "H2O dispersion parameter ml estimate {0} is worse than that of H2O dispersion parameter pearson estimate {1}." \
        "  True dispersion parameter is {2}".format(ml_value, pearson_value, true_value)

Example #28

0

Show file

def glm_alpha_arrays_null_lambda_cv():
    """Compare binomial CV GLMs trained with and without a validation frame.

    Two cross-validated binomial models (alpha array, no explicit lambda) are
    trained on the same training split, one of them with an additional
    validation frame. The coefficients of every submodel on their
    regularization paths are then required to match.
    """
    print("Testing glm cross-validation with alpha array, default lambda values for binomial models.")
    h2o_data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for cname in enum_columns:
        # NOTE(review): this assigns each column to itself; possibly
        # `.asfactor()` was intended -- left unchanged, confirm with the author.
        h2o_data[cname] = h2o_data[cname]
    myY = "C21"
    h2o_data["C21"] = h2o_data["C21"].asfactor()
    # list.remove() returns None, so the former `names.remove(myY)` always
    # passed x=None to train(); build the predictor list explicitly instead.
    myX = [name for name in h2o_data.names if name != myY]
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # build model with CV but no validation dataset
    cv_model = glm(family='binomial', alpha=[0.1, 0.5, 0.9], nfolds=3, fold_assignment="modulo")
    cv_model.train(training_frame=training_data, x=myX, y=myY)
    cv_r = glm.getGLMRegularizationPath(cv_model)
    # build model with CV and with validation dataset
    cv_model_valid = glm(family='binomial', alpha=[0.1, 0.5, 0.9], nfolds=3, fold_assignment="modulo")
    cv_model_valid.train(training_frame=training_data, validation_frame=test_data, x=myX, y=myY)
    cv_r_valid = glm.getGLMRegularizationPath(cv_model_valid)

    for l in range(len(cv_r['lambdas'])):
        print("comparing coefficients for submodel {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients'][l], cv_r_valid['coefficients'][l], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients_std'][l], cv_r_valid['coefficients_std'][l], tol=1e-6)

Example #29

0

Show file

def link_incompatible_error():
    """Check that training fails for the family/link combinations listed below.

    Each combination is expected to raise ``EnvironmentError`` while the model
    is being created or trained; if no error is raised the test fails with
    "expected an error".
    """
    print("Reading in original prostate data.")
    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv.zip"))

    print(
        "Throw error when trying to create model with incompatible logit link."
    )

    # (family, link, predictor column indices, response column index)
    rejected_combinations = (
        ("gaussian", "logit", range(1, 8), 8),
        ("tweedie", "log", range(1, 8), 8),
        ("binomial", "inverse", range(2, 9), 1),
    )
    for family, link, predictor_idx, response_idx in rejected_combinations:
        try:
            model = H2OGeneralizedLinearEstimator(family=family, link=link)
            model.train(x=list(predictor_idx), y=response_idx, training_frame=prostate)
        except EnvironmentError:
            # The expected outcome: move on to the next combination.
            continue
        assert False, "expected an error"

Example #30

0

Show file

File: pyunit_PUBDEV_7481_lambda_alpha_array_multinomial_coldStart.py Project: zoudongyang/h2o-3

def glm_alpha_array_lambda_null():
    """Check a cold-start multinomial GLM against per-submodel fits.

    A multinomial model is trained with alpha and lambda arrays and
    ``cold_start=True``. For every entry of its regularization path a
    single-alpha/single-lambda model is trained from an explicit start value
    and compared with the corresponding submodel: coefficients, explained
    deviance, regularization path, and either the training metrics (best
    submodel) or the logloss ordering (all other submodels).
    """
    # first test: compare coefficients and deviance
    metric_keys = ["MSE", "null_deviance", "logloss", "RMSE", "r2"]
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    mL = glm(family='multinomial',
             alpha=[0.1, 0.5, 0.9],
             Lambda=[0.1, 0.5, 0.9],
             cold_start=True)
    d[54] = d[54].asfactor()
    mL.train(training_frame=d, x=list(range(0, 54)), y=54)
    full_path = glm.getGLMRegularizationPath(mL)
    path_keys = [
        "alphas", "lambdas", "explained_deviance_valid",
        "explained_deviance_train"
    ]
    best_idx = mL._model_json["output"]["best_submodel_index"]
    coef_class_names = ['coefs_class_{0}'.format(k) for k in range(8)]
    std_coef_class_names = ['std_coefs_class_{0}'.format(k) for k in range(8)]

    # Start values: all zeros except, at positions 52, 105, ..., 370, the log
    # of each class's share of the rows.
    class_groups = d.group_by("C55")
    class_groups.count()
    class_counts = class_groups.get_frame()
    class_share = class_counts[1] / d.nrow
    share_positions = list(range(52, 371, 53))
    start_values = [0] * 371
    for class_idx in range(class_share.nrow):
        start_values[share_positions[class_idx]] = math.log(class_share[class_idx, 0])

    for idx in range(len(full_path['lambdas'])):
        single = glm(family='multinomial',
                     alpha=[full_path['alphas'][idx]],
                     Lambda=[full_path['lambdas'][idx]],
                     startval=start_values)
        single.train(training_frame=d, x=list(range(0, 54)), y=54)
        single_path = glm.getGLMRegularizationPath(single)

        pyunit_utils.assertCoefEqual(full_path['coefficients'][idx],
                                     single.coef(), coef_class_names)
        pyunit_utils.assertCoefEqual(full_path['coefficients_std'][idx],
                                     single.coef_norm(), std_coef_class_names)

        model_dev = 1 - single.residual_deviance() / single.null_deviance()
        expected_dev = full_path['explained_deviance_train'][idx]
        assert abs(model_dev - expected_dev) < 1e-4
        pyunit_utils.assertEqualRegPaths(path_keys, full_path, idx, single_path)

        if idx != best_idx:
            # Any other submodel must not have a lower logloss than the full model.
            assert single.logloss() >= mL.logloss(), \
                "Best submodel does not have lowerest logloss()!"
        else:
            # The best submodel must reproduce the full model's training metrics.
            pyunit_utils.assertEqualModelMetrics(
                single._model_json["output"]["training_metrics"],
                mL._model_json["output"]["training_metrics"],
                tol=1e-2,
                keySet=metric_keys)

Example #31

0

Show file

File: Modeling.py Project: msw1535540/db-h2o-spark

def logistic_regression(xval=None, sample_size=None, nfolds=None, hparams=None, for_stacking=None):
    """
    create a logistic regression algorithm estimator

    Note:
     1. standardize: True(default)
     2. missing_values_handling: mean_imputation(default)

    :param xval: if for cross-validation
    :param sample_size: training set sample amount; ``None`` is treated like
        the smallest size tier
    :param nfolds: k value for k-fold cross-validation; when ``None`` a default
        depending on ``sample_size`` is used
    :param hparams: hyper parameters for grid search; when ``None`` a default
        alpha/lambda grid depending on ``sample_size`` is used
    :param for_stacking: if it is used for stacking
    :return: a constructed logistic regression estimator, a parameters' dict for grid search
    """
    # `None <= 10000` raises TypeError on Python 3 (it was True on Python 2),
    # so map the default None onto the smallest tier explicitly.
    size = 0 if sample_size is None else sample_size

    # Pick the default fold count and grid by sample-size tier.
    if size <= 5000:
        default_nfolds = 3
        alpha_opts = [0, 0.25, 0.5, 0.75, 1]
        lambda_opts = [1, 0.5, 0.1, 0.01, 0]
    elif size <= 10000:
        default_nfolds = 5
        alpha_opts = [0, 0.25, 0.5, 0.75, 1]
        lambda_opts = [1, 0.5, 0.1, 0.01, 0]
    elif size <= 100000:
        default_nfolds = 3
        alpha_opts = [0, 0.5, 1]
        lambda_opts = [1, 0.5, 0.1, 0.01, 0]
    else:
        default_nfolds = 2
        alpha_opts = [0, 0.5, 1]
        lambda_opts = [1, 0.5, 0.1, 0]

    if nfolds is None:
        nfolds = default_nfolds
    if hparams is None:
        hparams = {'alpha': alpha_opts, 'lambda': lambda_opts}

    # Settings shared by every variant of the estimator.
    estimator_args = {
        'family': "binomial",
        'remove_collinear_columns': True,
        'max_iterations': 50,
    }
    if xval:
        estimator_args['nfolds'] = nfolds
        if for_stacking:
            # Extra settings used only for the stacking variant: fixed fold
            # assignment and seed, and the CV predictions are kept.
            estimator_args['fold_assignment'] = "Modulo"
            estimator_args['seed'] = 1
            estimator_args['keep_cross_validation_predictions'] = True

    lr_estimator = H2OGeneralizedLinearEstimator(**estimator_args)
    return lr_estimator, hparams