def test_gamma_dispersion_factor():
    """Check H2O's ML dispersion estimate for a weighted gamma GLM against R's.

    Trains a gamma GLM (``lambda_=0``, ``dispersion_factor_method="ml"``) on
    the ``gamma_dispersion_factor_9_10kRows`` dataset, weighting rows by a
    randomly generated, absolute-valued column.  The test passes when H2O's
    dispersion estimate is at least as close to the true value (9) as the
    reference estimate attributed to R (9.3).

    Raises:
        AssertionError: if H2O's estimate is further from the true value
            than the R estimate.
    """
    frame = h2o.import_file(
        "http://h2o-public-test-data.s3.amazonaws.com/smalldata/glm_test/gamma_dispersion_factor_9_10kRows.csv"
    )

    # One random real column, made non-negative via abs(), serves as the
    # observation weights; it is referenced below as "abs(C1)".
    weights = pyunit_utils.random_dataset_real_only(
        frame.nrow, 1, realR=2, misFrac=0, randSeed=12345).abs()
    frame = frame.cbind(weights)

    response = 'resp'
    predictors = ['abs.C1.', 'abs.C2.', 'abs.C3.', 'abs.C4.', 'abs.C5.']

    glm = H2OGeneralizedLinearEstimator(
        family='gamma',
        lambda_=0,
        compute_p_values=True,
        dispersion_factor_method="ml",
        weights_column="abs(C1)",
    )
    glm.train(training_frame=frame, x=predictors, y=response)

    expected = 9
    r_estimate = 9.3
    h2o_estimate = glm._model_json["output"]["dispersion"]
    print(
        "True dispersion parameter {0}.  Estimated ml dispersion parameter {1}"
        ".".format(expected, h2o_estimate))

    # H2O must be no further from the truth than the R reference value.
    h2o_error = abs(expected - h2o_estimate)
    r_error = abs(r_estimate - expected)
    assert h2o_error <= r_error, \
        "H2O dispersion parameter ml estimate {0} is worse than that of R {1}.  True dispersion parameter is " \
        "{2}".format(h2o_estimate, r_estimate, expected)
def glm_gamma_offset_mojo():
    """Compare MOJO and in-cluster predictions for a gamma GLM with an offset.

    Loads the prostate dataset, appends one random real column that is used
    as the offset column ("C1"), builds and saves a gamma GLM as a MOJO into
    a fresh temporary directory, and then checks that predictions produced
    by the MOJO match the predictions of the in-cluster model.

    The temporary directory (input CSV, MOJO and ``h2oPred.csv``) is left on
    disk after the run.
    """
    data_path = pyunit_utils.locate(
        "smalldata/prostate/prostate_complete.csv.zip")
    frame = h2o.import_file(path=data_path)

    response = "DPROS"
    predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL"]
    # Scoring needs the offset column in addition to the predictors.
    scoring_columns = predictors + ["C1"]
    model_params = {'family': "gamma", 'offset_column': "C1"}

    # The generated single column is referenced as "C1" once bound to the frame.
    offset_column = pyunit_utils.random_dataset_real_only(
        frame.nrow, 1, realR=3, misFrac=0, randSeed=12345)
    frame = frame.cbind(offset_column)

    work_dir = tempfile.mkdtemp()
    # Build the model and save it as a MOJO under work_dir.
    model = pyunit_utils.build_save_model_generic(
        model_params, predictors, frame, response, "glm", work_dir)
    mojo_name = pyunit_utils.getMojoName(model._id)

    # Both the in-cluster and the MOJO prediction read this same input file.
    input_csv = os.path.join(work_dir, 'in.csv')
    h2o.download_csv(frame[scoring_columns], input_csv)

    # Load the saved model and predict with both back ends.
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(work_dir, "h2oPred.csv"))

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
Beispiel #3
0
def random_dataset(nrow,
                   ncol,
                   realFrac=0.4,
                   intFrac=0.3,
                   enumFrac=0.3,
                   factorR=10,
                   integerR=100,
                   responseFactor=1,
                   misFrac=0.01,
                   randSeed=None):
    """Create a random H2O frame with real, categorical and integer columns.

    When a single, all-real column is requested (``ncol == 1`` and
    ``realFrac >= 1.0``) the frame comes straight from ``h2o.create_frame``.
    Otherwise the real columns are generated separately with
    ``pyunit_utils.random_dataset_real_only`` (named ``c_0``, ``c_1``, ...)
    and column-bound to a second frame holding the categorical and integer
    columns.

    Args:
        nrow: number of rows.
        ncol: total number of columns requested.
        realFrac: fraction of columns that should be real-valued.
        intFrac: accepted for interface compatibility; the incoming value is
            not used -- the integer share is recomputed as one minus the
            categorical share.
        enumFrac: base categorical fraction; it is increased by
            ``(1 - realFrac) / 2`` before being passed to ``create_frame``.
        factorR: accepted but not used by this function.
        integerR: passed to ``create_frame`` as ``integer_range``.
        responseFactor: passed to ``create_frame`` as ``response_factors``.
        misFrac: fraction of missing values.
        randSeed: seed forwarded to both generators.

    Returns:
        The generated frame (created with ``has_response=True``).
    """
    # Arguments common to both create_frame calls.
    shared_args = dict(rows=nrow,
                       missing_fraction=misFrac,
                       has_response=True,
                       response_factors=responseFactor,
                       integer_range=integerR,
                       seed=randSeed)

    if ncol == 1 and realFrac >= 1.0:
        # Single real column: no need to assemble the frame from two parts.
        return h2o.create_frame(cols=ncol,
                                real_fraction=1,
                                categorical_fraction=0,
                                integer_fraction=0,
                                time_fraction=0,
                                string_fraction=0,
                                binary_fraction=0,
                                **shared_args)

    real_cols = pyunit_utils.random_dataset_real_only(nrow,
                                                      int(realFrac * ncol),
                                                      misFrac=misFrac,
                                                      randSeed=randSeed)
    real_cols.set_names(['c_' + str(idx) for idx in range(real_cols.ncol)])

    # The remaining columns are split between categorical and integer only.
    categorical_share = enumFrac + (1 - realFrac) / 2
    integer_share = 1 - categorical_share

    other_cols = h2o.create_frame(cols=(ncol - real_cols.ncol),
                                  real_fraction=0,
                                  categorical_fraction=categorical_share,
                                  integer_fraction=integer_share,
                                  time_fraction=0,
                                  string_fraction=0,
                                  binary_fraction=0,
                                  **shared_args)
    return real_cols.cbind(other_cols)
Beispiel #4
0
def gam_ordinal_mojo():
    """Compare MOJO and in-cluster predictions for a GAM on random data.

    Generates a random "multinomial" dataset, appends three random real
    columns (``gam_col1``..``gam_col3``), and selects up to three real-typed
    columns (in frame order, excluding ``response``) as GAM columns, each
    with scale 0.001.  The first 200 rows are held out for scoring; the rest
    train the model.  The model is saved as a MOJO into a fresh temporary
    directory and its predictions are compared with the in-cluster ones.

    The temporary directory is left on disk after the run.
    """
    h2o.remove_all()
    n_test_rows = 200  # rows held out for scoring
    problem = "multinomial"
    params = set_params()  # base model parameters, defined elsewhere in the file

    base_frame = pyunit_utils.random_dataset(problem, missing_fraction=0.001)
    extra_reals = pyunit_utils.random_dataset_real_only(nrow=base_frame.nrow,
                                                        ncol=3)
    extra_reals.set_names(["gam_col1", "gam_col2", "gam_col3"])
    df = base_frame.cbind(extra_reals)

    # GAM-specific parameters: pick the first real-typed, non-response columns.
    max_gam_cols = 3
    gam_columns = []
    scales = []
    params["gam_columns"] = gam_columns
    params["scale"] = scales
    for cname in df.names:
        if cname == 'response' or str(df.type(cname)) != "real":
            continue
        gam_columns.append(cname)
        scales.append(0.001)
        if len(gam_columns) >= max_gam_cols:
            break

    train = df[n_test_rows:, :]
    test = df[:n_test_rows, :]
    predictors = list(set(df.names) - {"response"})

    work_dir = tempfile.mkdtemp()
    # Build the model and save it as a MOJO under work_dir.
    gam_model = pyunit_utils.build_save_model_generic(
        params, predictors, train, "response", "gam", work_dir)
    mojo_name = pyunit_utils.getMojoName(gam_model._id)

    # Both the in-cluster and the MOJO prediction read this same input file.
    input_csv = os.path.join(work_dir, 'in.csv')
    h2o.download_csv(test[predictors], input_csv)

    # Load the saved model and predict with both back ends.
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(gam_model, work_dir,
                                                    mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(work_dir, "h2oPred.csv"))

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)