Example #1
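# Note: these snippets are excerpted from H2O-3 Python unit tests. Each one
# assumes the usual test-harness imports (h2o, os, sys, re, random, tempfile,
# subprocess, pyunit_utils, and the estimator classes it references) plus
# file-local helpers such as set_params(), build_save_model(), and
# save_GLRM_mojo(); those definitions are not repeated here.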
def test_anovaglm_serialization():
    train = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/prostate/prostate_complete.csv.zip"))
    y = 'CAPSULE'
    x = ['AGE', 'VOL', 'DCAPS']
    train[y] = train[y].asfactor()
    anovaglm_model = anovaglm(family='binomial',
                              lambda_=0,
                              missing_values_handling="skip")
    anovaglm_model.train(x=x, y=y, training_frame=train)

    tmpdir = tempfile.mkdtemp()
    model_path = anovaglm_model.download_model(tmpdir)
    result_frame_filename = os.path.join(tmpdir, "result_frame.csv")
    h2o.download_csv(anovaglm_model.result(), result_frame_filename)

    h2o.remove_all()
    result_frame_original = h2o.import_file(result_frame_filename)
    loaded_anovaglm_model = h2o.load_model(model_path)
    result_frame_loaded = loaded_anovaglm_model.result()
    for cind in range(result_frame_original.ncols):
        for rind in range(result_frame_original.nrows):
            if result_frame_original.type(cind) == 'real':
                assert abs(result_frame_original[rind, cind]-result_frame_loaded[rind, cind]) < 1e-6, \
                    "Expected: {0}. Actual: {1}".format(result_frame_original[rind, cind], result_frame_loaded[rind, cind])
            else:
                assert result_frame_original[rind, cind]==result_frame_loaded[rind, cind], \
                    "Expected: {0}. Actual: {1}".format(result_frame_original[rind, cind], result_frame_loaded[rind, cind])
def gam_gaussian_mojo():
    h2o.remove_all()
    NTESTROWS = 200    # number of test dataset rows
    PROBLEM="gaussian"
    params = set_params()   # set GAM model parameters
    df = pyunit_utils.random_dataset(PROBLEM, missing_fraction=0.001)   # generate random dataset
    dfnames = df.names
    # add GAM specific parameters
    params["gam_columns"] = []
    params["scale"] = []
    count = 0
    num_gam_cols = 3    # maximum number of gam columns
    for cname in dfnames:
        if cname != 'response' and str(df.type(cname)) == "real":
            params["gam_columns"].append(cname)
            params["scale"].append(0.001)
            count += 1
            if count >= num_gam_cols:
                break
    
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    TMPDIR = tempfile.mkdtemp()
    gamGaussianModel = pyunit_utils.build_save_model_generic(params, x, train, "response", "gam", TMPDIR) # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(gamGaussianModel._id)
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(gamGaussianModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # compare mojo and h2o predictions
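
For context, set_params() above is a file-local helper that returns the model's constructor arguments. A minimal hypothetical sketch for the GAM case (the real helper in the original test file may differ):

def set_params():
    # Hypothetical stand-in for the original helper; returns keyword
    # arguments passed through to the H2O estimator.
    return {"family": "gaussian"}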
def glm_fractional_binomial_mojo_pojo():
    params = set_params()
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/fraction_binommialOrig.csv"))
    test = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/fraction_binommialOrig.csv"))
    x = ["log10conc"]
    y = "y"

    glmModel = pyunit_utils.build_save_model_GLM(
        params, x, train, y)  # build and save mojo model

    MOJONAME = pyunit_utils.getMojoName(glmModel._id)
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..",
                     "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(
        TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        glmModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmModel, TMPDIR, MOJONAME)
    pred_h2o = pred_h2o.drop(3)  # drop the extra 4th column so the H2O and MOJO frames align
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(
        pred_h2o, pred_mojo, 0.1, tol=1e-10
    )  # compare mojo and h2o predictions
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
Example #4
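# PROBLEM, NTESTROWS, TMPDIR, and MOJONAME below are module-level constants
# defined elsewhere in the original test file (not shown).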
def deeplearning_mojo_pojo():
    h2o.remove_all()

    params = set_params()  # set deeplearning model parameters
    df = random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    try:
        deeplearningModel = build_save_model(
            params, x, train)  # build and save mojo model
        h2o.download_csv(test[x], os.path.join(
            TMPDIR,
            'in.csv'))  # save test file, h2o predict/mojo use same file
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
            deeplearningModel, TMPDIR,
            MOJONAME)  # load model and perform predict
        # pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
        h2o.save_model(deeplearningModel, path=TMPDIR,
                       force=True)  # save model for debugging
        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_h2o, pred_mojo, 0.1, tol=1e-6)
    #  print("Comparing pojo predict and h2o predict...")
    #  pyunit_utils.compare_numeric_frames(pred_mojo, pred_pojo, 0.1, tol=1e-6)
    except Exception as ex:
        print("***************  ERROR and type is ")
        print(str(type(ex)))
        print(ex)
        if "AssertionError" in str(
                type(ex)
        ):  # only care if there is an AssertionError, ignore the others
            sys.exit(1)
def pubdev_1431(ip, port):

    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        airlines_billion_file_1 = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file_1)
        airlines_billion_1 = h2o.import_file(url)

        airlines_billion_1[30] = airlines_billion_1[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion_1[0:30],
                      y=airlines_billion_1[30],
                      ntrees=1,
                      distribution="bernoulli",
                      max_depth=1)

        predictions = gbm.predict(airlines_billion_1)

        csv = os.path.join(os.getcwd(), "delete.csv")
        h2o.download_csv(predictions, csv)

        airlines_billion_2 = h2o.import_file(csv)
        os.remove(csv)

        r1, c1 = airlines_billion_1.dim
        r2, c2 = airlines_billion_2.dim
        assert r1 == r2 and c1 == c2, "Expected rows to be equal. r1: {0} and r2: {1}. Expected cols to be equal. c1: {2} " \
                                      "c2: {3}".format(r1, r2, c1, c2)
    else:
        print "Not running on H2O internal network.  No access to HDFS."
Example #6
def glm_binomial_mojo_pojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    PROBLEM = "binomial"
    params = set_params()  # set GLM model parameters
    df = pyunit_utils.random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})
    TMPDIR = tempfile.mkdtemp()
    glmBinomialModel = pyunit_utils.build_save_model_generic(
        params, x, train, "response", "glm",
        TMPDIR)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glmBinomialModel._id)

    h2o.download_csv(test[x], os.path.join(
        TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        glmBinomialModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmBinomialModel, TMPDIR, MOJONAME)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(
        pred_h2o, pred_mojo, 0.1, tol=1e-10
    )  # compare mojo and h2o predictions
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
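# As above, TMPDIR and MOJONAME below are module-level constants from the
# original test file (not shown).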
def run_comparison_tests(auto_encoder, act_fun, missing_values_handling, set_all_factor, train, test, x):
    # set deeplearning model parameters
    params = set_params(act_fun, missing_values_handling, set_all_factor, auto_encoder) 
    
    if auto_encoder:
        try:
            # build and save mojo model
            deeplearning_model = build_save_model(params, x, train) 
        except Exception as err:
            if not("Trying to predict with an unstable model" in err.args[0]):
                raise Exception('Deeplearning autoencoder model failed to build.  Fix it.')
            return
    else:
        # build and save mojo model
        deeplearning_model = build_save_model(params, x, train) 

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearning_model, TMPDIR, MOJONAME)  
    pred_pojo = pyunit_utils.pojo_predict(deeplearning_model, TMPDIR, MOJONAME)
    # save model for debugging
    h2o.save_model(deeplearning_model, path=TMPDIR, force=True)  
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o, pred_mojo, prob=1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo, pred_pojo, prob=1, tol=1e-10)
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200    # number of test dataset rows
    df = pyunit_utils.random_dataset("regression")       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types)-1)]
    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows==train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel) # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True) # save mojo predict
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False) # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_"+frameID)   # store the x Factor for new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
def test_gam_transformed_frame_serialization():
    h2o_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"
    ))
    h2o_data["C1"] = h2o_data["C1"].asfactor()
    h2o_data["C2"] = h2o_data["C2"].asfactor()
    myX = ["C1", "C2"]
    myY = "C11"
    h2o_data["C11"] = h2o_data["C11"].asfactor()
    h2o_model = H2OGeneralizedAdditiveEstimator(family="multinomial",
                                                gam_columns=["C6", "C7", "C8"],
                                                keep_gam_cols=True,
                                                scale=[1, 1, 1],
                                                num_knots=[5, 5, 5])
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    gam_frame = h2o.get_frame(
        h2o_model._model_json["output"]["gam_transformed_center_key"])
    tmpdir = tempfile.mkdtemp()
    filename = os.path.join(tmpdir, "gamXFrame.csv")
    h2o.download_csv(gam_frame, filename)
    model_path = h2o.save_model(h2o_model, tmpdir)

    h2o.remove_all()
    loaded_model = h2o.load_model(model_path)
    gam_frame_loaded = h2o.get_frame(
        loaded_model._model_json["output"]["gam_transformed_center_key"])
    gam_frame_original = h2o.import_file(filename)
    pyunit_utils.compare_frames_local(gam_frame_loaded[2:15],
                                      gam_frame_original[2:15],
                                      prob=1,
                                      tol=1e-6)
    print("Test completed.")
def runComparisonTests(autoEncoder, actFun, missingValuesHandling,
                       setAllFactor, train, test, x):
    params = set_params(actFun, missingValuesHandling, setAllFactor,
                        autoEncoder)  # set deeplearning model parameters

    if autoEncoder:
        try:
            deeplearningModel = build_save_model(
                params, x, train)  # build and save mojo model
        except Exception as err:
            if not ("Trying to predict with an unstable model" in err.args[0]):
                raise Exception(
                    'Deeplearning autoencoder model failed to build.  Fix it.')
            return
    else:
        deeplearningModel = build_save_model(
            params, x, train)  # build and save mojo model

    h2o.download_csv(test[x], os.path.join(
        TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        deeplearningModel, TMPDIR, MOJONAME)  # load model and perform predict
    pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
    h2o.save_model(deeplearningModel, path=TMPDIR,
                   force=True)  # save model for debugging
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o,
                                                   pred_mojo,
                                                   prob=1,
                                                   tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo,
                                                   pred_pojo,
                                                   prob=1,
                                                   tol=1e-10)
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(range(5000,15001),1)[0]
    dataset_params['cols'] = random.sample(range(10,21),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    print "Dataset parameters: {0}".format(dataset_params)

    append_response = False
    distribution = random.sample(['bernoulli','multinomial','gaussian','poisson','tweedie','gamma'], 1)[0]
    if   distribution == 'gaussian':  dataset_params['response_factors'] = 1
    elif distribution == 'bernoulli': dataset_params['response_factors'] = 2
    elif distribution == 'multinomial': dataset_params['response_factors'] = random.randint(3,100)
    else:
        dataset_params['has_response'] = False
        response = h2o.H2OFrame.fromPython([random.randint(1,1000) for r in range(0,dataset_params['rows'])])
        append_response = True
    print "Distribution: {0}".format(distribution)

    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0,"response")
    if distribution == 'bernoulli' or distribution == 'multinomial': train['response'] = train['response'].asfactor()
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train,os.path.join(results_dir,"training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0,1): params['ntrees'] = random.sample(range(1,21),1)[0]
    if random.randint(0,1): params['max_depth'] = random.sample(range(1,11),1)[0]
    if random.randint(0,1): params['min_rows'] = random.sample(range(1,11),1)[0]
    if random.randint(0,1): params['nbins'] = random.sample(range(2,21),1)[0]
    if random.randint(0,1): params['nbins_cats'] = random.sample(range(2,1025),1)[0]
    if random.randint(0,1): params['learn_rate'] = random.random()
    params['distribution'] = distribution
    print "Parameter list: {0}".format(params)

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="gbm", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)
def glm_gamma_offset_mojo():
    train = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/prostate/prostate_complete.csv.zip"))
    y = "DPROS"
    x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL"]
    x_offset = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "C1"]
    params = {'family': "gamma", 'offset_column': "C1"}
    offset = pyunit_utils.random_dataset_real_only(train.nrow,
                                                   1,
                                                   realR=3,
                                                   misFrac=0,
                                                   randSeed=12345)
    train = train.cbind(offset)

    tmpdir = tempfile.mkdtemp()
    glm_gamma_model = pyunit_utils.build_save_model_generic(
        params, x, train, y, "glm", tmpdir)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glm_gamma_model._id)

    h2o.download_csv(train[x_offset], os.path.join(
        tmpdir, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        glm_gamma_model, tmpdir, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(tmpdir, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(
        pred_h2o, pred_mojo, 0.1, tol=1e-10)  # compare mojo and model predict
def test_modelselection_backward_serialization():
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "GLEASON"
    x = ["ID","AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS"]
    # make sure duplicate runs produce same results
    model_backward = modelSelection(seed=12345, mode="backward", family='negativebinomial', link="log",alpha=0.5,
                                      lambda_=0, theta=0.01)
    model_backward.train(training_frame=d, x=x, y=y)
    model_backward2 = modelSelection(seed=12345, mode="backward", family='negativebinomial', link="log",alpha=0.5,
                                    lambda_=0, theta=0.01)
    model_backward2.train(training_frame=d, x=x, y=y)
    result = model_backward.result()    # get result frame
    result2 = model_backward2.result()  # get result frame from the second, identical run
    pyunit_utils.compare_frames_local(result[2:5], result2[2:5], prob=1.0)  # results from duplicate runs should be the same

    num_models = result.nrows           # number of models built
    one_model = h2o.get_model(result["model_id"][num_models-1, 0])
    predict_frame = one_model.predict(d)
    tmpdir = tempfile.mkdtemp()
    file_dir = os.path.join(tmpdir, "predict.csv")
    h2o.download_csv(predict_frame, file_dir) # save one scoring frame
    model_path_backward = model_backward.download_model(tmpdir) # store the model

    h2o.remove_all()
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    loaded_backward_model = h2o.load_model(model_path_backward)    
    result_frame_backward = loaded_backward_model.result()

    model_from_frame_backward = h2o.get_model(result_frame_backward["model_id"][num_models-1, 0])
    pred_frame_backward = model_from_frame_backward.predict(d)
    pred_frame_model = h2o.import_file(file_dir)
    pyunit_utils.compare_frames_local(pred_frame_backward, pred_frame_model, prob=1.0)
Example #14
def gam_binomial_mojo():
    params = set_params()
    train = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    test = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    train["C21"] = train["C21"].asfactor()
    test["C21"] = test["C21"].asfactor()
    x = ["C1"]
    y = "C21"

    TMPDIR = tempfile.mkdtemp()
    gamModel = pyunit_utils.build_save_model_generic(
        params, x, train, y, "gam", TMPDIR)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(gamModel._id)

    h2o.download_csv(test, os.path.join(
        TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        gamModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(
        pred_h2o, pred_mojo, 1, tol=1e-10
    )  # compare mojo and h2o predictions
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(100, 200)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(
        left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] -= 0.1
        else:
            dataset_params['categorical_fraction'] -= 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.01)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 50)
    print("Dataset parameters: {0}".format(dataset_params))

    train = h2o.create_frame(**dataset_params)

    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(
        train, os.path.join(results_dir, "pca_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0, 1):
        params['max_iterations'] = random.sample(list(range(1, 1000)), 1)[0]
    if random.randint(0, 1):
        params['transform'] = random.sample(
            ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"], 1)[0]
    realNcol = train.ncol - 1
    params['k'] = random.sample(list(range(1, min(realNcol, train.nrow))),
                                1)[0]

    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="pca",
                             equality=None,
                             train=train,
                             test=None,
                             x=x,
                             y=y,
                             compile_only=True,
                             **params)
def pubdev_1431(ip, port):

    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        airlines_billion_file_1 = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file_1)
        airlines_billion_1 = h2o.import_file(url)

        airlines_billion_1[30] = airlines_billion_1[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion_1[0:30], y=airlines_billion_1[30], ntrees=1, distribution="bernoulli", max_depth=1)

        predictions = gbm.predict(airlines_billion_1)

        csv = os.path.join(os.getcwd(),"delete.csv")
        h2o.download_csv(predictions,csv)

        airlines_billion_2 = h2o.import_file(csv)
        os.remove(csv)

        r1, c1 = airlines_billion_1.dim
        r2, c2 = airlines_billion_2.dim
        assert r1 == r2 and c1 == c2, "Expected rows to be equal. r1: {0} and r2: {1}. Expected cols to be equal. c1: {2} " \
                                      "c2: {3}".format(r1, r2, c1, c2)
    else:
        print "Not running on H2O internal network.  No access to HDFS."
def glm_multinomial_mojo_pojo():
    PROBLEM = "multinomial"
    NTESTROWS = 200
    params = set_params()  # set GLM model parameters
    df = pyunit_utils.random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    glmMultinomialModel = pyunit_utils.build_save_model_GLM(
        params, x, train, "response")  # build and save mojo model

    MOJONAME = pyunit_utils.getMojoName(glmMultinomialModel._id)
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..",
                     "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(
        TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        glmMultinomialModel, TMPDIR,
        MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmMultinomialModel, TMPDIR,
                                          MOJONAME)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(
        pred_h2o, pred_mojo, 0.1, tol=1e-10
    )  # compare mojo and h2o predictions
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
Example #18
def glrm_mojo():
    h2o.remove_all()
    train = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_mojo_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_mojo_test.csv"))
    predict_10iter = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_predict_10iter.csv"))
    predict_1iter = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_predict_1iter.csv"))

    x = train.names
    transformN = "STANDARDIZE"

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234, init="random")
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows==train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel) # save mojo model
    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    # test and make sure setting the iteration number did not screw up the prediction
    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=100) # save mojo predict
    pred_h2o = h2o.get_frame("GLRMLoading_"+predID)
    print("Comparing mojo x Factor and model x Factor for 100 iterations")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=1) # save mojo predict
    print("Comparing mojo x Factor and model x Factor for 1 iterations")
    pyunit_utils.compare_frames_local(predict_1iter, pred_mojo, 1, tol=1e-10)
    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=10) # save mojo predict
    print("Comparing mojo x Factor and model x Factor for 10 iterations")
    pyunit_utils.compare_frames_local(predict_10iter, pred_mojo, 1, tol=1e-10)
def custom_distribution_mojo_test():
    rows = 2000
    df = random_dataset('binomial', verbose=False, NTESTROWS=rows)
    df['response'] = df['response'].asnumeric()
    train = df[rows:, :]
    test = df[:rows, :]
    x = list(set(df.names) - {"response"})

    params = {
        'ntrees': 10,
        'max_depth': 4,
        'distribution': "custom",
        'custom_distribution_func': custom_distribution_bernoulli()
    }

    my_gbm = build_save_model_GBM(params, x, train, "response")
    mojo_name = getMojoName(my_gbm._id)
    tmp_dir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..",
                     "results", mojo_name))

    h2o.download_csv(test[x], os.path.join(
        tmp_dir, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = mojo_predict(
        my_gbm, tmp_dir, mojo_name)  # load model and perform predict
    assert compare_frames_local(
        pred_h2o, pred_mojo, returnResult=True
    ), "Predictions from model and MOJO model are not the same."
def deeplearning_mojo_pojo():
    h2o.remove_all()

    params = set_params()   # set deeplearning model parameters
    df = random_dataset(PROBLEM)       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    try:
        deeplearningModel = build_save_model(params, x, train) # build and save mojo model
        h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearningModel, TMPDIR, MOJONAME)  # load model and perform predict
        pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
        h2o.save_model(deeplearningModel, path=TMPDIR, force=True)  # save model for debugging
        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # compare mojo and h2o predictions
        print("Comparing pojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_mojo, pred_pojo, 0.1, tol=1e-10)
    except Exception as ex:
        print("***************  ERROR and type is ")
        print(str(type(ex)))
        print(ex)
        if "AssertionError" in str(type(ex)):   # only care if there is an AssertionError, ignore the others
            sys.exit(1)
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000,15001)),1)[0]
    dataset_params['cols'] = random.sample(list(range(10,21)),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    print("Dataset parameters: {0}".format(dataset_params))

    append_response = False
    family = random.sample(['binomial','gaussian','poisson','tweedie','gamma'], 1)[0]
    if   family == 'binomial':  dataset_params['response_factors'] = 2
    elif family == 'gaussian':  dataset_params['response_factors'] = 1
    else:
        dataset_params['has_response'] = False
        response = h2o.H2OFrame([random.randint(1,1000) for r in range(0,dataset_params['rows'])])
        append_response = True
    print("Family: {0}".format(family))

    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0,"response")
    if family == 'binomial': train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"],os.path.join(results_dir,"glm_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train,os.path.join(results_dir,"glm_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0,1): params['alpha'] = random.random()
    params['family'] = family
    if params['family'] == "tweedie":
        if random.randint(0,1):
            params['tweedie_variance_power'] = round(random.random()+1,6)
            params['tweedie_link_power'] = 1 - params['tweedie_variance_power']
    print("Parameter list: {0}".format(params))

    x = list(range(1,train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="glm", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)
def test_csv_parser_column_skip():
    # generate a big frame with all datatypes and save it to csv.  Load it back with different skipped_columns settings
    nrow = 10000
    ncol = 100
    seed = 12345
    frac1 = 0.16
    frac2 = 0.2
    f1 = h2o.create_frame(rows=nrow, cols=ncol, real_fraction=frac1, categorical_fraction=frac1, integer_fraction=frac1,
                          binary_fraction=frac1, time_fraction=frac1, string_fraction=frac2, missing_fraction=0.1,
                          has_response=False, seed=seed)
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results"))
    if not (os.path.isdir(tmpdir)):
        os.mkdir(tmpdir)
    savefilenamewithpath = os.path.join(tmpdir, 'in.csv')
    h2o.download_csv(f1, savefilenamewithpath)

    # build lists of column indices to skip
    skip_all = list(range(f1.ncol))
    skip_even = list(range(0, f1.ncol, 2))
    skip_odd = list(range(1, f1.ncol, 2))
    skip_start_end = [0, f1.ncol - 1]
    skip_except_last = list(range(0, f1.ncol - 2))
    skip_except_first = list(range(1, f1.ncol))
    temp = list(range(0, f1.ncol))
    random.shuffle(temp)
    skip_random = []
    for index in range(0, f1.ncol // 2):
        skip_random.append(temp[index])
    skip_random.sort()

    try:
        h2o.upload_file(savefilenamewithpath, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except Exception:  # narrow except so sys.exit(1) is not swallowed
        pass

    try:
        h2o.import_file(savefilenamewithpath, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except Exception:
        pass

    # skip even columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_odd)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_random)
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000,15001)),1)[0]
    dataset_params['cols'] = random.sample(list(range(10,21)),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    print("Dataset parameters: {0}".format(dataset_params))

    append_response = False
    family = random.sample(['binomial','gaussian','poisson','tweedie','gamma'], 1)[0]
    if   family == 'binomial':  dataset_params['response_factors'] = 2
    elif family == 'gaussian':  dataset_params['response_factors'] = 1
    else:
        dataset_params['has_response'] = False
        response = h2o.H2OFrame([[random.randint(1,1000)] for r in range(0,dataset_params['rows'])])
        append_response = True
    print("Family: {0}".format(family))

    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0,"response")
    if family == 'binomial': train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"],os.path.join(results_dir,"glm_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train,os.path.join(results_dir,"glm_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0,1): params['alpha'] = random.random()
    params['family'] = family
    if params['family'] == "tweedie":
        if random.randint(0,1):
            params['tweedie_variance_power'] = round(random.random()+1,6)
            params['tweedie_link_power'] = 1 - params['tweedie_variance_power']
    print("Parameter list: {0}".format(params))

    x = list(range(1,train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="glm", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)
def test_csv_parser_column_skip():
    # generate a big frame with all datatypes and save it to csv.  Load it back with different skipped_columns settings
    nrow = 10000
    ncol = 100
    seed = 12345
    frac1 = 0.16
    frac2 = 0.2
    f1 = h2o.create_frame(rows=nrow,
                          cols=ncol,
                          real_fraction=frac1,
                          categorical_fraction=frac1,
                          integer_fraction=frac1,
                          binary_fraction=frac1,
                          time_fraction=frac1,
                          string_fraction=frac2,
                          missing_fraction=0.1,
                          has_response=False,
                          seed=seed)
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..",
                     "results"))
    if not (os.path.isdir(tmpdir)):
        os.mkdir(tmpdir)
    savefilenamewithpath = os.path.join(tmpdir, 'in.csv')
    h2o.download_csv(f1, savefilenamewithpath)

    # build lists of column indices to skip
    skip_all = list(range(f1.ncol))
    skip_start_end = [0, f1.ncol - 1]
    skip_except_last = list(range(0, f1.ncol - 2))
    skip_except_first = list(range(1, f1.ncol))
    temp = list(range(0, f1.ncol))
    random.shuffle(temp)
    skip_random = []
    for index in range(0, f1.ncol // 2):
        skip_random.append(temp[index])
    skip_random.sort()

    try:
        h2o.import_file(savefilenamewithpath, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except Exception:  # narrow except so sys.exit(1) is not swallowed
        pass

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(f1, savefilenamewithpath, skip_random)
Example #25
def javapredict(algo, train, test, x, y, **kwargs):
    print "Creating model in H2O"
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    else:
        raise ValueError("algo {0} is not supported".format(algo))
    print(model)

    print "Downloading Java prediction model code from H2O"
    tmpdir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","results",model._id))
    os.makedirs(tmpdir)
    h2o.download_pojo(model,path=tmpdir)

    print "Predicting in H2O"
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    h2o.download_csv(predictions,os.path.join(tmpdir,"out_h2o.csv"))

    print "Setting up for Java POJO"
    h2o.download_csv(test[x],os.path.join(tmpdir,"in.csv"))
    # hack: the PredictCsv driver can't handle quoted strings, so remove them
    with open(os.path.join(tmpdir, "in.csv"), 'r+') as f:
        in_csv = f.read()
        in_csv = re.sub('\"', '', in_csv)
        f.seek(0)
        f.write(in_csv)
        f.truncate()

    subprocess.call(["javac", "-cp", os.path.join(tmpdir,"h2o-genmodel.jar"), "-J-Xmx4g", "-J-XX:MaxPermSize=256m", os.path.join(tmpdir,model._id+".java")], stderr=subprocess.STDOUT)
    subprocess.call(["java", "-ea", "-cp", os.path.join(tmpdir,"h2o-genmodel.jar")+":{0}".format(tmpdir), "-Xmx4g", "-XX:MaxPermSize=256m", "-XX:ReservedCodeCacheSize=256m", "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id, "--input", os.path.join(tmpdir,"in.csv"), "--output", os.path.join(tmpdir,"out_pojo.csv")], stderr=subprocess.STDOUT)

    predictions2 = h2o.import_file(os.path.join(tmpdir,"out_pojo.csv"))

    print "Comparing predictions between H2O and Java POJO"
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(hr, pr)
    assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(hc, pc)

    # Value
    for r in range(hr):
        hp = predictions[r,0]
        if algo == "gbm":
            pp = float.fromhex(predictions2[r,0])
            assert abs(hp - pp) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r,hp, pp)
        elif algo == "random_forest":
            pp = predictions2[r,0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r,hp, pp)
        else:
            raise ValueError("algo {0} is not supported".format(algo))
def javapredict_dynamic_data():

    dataset_params = {}
    dataset_params['rows'] = 13183
    dataset_params['cols'] = 13
    dataset_params['categorical_fraction'] = 0.4
    dataset_params['integer_fraction'] = 0.3
    dataset_params['missing_fraction'] = 0.27539154084819495
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = 819
    print("Dataset parameters: {0}".format(dataset_params))

    problem = 2
    print(
        "Model-building exercise (0:regression, 1:binomial, 2:multinomial): {0}"
        .format(problem))
    if problem == 1: dataset_params['response_factors'] = 2
    elif problem == 0: dataset_params['response_factors'] = 1
    else: dataset_params['response_factors'] = 16

    train = h2o.create_frame(**dataset_params)
    if problem == 1 or problem == 2:
        train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(
        train["response"],
        os.path.join(results_dir, "drf_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(
        train, os.path.join(results_dir, "drf_dynamic_training_dataset.log"))

    params = {}
    params['nbins'] = 5
    params['min_rows'] = 7
    params['mtries'] = 4
    params['sample_rate'] = 0.7867986759373544
    params['seed'] = 1304644573760597606
    print("Parameter list: {0}".format(params))

    x = list(range(1, train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="random_forest",
                             equality=None,
                             train=train,
                             test=None,
                             x=x,
                             y=y,
                             compile_only=True,
                             **params)
Example #27
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000,15001)),1)[0]
    dataset_params['cols'] = random.sample(list(range(10,21)),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    print("Dataset parameters: {0}".format(dataset_params))

    problem = random.sample(list(range(0,3)),1)[0]
    print("Model-building exercise (0:regression, 1:binomial, 2:multinomial): {0}".format(problem))
    if   problem == 1:  dataset_params['response_factors'] = 2
    elif problem == 0:  dataset_params['response_factors'] = 1
    else:               dataset_params['response_factors'] = random.randint(3,100)


    train = h2o.create_frame(**dataset_params)
    if problem == 1 or problem == 2: train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"],os.path.join(results_dir,"drf_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train,os.path.join(results_dir,"drf_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0,1): params['ntrees'] = random.sample(list(range(1,21)),1)[0]
    if random.randint(0,1): params['max_depth'] = random.sample(list(range(1,11)),1)[0]
    if random.randint(0,1): params['min_rows'] = random.sample(list(range(1,11)),1)[0]
    if random.randint(0,1): params['nbins'] = random.sample(list(range(2,21)),1)[0]
    if random.randint(0,1): params['nbins_cats'] = random.sample(list(range(2,1025)),1)[0]
    if random.randint(0,1): params['mtries'] = random.sample(list(range(1,dataset_params['cols']+1)),1)[0]
    if random.randint(0,1): params['sample_rate'] = random.random()
    print("Parameter list: {0}".format(params))

    x = list(range(1,train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="random_forest", equality=None, train=train, test=None, x=x, y=y, compile_only=True,
                             **params)
def impute_data(method = "mean", 
                to_impute = to_impute,
                predictors = predictors):
  if method == "mean":
    print "Mean imputing missing data for predictors:", to_impute
    # find mean for each time period in data for each predictor, save them in a matrix with a col for the mean values of each predictor
    # then on holdout use this table to fill in all missing values based on the time period (row) and the variable (col) of this matrix
    
    #if using python module h2o-3.1.0.3131: grouped = data.group_by(["time_period"])
    #                         gm = [grouped.mean(predictor, na="rm").get_frame() for predictor in to_impute]
    gm = d["time_period"].unique()
    print "Finding means..."
    for predictor in to_impute:
      gm = gm.cbind(d.group_by(["time_period"], {predictor:["mean", d.names().index(predictor), "rm"]}, order_by = 0))
    gm.show()
    print "Saving the imputation means to disk..."
    h2o.download_csv(gm, filename = saving_means_fp)
    # df_py = h2o.as_list(gm)
    # Now that's stored for the holdout data, do this a faster way in java for the training data:
    for predictor in to_impute:
      d.impute(predictor, method='mean', by = ['time_period'], inplace = True)
      print "Done imputing", predictor
    print "Saving the final mean imputed data to disk..."
    h2o.export_file(frame = d, path =saving_meanImputed_fp, force=True)
  
  if method == "model":
    # sequentially impute 'newdata', not 'data', so the order of the predictor variables in the loop does not matter
    # otherwise, you would be using increasingly imputed data to make predictions as the loop progresses.
    newdata = d
    # With training data, build a model for each col and predict missing data, save the models, use them on the holdout data to predict all missing data.
    for predictor in to_impute:
      print "Building model for imputing " + predictor
      print "Subsetting the data into missing values for predictor and no missing values for predictor"
      na_ind = d[predictor].isna()
      not_na_ind = na_ind != 1.0
      to_train = d[not_na_ind]
      to_predict = d[na_ind]
      these_var = [var for var in predictors if var != predictor]
      trained = h2o.gbm(x = to_train[these_var],
                        y = to_train[[predictor]],
                        ntrees=300,
                        max_depth=6,
                        learn_rate=0.2)
      print "Saving the imputation tree model for " + predictor
      h2o.save_model(trained, dir = saving_models_fp, name = "dl_imputation_model_" + predictor)
      print "Imputing the missing " +  predictor + " data by predicting with the model..."
      predicted = trained.predict(to_predict[these_var])
      tofillin = newdata[predictor]
      assert len(predicted) == len(tofillin[na_ind])
      tofillin[na_ind] = predicted # mutate the column in place
      newdata[predictor] = tofillin
    
    print "Saving the final model-imputed data to disk..."
    h2o.export_file(frame = d, path =saving_modelImputed_fp, force=True)
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000, 15001)), 1)[0]
    dataset_params['cols'] = random.sample(list(range(10, 21)), 1)[0]
    dataset_params['categorical_fraction'] = round(random.random(), 1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(
        left_over - round(random.uniform(0, left_over), 1), 1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] -= 0.1
        else:
            dataset_params['categorical_fraction'] -= 0.1
    dataset_params['missing_fraction'] = random.uniform(0, 0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2, 2000)
    dataset_params['response_factors'] = random.randint(3, 100)
    print("Dataset parameters: {0}".format(dataset_params))

    train = h2o.create_frame(**dataset_params)

    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(
        train, os.path.join(results_dir, "nb_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    params['laplace'] = 0
    if random.randint(0, 1): params['laplace'] = random.uniform(0, 11)
    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="naive_bayes",
                             equality=None,
                             train=train,
                             test=None,
                             x=x,
                             y=y,
                             compile_only=True,
                             **params)
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000,15001)),1)[0]
    dataset_params['cols'] = random.sample(list(range(10,21)),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    print("Dataset parameters: {0}".format(dataset_params))

    problem = random.sample(list(range(0,3)),1)[0]
    print("Model-building exercise (0:regression, 1:binomial, 2:multinomial): {0}".format(problem))
    if   problem == 1: dataset_params['response_factors'] = 2
    elif problem == 0: dataset_params['response_factors'] = 1
    else:              dataset_params['response_factors'] = random.randint(3,100)

    train = h2o.create_frame(**dataset_params)
    if problem == 1 or problem == 2: train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"],os.path.join(results_dir,"drf_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train,os.path.join(results_dir,"drf_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0,1): params['ntrees'] = random.sample(list(range(1,21)),1)[0]
    if random.randint(0,1): params['max_depth'] = random.sample(list(range(1,11)),1)[0]
    if random.randint(0,1): params['min_rows'] = random.sample(list(range(1,11)),1)[0]
    if random.randint(0,1): params['nbins'] = random.sample(list(range(2,21)),1)[0]
    if random.randint(0,1): params['nbins_cats'] = random.sample(list(range(2,1025)),1)[0]
    if random.randint(0,1): params['mtries'] = random.sample(list(range(1,dataset_params['cols']+1)),1)[0]
    if random.randint(0,1): params['sample_rate'] = random.random()
    print("Parameter list: {0}".format(params))

    x = list(range(1,train.ncol))
    y = "response"

    pyunit_utils.javapredict(algo="random_forest", equality=None, train=train, test=None, x=x, y=y, compile_only=True,
                             **params)
def pubdev_1480():

    if not pyunit_utils.hadoop_namenode_is_accessible(): raise EnvironmentError("Not running on H2O internal network.  No access to HDFS.")
    train = h2o.import_file("hdfs://mr-0xd6/datasets/kaggle/sf.crime.train.gz")
    test = h2o.import_file("hdfs://mr-0xd6/datasets/kaggle/sf.crime.test.gz")

    model = h2o.gbm(x=train[list(range(2, 9))], y=train[1])

    predictions = model.predict(test)

    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(predictions, os.path.join(results_dir, "predictions.csv"))
def pubdev_1480():

    if not pyunit_utils.hadoop_namenode_is_accessible(): raise EnvironmentError("Not running on H2O internal network.  No access to HDFS.")
    train = h2o.import_file("hdfs://mr-0xd6/datasets/kaggle/sf.crime.train.gz")
    test = h2o.import_file("hdfs://mr-0xd6/datasets/kaggle/sf.crime.test.gz")

    model = h2o.gbm(x=train[list(range(2,9))], y=train[1])

    predictions = model.predict(test)

    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(predictions, os.path.join(results_dir,"predictions.csv"))
Exemple #33
0
def make_predictions_and_save(classifier, test_data, output_file,
                              columns_offset):
    if path.exists(output_file) or sdir_exists(output_file):
        print('already exists', output_file)
        return
    if isinstance(classifier, str):
        classifier = load_h2o_model(classifier)
    if isinstance(test_data, str):
        test_data = load_h2o_data(test_data)
    predictions = classifier.predict(test_data[:, columns_offset:])
    if '/' not in output_file:
        output_file = sdir_path(output_file)
    h2o.download_csv(predictions, output_file)
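A hypothetical invocation of the helper above; the model directory, dataset path, and offset are illustrative, and string arguments are resolved by the load_h2o_model/load_h2o_data helpers it calls:

make_predictions_and_save("saved_models/gbm_run1",   # loaded via load_h2o_model
                          "data/test_batch.csv",     # loaded via load_h2o_data
                          "gbm_run1_predictions.csv",
                          columns_offset=2)          # skip id columns before predicting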
def pubdev_1480():

    if not pyunit_utils.hadoop_namenode_is_accessible(): raise EnvironmentError("Not running on H2O internal network.  No access to HDFS.")
    train = h2o.import_file("hdfs://mr-0xd6/datasets/kaggle/sf.crime.train.gz")
    test = h2o.import_file("hdfs://mr-0xd6/datasets/kaggle/sf.crime.test.gz")

    model = H2OGradientBoostingEstimator()
    model.train(x=list(range(2, 9)), y=1, training_frame=train)

    predictions = model.predict(test)

    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(predictions, os.path.join(results_dir, "predictions.csv"))
def pubdev_1480():

    if not pyunit_utils.hadoop_namenode_is_accessible(): raise EnvironmentError("Not running on H2O internal network.  No access to HDFS.")
    train = h2o.import_file("hdfs://mr-0xd6/datasets/kaggle/sf.crime.train.gz")
    test = h2o.import_file("hdfs://mr-0xd6/datasets/kaggle/sf.crime.test.gz")

    model = H2OGradientBoostingEstimator()
    model.train(x=list(range(2, 9)), y=1, training_frame=train)

    predictions = model.predict(test)

    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(predictions, os.path.join(results_dir,"predictions.csv"))
def get_glrm_xmatrix(train, test, K=3, compare_predict=True, tol=1e-1):
    x = train.names
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]
    print("dataset transform is {0}.".format(transformN))
    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=K,
                                               transform=transformN,
                                               max_iterations=1000,
                                               seed=12345)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(
        glrmModel._model_json['output']['representation_name'])

    # assert glrmTrainFactor.nrows==train.nrows, \
    #     "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    mojoDir = save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    h2o.download_csv(test[x], os.path.join(
        mojoDir, 'in.csv'))  # save test file, h2o predict/mojo use same file

    frameID, mojoXFactor = pyunit_utils.mojo_predict(
        glrmModel, mojoDir, MOJONAME,
        glrmReconstruct=False)  # save mojo XFactor
    print("Comparing mojo x Factor and model x Factor ...")

    if transformN == "NONE" or not (
            compare_predict
    ):  # bad performance with no transformation on dataset
        pyunit_utils.check_data_rows(mojoXFactor,
                                     glrmTrainFactor,
                                     num_rows=mojoXFactor.nrow)
    else:
        pyunit_utils.compare_data_rows(mojoXFactor,
                                       glrmTrainFactor,
                                       index_list=range(
                                           2, mojoXFactor.nrows - 1),
                                       tol=tol)

    if compare_predict:  # only compare reconstructed data frames with numerical data
        pred2 = glrmModel.predict(test)   # predict on the new test data
        pred1 = glrmModel.predict(train)  # predict on the training data, using the X from A=X*Y

        predictDiff = pyunit_utils.compute_frame_diff(train, pred1)
        mojoDiff = pyunit_utils.compute_frame_diff(train, pred2)
        print(
            "absolute difference of mojo predict and original frame is {0} and model predict and original frame is {1}"
            .format(mojoDiff, predictDiff))
Exemple #37
0
def download_csv(ip, port):

    iris1 = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

    h2o.download_csv(iris1, "iris_delete.csv")

    iris2 = h2o.import_file(path=h2o.locate("iris_delete.csv"))
    os.remove("iris_delete.csv")

    rand_row = random.randint(0, iris1.nrow - 1)
    rand_col = random.randint(0, 3)
    assert abs(iris1[rand_row, rand_col] - iris2[rand_row, rand_col]) < 1e-10, "Expected elements from the datasets to " \
                                                                                "be the same, but got {0} and {1}" \
                                                                                "".format(iris1[rand_row, rand_col],
                                                                                          iris2[rand_row, rand_col])
def load_feature_label_table(save_as_csv=False, femq12=None):
    r"""
    ultimate source of data is D:\data\PreliminaryAnalysis\BogusDealers\<dta & csv files>
    see also funcs from Shekhar's files:
    load_*() in D:\shekhar_code_github\BogusFirmCatching\init_sm.py
    load_everything() in D:\shekhar_code_github\BogusFirmCatching\ml_funcs.py
    """
    if femq12 is None:
        femq12 = load_everything()
    init()
    ffemq12 = load_h2odataframe_returns(femq12)
    if save_as_csv:
        h2o.download_csv(ffemq12, r"Z:\all_returns_features_minus_q12.csv")
    # TrainData, ValidData, TestData = divide_train_test(ffemq12)
    return femq12, ffemq12
def javapredict_dynamic_data():

    dataset_params = {}
    dataset_params["rows"] = 13183
    dataset_params["cols"] = 13
    dataset_params["categorical_fraction"] = 0.4
    dataset_params["integer_fraction"] = 0.3
    dataset_params["missing_fraction"] = 0.27539154084819495
    dataset_params["has_response"] = True
    dataset_params["randomize"] = True
    dataset_params["factors"] = 819
    print("Dataset parameters: {0}".format(dataset_params))

    problem = 2
    print("Model-building exercise (0:regression, 1:binomial, 2:multinomial): {0}".format(problem))
    if problem == 1:
        dataset_params["response_factors"] = 2
    elif problem == 0:
        dataset_params["response_factors"] = 1
    else:
        dataset_params["response_factors"] = 16

    train = h2o.create_frame(**dataset_params)
    if problem == 1 or problem == 2:
        train["response"] = train["response"].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"], os.path.join(results_dir, "drf_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train, os.path.join(results_dir, "drf_dynamic_training_dataset.log"))

    params = {}
    params["nbins"] = 5
    params["min_rows"] = 7
    params["mtries"] = 4
    params["sample_rate"] = 0.7867986759373544
    params["seed"] = 1304644573760597606
    print("Parameter list: {0}".format(params))

    x = list(range(1, train.ncol))
    y = "response"

    pyunit_utils.javapredict(
        algo="random_forest", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params
    )
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    df = pyunit_utils.random_dataset("regression",
                                     seed=1234)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transformN = "STANDARDIZE"

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3,
                                               transform=transformN,
                                               max_iterations=10,
                                               seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(
        glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows==train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..",
                     "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(
        TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    # test and make sure setting the iteration number did not screw up the prediction
    predID, pred_mojo = pyunit_utils.mojo_predict(
        glrmModel, TMPDIR, MOJONAME, glrmIterNumber=100)  # save mojo predict
    pred_h2o = h2o.get_frame("GLRMLoading_" + predID)
    print("Comparing mojo x Factor and model x Factor for 100 iterations")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    # scoring with 2 iterations should be faster than scoring with 8000 iterations
    starttime = time.time()
    runMojoPredictOnly(TMPDIR, MOJONAME, glrmIterNumber=8000)  # mojo predict with 8000 iterations
    time8000 = time.time() - starttime
    starttime = time.time()
    runMojoPredictOnly(TMPDIR, MOJONAME, glrmIterNumber=2)  # mojo predict with 2 iterations
    time2 = time.time() - starttime
    print("Time taken for 2 iterations: {0}s.  Time taken for 8000 iterations: {1}s."
          .format(time2, time8000))
def runComparisonTests(autoEncoder, problemType):
    params = set_params(autoEncoder)   # set deeplearning model parameters
    df = random_dataset(problemType)       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    deeplearningModel = build_save_model(params, x, train) # build and save mojo model
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearningModel, TMPDIR, MOJONAME)  # load model and perform predict
    pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
    h2o.save_model(deeplearningModel, path=TMPDIR, force=True)  # save model for debugging
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o, pred_mojo, prob=1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo, pred_pojo, prob=1, tol=1e-10)
def drf_leaf_node_assignment_mojo_test():
    problems = ['binomial', 'multinomial', 'regression']
    PROBLEM = problems[randint(0, (len(problems) - 1))]
    TESTROWS = 2000
    df = pyunit_utils.random_dataset(PROBLEM, verbose=False, NTESTROWS=TESTROWS)
    train = df[TESTROWS:, :]
    test = df[:TESTROWS, :]
    x = list(set(df.names) - {"response"})
    params = {'ntrees': 50, 'max_depth': 4}
    TMPDIR = tempfile.mkdtemp()
    my_gbm = pyunit_utils.build_save_model_generic(params, x, train, "response", "DRF", TMPDIR)
    MOJONAME = pyunit_utils.getMojoName(my_gbm._id)

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(my_gbm, TMPDIR, MOJONAME, get_leaf_node_assignment=True)  # load model and perform predict
    pyunit_utils.compare_string_frames_local(pred_h2o, pred_mojo, 0.5)
def pubdev_1431(ip, port):

    running_inside_h2o = tests.is_running_internal_to_h2o()

    if running_inside_h2o:
        hdfs_name_node = tests.get_h2o_internal_hdfs_name_node()
        airlines_billion_file = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file)
        airlines_billion = h2o.import_file(url)
        airlines_billion[30] = airlines_billion[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion[0:30], y=airlines_billion[30], ntrees=1, distribution="bernoulli", max_depth=1)
        predictions = gbm.predict(airlines_billion)
        csv = os.path.join(os.getcwd(),"delete.csv")
        h2o.download_csv(predictions,csv)
        os.remove(csv)
    else:
        print "Not running on H2O internal network.  No access to HDFS."
def gbm_leaf_node_assignment_mojo_test():
    problems = ['binomial', 'multinomial', 'regression']
    PROBLEM = problems[randint(0, (len(problems) - 1))]
    TESTROWS = 2000
    df = pyunit_utils.random_dataset(PROBLEM, verbose=False, NTESTROWS=TESTROWS)
    train = df[TESTROWS:, :]
    test = df[:TESTROWS, :]
    x = list(set(df.names) - {"response"})
    params = {'ntrees': 50, 'learn_rate': 0.1, 'max_depth': 4}

    my_gbm = pyunit_utils.build_save_model_GBM(params, x, train, "response")
    MOJONAME = pyunit_utils.getMojoName(my_gbm._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(my_gbm, TMPDIR, MOJONAME, get_leaf_node_assignment=True)  # load model and perform predict
    pyunit_utils.compare_string_frames_local(pred_h2o, pred_mojo, 0.5)
def download_csv(ip, port):

    iris1 = h2o.import_file(path=h2o.locate("smalldata/iris/iris.csv"))

    h2o.download_csv(iris1, "iris_delete.csv")

    iris2 = h2o.import_file(path=h2o.locate("iris_delete.csv"))
    os.remove("iris_delete.csv")

    rand_row = random.randint(0,iris1.nrow-1)
    rand_col = random.randint(0,3)
    assert abs(iris1[rand_row, rand_col] - iris2[rand_row, rand_col]) < 1e-10, "Expected elements from the datasets to " \
                                                                                "be the same, but got {0} and {1}" \
                                                                                "".format(iris1[rand_row, rand_col],
                                                                                          iris2[rand_row, rand_col])
def pubdev_1431():

    hadoop_namenode_is_accessible = tests.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = tests.hadoop_namenode()
        airlines_billion_file = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file)
        airlines_billion = h2o.import_file(url)
        airlines_billion[30] = airlines_billion[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion[0:30], y=airlines_billion[30], ntrees=1, distribution="bernoulli", max_depth=1)
        predictions = gbm.predict(airlines_billion)
        csv = os.path.join(os.getcwd(),"delete.csv")
        h2o.download_csv(predictions,csv)
        os.remove(csv)
    else:
        raise EnvironmentError("Not running on H2O internal network.  No access to HDFS.")
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(range(5000,15001),1)[0]
    dataset_params['cols'] = random.sample(range(10,21),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    print "Dataset parameters: {0}".format(dataset_params)

    train = h2o.create_frame(**dataset_params)

    print "Training dataset:"
    print train

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train,os.path.join(results_dir,"kmeans_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    params['k'] = random.sample(range(1,10),1)[0]
    if random.randint(0,1): params['max_iterations'] = random.sample(range(1,1000),1)[0]
    if random.randint(0,1): params['standardize'] = random.sample([True, False],1)[0]
    if random.randint(0,1): params['seed'] = random.sample(range(1,1000),1)[0]
    if random.randint(0,1): params['init'] = random.sample(['Random','PlusPlus','Furthest'],1)[0]
    print "Parameter list: {0}".format(params)

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="kmeans", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(100,200)),1)[0]
    dataset_params['cols'] = random.sample(list(range(10,21)),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.01)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,50)
    print("Dataset parameters: {0}".format(dataset_params))

    train = h2o.create_frame(**dataset_params)

    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train,os.path.join(results_dir,"pca_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0,1): params['max_iterations'] = random.sample(list(range(1,1000)),1)[0]
    if random.randint(0,1): params['transform'] = random.sample(["NONE","STANDARDIZE","NORMALIZE","DEMEAN","DESCALE"],1)[0]
    params['k'] = random.sample(list(range(1,min(train.ncol,train.nrow))),1)[0]

    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="pca", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000,15001)),1)[0]
    dataset_params['cols'] = random.sample(list(range(10,21)),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    dataset_params['response_factors'] = random.randint(3,100)
    print("Dataset parameters: {0}".format(dataset_params))

    train = h2o.create_frame(**dataset_params)

    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train,os.path.join(results_dir,"nb_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    params['laplace'] = 0
    if random.randint(0,1): params['laplace'] = random.uniform(0,11)
    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="naive_bayes", equality=None, train=train, test=None, x=x, y=y, compile_only=True, **params)
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200    # number of test dataset rows
    df = pyunit_utils.random_dataset("regression", seed=1234)       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types)-1)]

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows==train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel) # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True) # save mojo predict

    h2o.save_model(glrmModel, TMPDIR)   # save GLRM model
    glrmModel2 = h2o.load_model(os.path.join(TMPDIR,MOJONAME))
    predict_model = glrmModel2.predict(test)
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
            predict_model[col] = predict_model[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    print("Comparing mojo predict and h2o predict from saved model...")
    pyunit_utils.compare_frames_local(pred_mojo, predict_model, 1, tol=1e-10)
    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False) # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_"+frameID)   # store the x Factor for new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
def glm_multinomial_mojo_pojo():
    PROBLEM="multinomial"
    NTESTROWS=200
    params = set_params()   # set deeplearning model parameters
    df = pyunit_utils.random_dataset(PROBLEM)       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    glmMultinomialModel = pyunit_utils.build_save_model_GLM(params, x, train, "response") # build and save mojo model

    MOJONAME = pyunit_utils.getMojoName(glmMultinomialModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glmMultinomialModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmMultinomialModel, TMPDIR, MOJONAME)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def javapredict_dynamic_data():

    # Generate random dataset
    dataset_params = {}
    dataset_params['rows'] = random.sample(list(range(5000,15001)),1)[0]
    dataset_params['cols'] = random.sample(list(range(10,21)),1)[0]
    dataset_params['categorical_fraction'] = round(random.random(),1)
    left_over = (1 - dataset_params['categorical_fraction'])
    dataset_params['integer_fraction'] = round(left_over - round(random.uniform(0,left_over),1),1)
    if dataset_params['integer_fraction'] + dataset_params['categorical_fraction'] == 1:
        if dataset_params['integer_fraction'] > dataset_params['categorical_fraction']:
            dataset_params['integer_fraction'] = dataset_params['integer_fraction'] - 0.1
        else:
            dataset_params['categorical_fraction'] = dataset_params['categorical_fraction'] - 0.1
    dataset_params['missing_fraction'] = random.uniform(0,0.5)
    dataset_params['has_response'] = True
    dataset_params['randomize'] = True
    dataset_params['factors'] = random.randint(2,2000)
    print("Dataset parameters: {0}".format(dataset_params))

    append_response = False
    distribution = random.sample(['bernoulli','multinomial','gaussian','poisson','gamma'], 1)[0]
    if   distribution == 'bernoulli': dataset_params['response_factors'] = 2
    elif distribution == 'gaussian':  dataset_params['response_factors'] = 1
    elif distribution == 'multinomial': dataset_params['response_factors'] = random.randint(3,100)
    else:
        dataset_params['has_response'] = False
        response = h2o.H2OFrame([[random.randint(1,1000)] for r in range(0,dataset_params['rows'])])
        append_response = True
    print("Distribution: {0}".format(distribution))

    train = h2o.create_frame(**dataset_params)
    if append_response:
        train = response.cbind(train)
        train.set_name(0,"response")
    if distribution == 'bernoulli' or distribution == 'multinomial': train['response'] = train['response'].asfactor()
    results_dir = pyunit_utils.locate("results")
    h2o.download_csv(train["response"],os.path.join(results_dir,"dl_dynamic_preimputed_response.log"))
    train.impute("response", method="mode")
    print("Training dataset:")
    print(train)

    # Save dataset to results directory
    h2o.download_csv(train,os.path.join(results_dir,"dl_dynamic_training_dataset.log"))

    # Generate random parameters
    params = {}
    if random.randint(0,1): params['activation'] = random.sample(["Rectifier", "Tanh", "TanhWithDropout",
                                                                  "RectifierWithDropout", "MaxoutWithDropout"],1)[0]
    if random.randint(0,1): params['epochs'] = random.sample(list(range(1,10)),1)[0]
    if random.randint(0,1):
        h = random.randint(10,21)
        params['hidden'] = [h for x in range(random.randint(2,3))]
    params['distribution'] = distribution
    params['l1'] = random.random()
    print("Parameter list: {0}".format(params))

    x = train.names
    x.remove("response")
    y = "response"

    pyunit_utils.javapredict(algo="deeplearning", equality=None, train=train, test=None, x=x, y=y, compile_only=True,
                             **params)
def javapredict(algo, equality, train, test, x, y, **kwargs):
    print("Creating model in H2O")
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    elif algo == "deeplearning":
        model = h2o.deeplearning(x=train[x], y=train[y], **kwargs)
    elif algo == "glm":
        model = h2o.glm(x=train[x], y=train[y], **kwargs)
    else:
        raise ValueError("algo {0} is not supported".format(algo))
    print(model)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","results",model._id))
    os.mkdir(tmpdir)
    h2o.download_pojo(model,path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir,"h2o-genmodel.jar")
    assert os.path.exists(h2o_genmodel_jar), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir,model._id+".java")
    assert os.path.exists(java_file), "Expected file {0} to exist, but it does not.".format(java_file)
    print("java code saved in {0}".format(java_file))

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir,"out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    assert os.path.exists(out_h2o_csv), "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
    print("H2O Predictions saved in {0}".format(out_h2o_csv))

    print("Setting up for Java POJO")
    in_csv = os.path.join(tmpdir,"in.csv")
    h2o.download_csv(test[x], in_csv)

    # hack: the PredictCsv driver can't handle quoted strings, so remove them
    f = open(in_csv, 'r+')
    csv = f.read()
    csv = re.sub('\"', '', csv)
    f.seek(0)
    f.write(csv)
    f.truncate()
    f.close()
    assert os.path.exists(in_csv), "Expected file {0} to exist, but it does not.".format(in_csv)
    print("Input CSV to PredictCsv saved in {0}".format(in_csv))

    print("Compiling Java Pojo")
    javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx4g", "-J-XX:MaxPermSize=256m", java_file]
    subprocess.check_call(javac_cmd)

    print("Running PredictCsv Java Program")
    out_pojo_csv = os.path.join(tmpdir,"out_pojo.csv")
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = ["java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx4g", "-XX:MaxPermSize=256m",
                "-XX:ReservedCodeCacheSize=256m", "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id,
                "--input", in_csv, "--output", out_pojo_csv]
    p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
    o, e = p.communicate()
    print("Java output: {0}".format(o))
    assert os.path.exists(out_pojo_csv), "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
    predictions2 = h2o.import_file(path=out_pojo_csv)
    print("Pojo predictions saved in {0}".format(out_pojo_csv))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(hr, pr)
    assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(hc, pc)

    # Value
    for r in range(hr):
        hp = predictions[r,0]
        if equality == "numeric":
            pp = float.fromhex(predictions2[r,0])
            assert abs(hp - pp) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r,hp, pp)
        elif equality == "class":
            pp = predictions2[r,0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r,hp, pp)
        else:
            raise ValueError("equality type {0} is not supported".format(equality))
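The quoted-string hack above slurps the whole CSV into memory before rewriting it. A streaming variant with the same effect (delete every double-quote character), sketched under the assumption that a line-by-line rewrite is acceptable for these test files:

import os

def strip_quotes(in_csv):
    # rewrite in_csv with all double-quote characters removed,
    # one line at a time instead of reading the whole file at once
    tmp_path = in_csv + ".tmp"
    with open(in_csv) as src, open(tmp_path, "w") as dst:
        for line in src:
            dst.write(line.replace('"', ''))
    os.replace(tmp_path, in_csv)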
Exemple #54
0
def gen_data():
    floatA = []
    intA = []
    sizeMat = range(0,30)   # used to generate data with values 0, +/- 2^0 to +/- 2^64
    lowBoundF = -100000
    upperBoundF = -1*lowBoundF
    upperBoundL = pow(2,35)
    lowBoundL = upperBoundL-100000
    numZeros = 0
    numNans = 0   # generate Nans
    numInfs = 500
    numRep = 2    # number of times to repeat array
    csvFile = "/Users/wendycwong/temp/TopBottomNRep4.csv"
    fMult = 1.1

    fintA = []
    ffloatA = []
    for ind in range(0,1000):
        floatA = []
        intA = []
        genRandomData(intA,floatA, sizeMat)
        fintA.extend(intA)
        ffloatA.extend(floatA)


    shuffle(fintA)
    shuffle(ffloatA)
    bottom20FrameL = h2o.H2OFrame(python_obj=list(zip(fintA)))
    bottom20FrameF = h2o.H2OFrame(python_obj=list(zip(ffloatA)))
    h2o.download_csv(bottom20FrameL.cbind(bottom20FrameF), "/Users/wendycwong/temp/smallIntFloats.csv")

    genStaticData(intA, floatA, upperBoundL, lowBoundF, upperBoundF, fMult)
    # save the correct sequence before shuffling for comparison purpose
    tempL = intA[0:int(round(len(intA)*0.2))]   # comes in decreasing value
    tempF = floatA[0:int(round(len(floatA)*0.2))]   # comes in decreasing value
    # save the correct sequence before shuffling for comparison purpose
    bottom20FrameL = h2o.H2OFrame(python_obj=list(zip(tempL)))
    bottom20FrameF = h2o.H2OFrame(python_obj=list(zip(tempF)))
    h2o.download_csv(bottom20FrameL.cbind(bottom20FrameF), "/Users/wendycwong/temp/Bottom20Per.csv")

    tempL = intA[int(round(len(intA)*0.8)):len(intA)]
    tempL.sort()
    tempF = floatA[int(round(len(floatA)*0.8)):len(floatA)]
    tempF.sort()
    top20FrameL = h2o.H2OFrame(python_obj=list(zip(tempL)))
    top20FrameF = h2o.H2OFrame(python_obj=list(zip(tempF)))
    h2o.download_csv(top20FrameL.cbind(top20FrameF), "/Users/wendycwong/temp/Top20Per.csv")


    # repeat the columns a few times to seriously test the algo with duplicated data.

    for val in range(0, numRep):
        intA.extend(intA)
        floatA.extend(floatA)

    shuffle(intA)   # randomly shuffle the indices
    shuffle(floatA) #

    intFrame = h2o.H2OFrame(python_obj=list(zip(intA)))
    floatFrame = h2o.H2OFrame(python_obj=list(zip(floatA)))
    h2o.download_csv(intFrame.cbind(floatFrame), csvFile)
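For reference, the zip wrapping above turns a flat Python list into a list of 1-tuples, which H2OFrame reads as a single-column frame; a quick sanity check, assuming a running H2O cluster:

import h2o

h2o.init()
frame = h2o.H2OFrame(python_obj=list(zip([1, 2, 3])))  # three 1-tuples -> one column
assert frame.dim == [3, 1]  # [rows, cols]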
def download(_self, filename):
    h2o_frame = utils.Utils.dataframe_2_h2oframe(_self)
    h2o.download_csv(h2o_frame, filename)
Exemple #56
0
def test_mojo_model(target_dir):
    """
    Test the correctness of the "MOJO" model format.

    This test will create a random dataset, split into training/testing part, train a DRF model on it,
    download the model's MOJO, score the model remotely and fetch the predictions, score the model locally by
    running the genmodel jar, and finally compare the prediction results.
    """
    genmodel_jar = os.path.abspath("../../../h2o-genmodel/build/libs/h2o-genmodel-all.jar")
    assert os.path.exists(genmodel_jar), "Cannot find " + genmodel_jar

    report = []
    for estimator, estimator_name in [(H2ODeepWaterEstimator, "DeepWater"),
                                      (H2ORandomForestEstimator, "DRF"),
                                      (H2OGradientBoostingEstimator, "GBM")]:
        if (estimator == H2ODeepWaterEstimator and not H2ODeepWaterEstimator.available()): continue
        print(colorama.Fore.LIGHTYELLOW_EX + "\n#================================================")
        print("#  Estimator: " + estimator.__name__)
        print("#================================================\n" + colorama.Fore.RESET)

        for problem in ["binomial", "multinomial", "regression"]:
            print("========================")
            print("%s problem" % problem.capitalize())
            print("========================")
            if estimator == H2ODeepWaterEstimator and problem == "regression":
                print("Skipping %s" % problem.capitalize)
                continue
            df = random_dataset(problem, verbose=False)
            print("Created dataset with %d rows x %d columns" % (df.nrow, df.ncol))
            train = df[NTESTROWS:, :]
            test0 = df[0, :]
            test1 = df[:NTESTROWS, :]
            test2 = test1.rbind(test1)

            time0 = time.time()
            print("\n\nTraining %s model..." % estimator.__name__)
            if estimator == H2ODeepWaterEstimator:
                model = estimator(epochs=EPOCHS)  # , categorical_encoding="enum")
            else:
                model = estimator(ntrees=NTREES, max_depth=DEPTH)
            model.train(training_frame=train)
            print(model.summary())
            print("    Time taken = %.3fs" % (time.time() - time0))

            print("\nDownloading MOJO...")
            time0 = time.time()
            mojo_file = model.download_mojo(target_dir)
            print("    => %s  (%d bytes)" % (mojo_file, os.stat(mojo_file).st_size))
            assert os.path.exists(mojo_file)
            print("    Time taken = %.3fs" % (time.time() - time0))

            if estimator != H2ODeepWaterEstimator:
                print("\nDownloading POJO...")
                time0 = time.time()
                pojo_file = model.download_pojo(target_dir)
                pojo_size = os.stat(pojo_file).st_size
                pojo_name = os.path.splitext(os.path.basename(pojo_file))[0]
                print("    => %s  (%d bytes)" % (pojo_file, pojo_size))
                print("    Time taken = %.3fs" % (time.time() - time0))

            print("\nDownloading the test datasets for local use: ")
            time0 = time.time()
            test0_file = os.path.join(target_dir, "test0_%s.csv" % test0.frame_id)
            test1_file = os.path.join(target_dir, "test1_%s.csv" % test1.frame_id)
            test2_file = os.path.join(target_dir, "test2_%s.csv" % test2.frame_id)
            print("    => " + test0_file)
            print("    => " + test1_file)
            print("    => " + test2_file)
            h2o.download_csv(test0, test0_file)
            h2o.download_csv(test1, test1_file)
            h2o.download_csv(test2, test2_file)
            print("    Time taken = %.3fs" % (time.time() - time0))

            print("\nScoring the model remotely and downloading to files...")
            times = []
            h2o_pred_file0 = os.path.join(target_dir, "predR_%s.csv" % test0.frame_id)
            h2o_pred_file1 = os.path.join(target_dir, "predR_%s.csv" % test1.frame_id)
            h2o_pred_file2 = os.path.join(target_dir, "predR_%s.csv" % test2.frame_id)
            for testframe, outfile in [(test0, h2o_pred_file0), (test1, h2o_pred_file1), (test2, h2o_pred_file2)]:
                predictions = model.predict(testframe)
                h2o.download_csv(predictions, outfile)
                print("    => " + outfile)
                times.append(time.time())
            print("    Time taken = %.3fs   (1st run: %.3f, 2nd run: %.3f)" %
                  (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
            report.append((estimator_name, problem, "Server", times[1] - times[0], times[2] - times[1]))

            print("\nScoring the model locally and saving to files... ")
            times = []
            local_pred_file0 = os.path.join(target_dir, "predL_%s.csv" % test0.frame_id)
            local_pred_file1 = os.path.join(target_dir, "predL_%s.csv" % test1.frame_id)
            local_pred_file2 = os.path.join(target_dir, "predL_%s.csv" % test2.frame_id)
            for inpfile, outfile in [(test0_file, local_pred_file0), (test1_file, local_pred_file1),
                                     (test2_file, local_pred_file2)]:
                load_csv(inpfile)
                java_cmd = ["java", "-cp", genmodel_jar,
                            "-ea", "-Xmx12g", "-XX:ReservedCodeCacheSize=256m",
                            "hex.genmodel.tools.PredictCsv",
                            "--input", inpfile, "--output", outfile, "--mojo", mojo_file, "--decimal"]
                print("    %r" % java_cmd)
                ret = subprocess.call(java_cmd)
                assert ret == 0, "GenModel finished with return code %d" % ret
                print("    => " + local_pred_file1)
                times.append(time.time())
            print("    Time taken = %.3fs   (1st run: %.3f, 2nd run: %.3f)" %
                  (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
            report.append((estimator_name, problem, "Mojo", times[1] - times[0], times[2] - times[1]))

            if estimator != H2ODeepWaterEstimator and pojo_size <= 1000 << 20:  # 1000 Mb
                time0 = time.time()
                print("\nCompiling Java Pojo")
                javac_cmd = ["javac", "-cp", genmodel_jar, "-J-Xmx12g", pojo_file]
                subprocess.check_call(javac_cmd)
                print("    Time taken = %.3fs" % (time.time() - time0))

                pojo_pred_file0 = os.path.join(target_dir, "predP_%s.csv" % test0.frame_id)
                pojo_pred_file1 = os.path.join(target_dir, "predP_%s.csv" % test1.frame_id)
                pojo_pred_file2 = os.path.join(target_dir, "predP_%s.csv" % test2.frame_id)
                print("\nScoring POJO and saving to file...")
                times = []
                cp_sep = ";" if sys.platform == "win32" else ":"
                for inpfile, outfile in [(test0_file, pojo_pred_file0), (test1_file, pojo_pred_file1),
                                         (test2_file, pojo_pred_file2)]:
                    load_csv(inpfile)
                    java_cmd = ["java", "-cp", cp_sep.join([genmodel_jar, target_dir]),
                                "-ea", "-Xmx12g", "-XX:ReservedCodeCacheSize=256m", "-XX:MaxPermSize=256m",
                                "hex.genmodel.tools.PredictCsv",
                                "--pojo", pojo_name, "--input", inpfile, "--output", outfile, "--decimal"]
                    print("    %r" % java_cmd)
                    ret = subprocess.call(java_cmd)
                    assert ret == 0, "GenModel finished with return code %d" % ret
                    times.append(time.time())
                print("    Time taken = %.3fs   (1st run: %.3f, 2nd run: %.3f)" %
                      (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
                report.append((estimator_name, problem, "POJO", times[1] - times[0], times[2] - times[1]))
            else:
                pojo_pred_file1 = None


            print("\nChecking whether the predictions coincide...")
            time0 = time.time()
            local_pred = load_csv(local_pred_file1)
            server_pred = load_csv(h2o_pred_file1)
            pojo_pred = load_csv(pojo_pred_file1) if pojo_pred_file1 else local_pred
            assert len(local_pred) == len(server_pred) == len(pojo_pred) == test1.nrow, \
                "Number of rows in prediction files do not match: %d vs %d vs %d vs %d" % \
                (len(local_pred), len(server_pred), len(pojo_pred), test1.nrow)

            for i in range(test1.nrow):
                lpred = local_pred[i]
                rpred = server_pred[i]
                ppred = pojo_pred[i]
                assert type(lpred) == type(rpred) == type(ppred), \
                    "Types of predictions do not match: %r / %r / %r" % (lpred, rpred, ppred)
                if isinstance(lpred, float):
                    same = abs(lpred - rpred) + abs(lpred - ppred) <= 1e-8 * (abs(lpred) + abs(rpred) + abs(ppred))
                else:
                    same = lpred == rpred == ppred
                assert same, \
                    "Predictions are different for row %d: mojo=%r, pojo=%r, server=%r" % (i + 1, lpred, ppred, rpred)
            print("    Time taken = %.3fs" % (time.time() - time0))
            print(colorama.Fore.LIGHTGREEN_EX + "\nPredictions match!\n" + colorama.Fore.RESET)

    print(colorama.Fore.LIGHTYELLOW_EX + "\n\n#================================================")
    print("#  Timing report")
    print("#================================================\n" + colorama.Fore.RESET)
    print(tabulate.tabulate(report,
          headers=["Model", "Problem type", "Scorer", "%d rows" % NTESTROWS, "%d rows" % (2 * NTESTROWS)],
          floatfmt=".3f"), end="\n\n\n")
Exemple #57
0
def javamunge(assembly, pojoname, test, compile_only=False):
    """
    Here's how to use:
      assembly is an already fit H2OAssembly;
      The test set should be used to compare the output here and the output of the POJO.
    """
    print("Downloading munging POJO code from H2O")
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","results", pojoname))
    os.mkdir(tmpdir)
    assembly.to_pojo(pojoname, path=tmpdir, get_jar=True)
    h2o_genmodel_jar = os.path.join(tmpdir,"h2o-genmodel.jar")
    assert os.path.exists(h2o_genmodel_jar), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir,pojoname+".java")
    assert os.path.exists(java_file), "Expected file {0} to exist, but it does not.".format(java_file)
    print("java code saved in {0}".format(java_file))

    print("Compiling Java Pojo")
    javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx12g", "-J-XX:MaxPermSize=256m", java_file]
    subprocess.check_call(javac_cmd)

    if not compile_only:

        print("Setting up for Java POJO")
        in_csv = os.path.join(tmpdir,"in.csv")
        h2o.download_csv(test, in_csv)
        assert os.path.exists(in_csv), "Expected file {0} to exist, but it does not.".format(in_csv)
        print("Input CSV to mungedCSV saved in {0}".format(in_csv))

        print("Predicting in H2O")
        munged = assembly.fit(test)
        munged.head()
        out_h2o_csv = os.path.join(tmpdir,"out_h2o.csv")
        h2o.download_csv(munged, out_h2o_csv)
        assert os.path.exists(out_h2o_csv), "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
        print("Munged frame saved in {0}".format(out_h2o_csv))

        print("Running PredictCsv Java Program")
        out_pojo_csv = os.path.join(tmpdir,"out_pojo.csv")
        cp_sep = ";" if sys.platform == "win32" else ":"
        java_cmd = ["java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx12g", "-XX:MaxPermSize=2g",
                    "-XX:ReservedCodeCacheSize=256m", "hex.genmodel.tools.MungeCsv", "--header", "--munger", pojoname,
                    "--input", in_csv, "--output", out_pojo_csv]
        print("JAVA COMMAND: " + " ".join(java_cmd))
        p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
        o, e = p.communicate()
        print("Java output: {0}".format(o))
        assert os.path.exists(out_pojo_csv), "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
        munged2 = h2o.upload_file(path=out_pojo_csv)
        print("Pojo predictions saved in {0}".format(out_pojo_csv))

        print("Comparing predictions between H2O and Java POJO")
        # Dimensions
        hr, hc = munged.dim
        pr, pc = munged2.dim
        assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(hr, pr)
        assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(hc, pc)

        # Value
        import math
        munged.show()
        munged2.show()
        for r in range(hr):
            for c in range(hc):
                hp = munged[r,c]
                pp = munged2[r,c]
                if isinstance(hp, float):
                    assert isinstance(pp, float)
                    assert (math.fabs(hp-pp) < 1e-8) or (math.isnan(hp) and math.isnan(pp)), "Expected munged rows to be the same for row {0}, but got {1}, and {2}".format(r, hp, pp)
                else:
                    assert hp == pp, "Expected munged rows to be the same for row {0}, but got {1}, and {2}".format(r, hp, pp)
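A hedged usage sketch for javamunge, assuming an H2OAssembly built from standard h2o.transforms steps; the dataset and column names are illustrative:

from h2o.assembly import H2OAssembly
from h2o.frame import H2OFrame
from h2o.transforms.preprocessing import H2OColSelect, H2OColOp

iris = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
assembly = H2OAssembly(steps=[
    ("col_select", H2OColSelect(["sepal_len", "petal_len", "class"])),
    ("cos_sep_len", H2OColOp(op=H2OFrame.cos, col="sepal_len", inplace=True))])
assembly.fit(iris)  # the assembly must be fit before its POJO is generated
javamunge(assembly, pojoname="MungedIris", test=iris, compile_only=True)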
Exemple #58
0
def javapredict(algo, equality, train, test, x, y, compile_only=False, **kwargs):
    print("Creating model in H2O")
    if algo == "gbm": model = H2OGradientBoostingEstimator(**kwargs)
    elif algo == "random_forest": model = H2ORandomForestEstimator(**kwargs)
    elif algo == "deeplearning": model = H2ODeepLearningEstimator(**kwargs)
    elif algo == "glm": model = H2OGeneralizedLinearEstimator(**kwargs)
    elif algo == "naive_bayes": model = H2ONaiveBayesEstimator(**kwargs)
    elif algo == "kmeans": model = H2OKMeansEstimator(**kwargs)
    elif algo == "pca": model = H2OPCA(**kwargs)
    else: raise ValueError("algo {0} is not supported".format(algo))
    if algo == "kmeans" or algo == "pca": model.train(x=x, training_frame=train)
    else: model.train(x=x, y=y, training_frame=train)
    print(model)

    # HACK: munge model._id so that it conforms to Java class name. For example, change K-means to K_means.
    # TODO: clients should extract Java class name from header.
    regex = re.compile("[+\\-* !@#$%^&()={}\\[\\]|;:'\"<>,.?/]")
    pojoname = regex.sub("_",model._id)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","results",pojoname))
    os.mkdir(tmpdir)
    h2o.download_pojo(model,path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir,"h2o-genmodel.jar")
    assert os.path.exists(h2o_genmodel_jar), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir,pojoname+".java")
    assert os.path.exists(java_file), "Expected file {0} to exist, but it does not.".format(java_file)
    print("java code saved in {0}".format(java_file))

    print("Compiling Java Pojo")
    javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx12g", "-J-XX:MaxPermSize=256m", java_file]
    subprocess.check_call(javac_cmd)

    if not compile_only:
        print("Predicting in H2O")
        predictions = model.predict(test)
        predictions.summary()
        predictions.head()
        out_h2o_csv = os.path.join(tmpdir,"out_h2o.csv")
        h2o.download_csv(predictions, out_h2o_csv)
        assert os.path.exists(out_h2o_csv), "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
        print("H2O Predictions saved in {0}".format(out_h2o_csv))

        print("Setting up for Java POJO")
        in_csv = os.path.join(tmpdir,"in.csv")
        h2o.download_csv(test[x], in_csv)

        # hack: the PredictCsv driver can't handle quoted strings, so remove them
        f = open(in_csv, 'r+')
        csv = f.read()
        csv = re.sub('\"', '', csv)
        f.seek(0)
        f.write(csv)
        f.truncate()
        f.close()
        assert os.path.exists(in_csv), "Expected file {0} to exist, but it does not.".format(in_csv)
        print("Input CSV to PredictCsv saved in {0}".format(in_csv))

        print("Running PredictCsv Java Program")
        out_pojo_csv = os.path.join(tmpdir,"out_pojo.csv")
        cp_sep = ";" if sys.platform == "win32" else ":"
        java_cmd = ["java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx12g", "-XX:MaxPermSize=2g",
                    "-XX:ReservedCodeCacheSize=256m", "hex.genmodel.tools.PredictCsv", "--header", "--model", pojoname,
                    "--input", in_csv, "--output", out_pojo_csv]
        p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
        o, e = p.communicate()
        print("Java output: {0}".format(o))
        assert os.path.exists(out_pojo_csv), "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
        predictions2 = h2o.upload_file(path=out_pojo_csv)
        print("Pojo predictions saved in {0}".format(out_pojo_csv))

        print("Comparing predictions between H2O and Java POJO")
        # Dimensions
        hr, hc = predictions.dim
        pr, pc = predictions2.dim
        assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(hr, pr)
        assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(hc, pc)

        # Value
        for r in range(hr):
            hp = predictions[r,0]
            if equality == "numeric":
                pp = float.fromhex(predictions2[r,0])
                assert abs(hp - pp) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r,hp, pp)
            elif equality == "class":
                pp = predictions2[r,0]
                assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r,hp, pp)
            else:
                raise ValueError("equality type {0} is not supported".format(equality))
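And a hedged usage sketch for this javapredict variant; the dataset and hyperparameters are illustrative:

train = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
javapredict(algo="gbm", equality="class", train=train, test=train,
            x=["sepal_len", "sepal_wid", "petal_len", "petal_wid"], y="class",
            ntrees=10, max_depth=3)  # extra kwargs are passed to the estimator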