def glrm_mojo():
    """Compare GLRM MOJO output with the H2O model on a random regression frame.

    Builds a GLRM (k=3, max_iterations=10) with a randomly chosen transform,
    saves it as a MOJO and runs two comparisons on the first 200 rows of the
    generated frame: the two frames returned by ``mojo_predict`` with
    ``glrmReconstruct=True``, and the frame returned with
    ``glrmReconstruct=False`` against the ``GLRMLoading_<frameID>`` frame.
    """
    h2o.remove_all()
    n_test_rows = 200  # number of test dataset rows
    dataset = pyunit_utils.random_dataset("regression")  # generate random dataset
    train = dataset[n_test_rows:, :]
    test = dataset[:n_test_rows, :]
    x = dataset.names

    # choose the dataset transform at random
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    last_index = len(transform_types) - 1
    transform = transform_types[randint(0, last_index)]

    # build a GLRM model with the random dataset generated above
    model = H2OGeneralizedLowRankEstimator(k=3, transform=transform, max_iterations=10)
    model.train(x=x, training_frame=train)
    train_x_factor = h2o.get_frame(model._model_json['output']['representation_name'])

    assert train_x_factor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(train_x_factor.nrows, train.nrows)
    save_GLRM_mojo(model)  # save mojo model

    mojo_name = pyunit_utils.getMojoName(model._id)
    # NOTE(review): '__file__' is a string literal, so realpath() resolves it
    # against the current working directory; presumably this matches where
    # save_GLRM_mojo wrote the MOJO -- confirm before changing.
    results_dir = os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name)
    tmp_dir = os.path.normpath(results_dir)

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(tmp_dir, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, tmp_dir, mojo_name, glrmReconstruct=True)
    # factor columns of the H2O frame are converted to numeric before comparing
    for col_index in range(pred_h2o.ncols):
        if pred_h2o[col_index].isfactor():
            pred_h2o[col_index] = pred_h2o[col_index].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    # second pass: ask for the X factor instead of the reconstructed frame
    frame_id, mojo_x_factor = pyunit_utils.mojo_predict(model, tmp_dir, mojo_name, glrmReconstruct=False)
    test_x_factor = h2o.get_frame("GLRMLoading_"+frame_id)  # x factor for the test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(test_x_factor, mojo_x_factor, 1, tol=1e-10)
Ejemplo n.º 2
0
def glrm_mojo():
    """Check GLRM MOJO scoring for several ``glrmIterNumber`` settings.

    Trains a seeded GLRM (k=3, STANDARDIZE, random init) on the pubdev_5858
    training file, saves it as a MOJO, and compares the MOJO output for 100
    iterations against the ``GLRMLoading_<predID>`` frame, and for 1 and 10
    iterations against the imported reference CSV files.
    """
    h2o.remove_all()
    train_path = pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_mojo_train.csv")
    train = h2o.import_file(train_path)
    test_path = pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_mojo_test.csv")
    test = h2o.import_file(test_path)
    # reference outputs for 10 and 1 iterations
    ref_10iter_path = pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_predict_10iter.csv")
    expected_10iter = h2o.import_file(ref_10iter_path)
    ref_1iter_path = pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_predict_1iter.csv")
    expected_1iter = h2o.import_file(ref_1iter_path)

    x = train.names
    transform = "STANDARDIZE"

    # build a GLRM model on the imported training frame
    model = H2OGeneralizedLowRankEstimator(k=3, transform=transform, max_iterations=10, seed=1234,
                                           init="random")
    model.train(x=x, training_frame=train)
    train_x_factor = h2o.get_frame(model._model_json['output']['representation_name'])

    assert train_x_factor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(train_x_factor.nrows, train.nrows)
    save_GLRM_mojo(model)  # save mojo model
    mojo_name = pyunit_utils.getMojoName(model._id)
    # NOTE(review): '__file__' is a string literal, so this path is resolved
    # against the current working directory; presumably intentional -- confirm.
    tmp_dir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name))
    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(tmp_dir, 'in.csv'))

    # test and make sure setting the iteration number did not change the prediction
    pred_id, mojo_out = pyunit_utils.mojo_predict(model, tmp_dir, mojo_name, glrmIterNumber=100)
    h2o_out = h2o.get_frame("GLRMLoading_"+pred_id)
    print("Comparing mojo x Factor and model x Factor for 100 iterations")
    pyunit_utils.compare_frames_local(h2o_out, mojo_out, 1, tol=1e-10)

    pred_id, mojo_out = pyunit_utils.mojo_predict(model, tmp_dir, mojo_name, glrmIterNumber=1)
    print("Comparing mojo x Factor and model x Factor for 1 iterations")
    pyunit_utils.compare_frames_local(expected_1iter, mojo_out, 1, tol=1e-10)

    pred_id, mojo_out = pyunit_utils.mojo_predict(model, tmp_dir, mojo_name, glrmIterNumber=10)
    print("Comparing mojo x Factor and model x Factor for 10 iterations")
    pyunit_utils.compare_frames_local(expected_10iter, mojo_out, 1, tol=1e-10)
def custom_distribution_mojo_test():
    """Check that a GBM with a custom distribution scores the same as its MOJO.

    Builds a 10-tree GBM using ``custom_distribution_bernoulli()`` on a random
    binomial dataset whose response is converted to numeric, then asserts that
    the two frames returned by ``mojo_predict`` compare equal.
    """
    n_rows = 2000
    frame = random_dataset('binomial', verbose=False, NTESTROWS=n_rows)
    frame['response'] = frame['response'].asnumeric()
    train = frame[n_rows:, :]
    test = frame[:n_rows, :]
    x = list(set(frame.names) - {"response"})

    gbm_params = {'ntrees': 10, 'max_depth': 4, 'distribution': "custom"}
    gbm_params['custom_distribution_func'] = custom_distribution_bernoulli()

    model = build_save_model_GBM(gbm_params, x, train, "response")
    mojo_name = getMojoName(model._id)
    # NOTE(review): '__file__' is a string literal, so this path is resolved
    # against the current working directory; presumably it matches where
    # build_save_model_GBM saved the MOJO -- confirm.
    results_dir = os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name)
    tmp_dir = os.path.normpath(results_dir)

    # save test file, h2o predict/mojo use same file
    in_csv = os.path.join(tmp_dir, 'in.csv')
    h2o.download_csv(test[x], in_csv)
    # load model and perform predict
    pred_h2o, pred_mojo = mojo_predict(model, tmp_dir, mojo_name)
    frames_match = compare_frames_local(pred_h2o, pred_mojo, returnResult=True)
    assert frames_match, "Predictions from model and MOJO model are not the same."
def glm_multinomial_mojo_pojo():
    """Compare H2O, MOJO and POJO predictions for a multinomial GLM.

    Trains a GLM on a random multinomial dataset (parameters from
    ``set_params()``), scores the first 200 rows through ``mojo_predict`` and
    ``pojo_predict``, and compares H2O vs. MOJO and MOJO vs. POJO frames.
    """
    problem = "multinomial"
    n_test_rows = 200
    glm_params = set_params()  # model parameters for the GLM
    frame = pyunit_utils.random_dataset(problem)  # generate random dataset
    train = frame[n_test_rows:, :]
    test = frame[:n_test_rows, :]
    x = list(set(frame.names) - {"response"})

    # build and save mojo model
    model = pyunit_utils.build_save_model_GLM(glm_params, x, train, "response")

    mojo_name = pyunit_utils.getMojoName(model._id)
    # NOTE(review): '__file__' is a string literal, so this path is resolved
    # against the current working directory; presumably it matches where
    # build_save_model_GLM saved the MOJO -- confirm.
    results_dir = os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name)
    tmp_dir = os.path.normpath(results_dir)

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(tmp_dir, 'in.csv'))
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, tmp_dir, mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(tmp_dir, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(model, tmp_dir, mojo_name)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def deeplearning_mojo_pojo():
    """Compare H2O, MOJO and POJO predictions for a deep learning model.

    Uses the module-level ``PROBLEM``, ``NTESTROWS``, ``TMPDIR`` and
    ``MOJONAME`` names. Any exception raised while building or scoring is
    printed; only an ``AssertionError`` (a failed comparison) terminates the
    process with exit status 1, other exceptions are deliberately ignored.
    """
    h2o.remove_all()

    params = set_params()   # set deeplearning model parameters
    df = random_dataset(PROBLEM)       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    try:
        deeplearningModel = build_save_model(params, x, train)  # build and save mojo model
        h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearningModel, TMPDIR, MOJONAME)  # load model and perform predict
        pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
        h2o.save_model(deeplearningModel, path=TMPDIR, force=True)  # save model for debugging
        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_h2o, pred_mojo, 0.1, tol=1e-10)
        print("Comparing pojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_mojo, pred_pojo, 0.1, tol=1e-10)
    except Exception as ex:
        print("***************  ERROR and type is ")
        print(str(type(ex)))
        print(ex)
        # Only a failed assertion counts as a test failure; everything else is
        # reported above and ignored. isinstance() replaces the former
        # substring match on the type's string representation.
        if isinstance(ex, AssertionError):
            sys.exit(1)
def runComparisonTests(autoEncoder, actFun, missingValuesHandling,
                       setAllFactor, train, test, x):
    """Build a deep learning model and compare H2O, MOJO and POJO predictions.

    :param autoEncoder: when true, a build failure whose message contains
        "Trying to predict with an unstable model" silently ends the test;
        any other build failure is re-raised as a generic ``Exception``.
    :param actFun: forwarded to ``set_params``.
    :param missingValuesHandling: forwarded to ``set_params``.
    :param setAllFactor: forwarded to ``set_params``.
    :param train: frame used to build the model.
    :param test: frame whose ``x`` columns are written to ``in.csv`` for scoring.
    :param x: predictor column names.

    Relies on the module-level ``TMPDIR`` and ``MOJONAME`` names.
    """
    params = set_params(actFun, missingValuesHandling, setAllFactor,
                        autoEncoder)  # set deeplearning model parameters

    if autoEncoder:
        try:
            deeplearningModel = build_save_model(
                params, x, train)  # build and save mojo model
        except Exception as err:
            # An exception with no arguments, or a non-string first argument,
            # must not turn into an IndexError/TypeError inside this handler.
            reason = str(err.args[0]) if err.args else ""
            if "Trying to predict with an unstable model" not in reason:
                raise Exception(
                    'Deeplearning autoencoder model failed to build.  Fix it.')
            return
    else:
        deeplearningModel = build_save_model(
            params, x, train)  # build and save mojo model

    h2o.download_csv(test[x], os.path.join(
        TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        deeplearningModel, TMPDIR, MOJONAME)  # load model and perform predict
    pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
    h2o.save_model(deeplearningModel, path=TMPDIR,
                   force=True)  # save model for debugging
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o,
                                                   pred_mojo,
                                                   prob=1,
                                                   tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo,
                                                   pred_pojo,
                                                   prob=1,
                                                   tol=1e-10)
Ejemplo n.º 7
0
def glm_binomial_mojo_pojo():
    """Compare H2O, MOJO and POJO predictions for a binomial GLM.

    Trains a GLM on a random binomial dataset (parameters from
    ``set_params()``), saving the MOJO into a fresh temporary directory, then
    compares H2O vs. MOJO and MOJO vs. POJO frames for the first 200 rows.
    """
    h2o.remove_all()
    n_test_rows = 200  # number of test dataset rows
    problem = "binomial"
    glm_params = set_params()  # model parameters for the GLM
    frame = pyunit_utils.random_dataset(problem)  # generate random dataset
    train = frame[n_test_rows:, :]
    test = frame[:n_test_rows, :]
    x = list(set(frame.names) - {"response"})
    work_dir = tempfile.mkdtemp()

    # build and save mojo model
    model = pyunit_utils.build_save_model_generic(glm_params, x, train, "response", "glm", work_dir)
    mojo_name = pyunit_utils.getMojoName(model._id)

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(work_dir, 'in.csv'))
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(work_dir, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(model, work_dir, mojo_name)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def glm_gamma_offset_mojo():
    """Compare H2O and MOJO predictions for a gamma GLM with an offset column.

    Appends one random real column to the prostate dataset, trains a gamma GLM
    with ``offset_column="C1"`` and compares the frames returned by
    ``mojo_predict`` when scoring the training rows.
    """
    prostate_path = pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip")
    train = h2o.import_file(path=prostate_path)
    y = "DPROS"
    x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL"]
    # presumably "C1" is the name of the generated offset column -- verify
    x_offset = x + ["C1"]
    glm_params = {'family': "gamma", 'offset_column': "C1"}
    offset_col = pyunit_utils.random_dataset_real_only(train.nrow, 1, realR=3, misFrac=0, randSeed=12345)
    train = train.cbind(offset_col)

    work_dir = tempfile.mkdtemp()
    # build and save mojo model
    model = pyunit_utils.build_save_model_generic(glm_params, x, train, y, "glm", work_dir)
    mojo_name = pyunit_utils.getMojoName(model._id)

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(train[x_offset], os.path.join(work_dir, 'in.csv'))
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(work_dir, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    # compare mojo and model predict
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
def run_comparison_tests(auto_encoder, act_fun, missing_values_handling, set_all_factor, train, test, x):
    """Build a deep learning model and compare H2O, MOJO and POJO predictions.

    :param auto_encoder: when true, a build failure whose message contains
        "Trying to predict with an unstable model" silently ends the test;
        any other build failure is re-raised as a generic ``Exception``.
    :param act_fun: forwarded to ``set_params``.
    :param missing_values_handling: forwarded to ``set_params``.
    :param set_all_factor: forwarded to ``set_params``.
    :param train: frame used to build the model.
    :param test: frame whose ``x`` columns are written to ``in.csv`` for scoring.
    :param x: predictor column names.

    Relies on the module-level ``TMPDIR`` and ``MOJONAME`` names.
    """
    # set deeplearning model parameters
    params = set_params(act_fun, missing_values_handling, set_all_factor, auto_encoder)

    if auto_encoder:
        try:
            # build and save mojo model
            deeplearning_model = build_save_model(params, x, train)
        except Exception as err:
            # An exception with no arguments, or a non-string first argument,
            # must not turn into an IndexError/TypeError inside this handler.
            reason = str(err.args[0]) if err.args else ""
            if "Trying to predict with an unstable model" not in reason:
                raise Exception('Deeplearning autoencoder model failed to build.  Fix it.')
            return
    else:
        # build and save mojo model
        deeplearning_model = build_save_model(params, x, train)

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearning_model, TMPDIR, MOJONAME)
    pred_pojo = pyunit_utils.pojo_predict(deeplearning_model, TMPDIR, MOJONAME)
    # save model for debugging
    h2o.save_model(deeplearning_model, path=TMPDIR, force=True)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o, pred_mojo, prob=1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo, pred_pojo, prob=1, tol=1e-10)
def gam_gaussian_mojo():
    """Compare H2O and MOJO predictions for a gaussian GAM.

    Generates a random gaussian dataset with a 0.001 missing fraction, uses up
    to three of its real-typed predictor columns as GAM columns (scale 0.001
    each), and compares the frames returned by ``mojo_predict`` for the first
    200 rows.
    """
    h2o.remove_all()
    n_test_rows = 200  # number of test dataset rows
    problem = "gaussian"
    gam_params = set_params()  # base model parameters
    frame = pyunit_utils.random_dataset(problem, missing_fraction=0.001)  # generate random dataset
    column_names = frame.names

    # add GAM specific parameters
    gam_params["gam_columns"] = []
    gam_params["scale"] = []
    max_gam_cols = 3  # maximum number of gam columns
    picked = 0
    for col_name in column_names:
        # skip the response and every column that is not of type "real"
        if col_name == 'response' or str(frame.type(col_name)) != "real":
            continue
        gam_params["gam_columns"].append(col_name)
        gam_params["scale"].append(0.001)
        picked += 1
        if picked >= max_gam_cols:
            break

    train = frame[n_test_rows:, :]
    test = frame[:n_test_rows, :]
    x = list(set(frame.names) - {"response"})

    work_dir = tempfile.mkdtemp()
    # build and save mojo model
    model = pyunit_utils.build_save_model_generic(gam_params, x, train, "response", "gam", work_dir)
    mojo_name = pyunit_utils.getMojoName(model._id)
    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(work_dir, 'in.csv'))
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(work_dir, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
Ejemplo n.º 11
0
def gam_binomial_mojo():
    """Compare H2O and MOJO predictions for a binomial GAM.

    The same CSV file is imported twice, once as the training frame and once
    as the test frame; column ``C21`` is converted to a factor and used as the
    response with ``C1`` as the only predictor.
    """
    gam_params = set_params()
    data_path = pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv")
    train = h2o.import_file(data_path)
    test = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    train["C21"] = train["C21"].asfactor()
    test["C21"] = test["C21"].asfactor()
    x = ["C1"]
    y = "C21"

    work_dir = tempfile.mkdtemp()
    # build and save mojo model
    model = pyunit_utils.build_save_model_generic(gam_params, x, train, y, "gam", work_dir)
    mojo_name = pyunit_utils.getMojoName(model._id)

    # save test file (all columns), h2o predict/mojo use same file
    h2o.download_csv(test, os.path.join(work_dir, 'in.csv'))
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, work_dir, mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(work_dir, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
Ejemplo n.º 12
0
def deeplearning_mojo_pojo():
    """Compare H2O and MOJO predictions for a deep learning model.

    Uses the module-level ``PROBLEM``, ``NTESTROWS``, ``TMPDIR`` and
    ``MOJONAME`` names. Any exception raised while building or scoring is
    printed; only an ``AssertionError`` (a failed comparison) terminates the
    process with exit status 1, other exceptions are deliberately ignored.
    The POJO comparison is currently disabled (see the commented-out lines).
    """
    h2o.remove_all()

    params = set_params()  # set deeplearning model parameters
    df = random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    try:
        deeplearningModel = build_save_model(
            params, x, train)  # build and save mojo model
        h2o.download_csv(test[x], os.path.join(
            TMPDIR,
            'in.csv'))  # save test file, h2o predict/mojo use same file
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
            deeplearningModel, TMPDIR,
            MOJONAME)  # load model and perform predict
        # pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
        h2o.save_model(deeplearningModel, path=TMPDIR,
                       force=True)  # save model for debugging
        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_h2o, pred_mojo, 0.1, tol=1e-6)
    #  print("Comparing pojo predict and h2o predict...")
    #  pyunit_utils.compare_numeric_frames(pred_mojo, pred_pojo, 0.1, tol=1e-6)
    except Exception as ex:
        print("***************  ERROR and type is ")
        print(str(type(ex)))
        print(ex)
        # Only a failed assertion counts as a test failure; everything else is
        # reported above and ignored. isinstance() replaces the former
        # substring match on the type's string representation.
        if isinstance(ex, AssertionError):
            sys.exit(1)
def glm_fractional_binomial_mojo_pojo():
    """Compare H2O, MOJO and POJO predictions for a fractional binomial GLM.

    The same CSV file is imported as both training and test frame. Column
    index 3 is dropped from the H2O prediction frame before it is compared
    with the MOJO frame.
    """
    glm_params = set_params()
    data_path = pyunit_utils.locate("smalldata/glm_test/fraction_binommialOrig.csv")
    train = h2o.import_file(data_path)
    test = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/fraction_binommialOrig.csv"))
    x = ["log10conc"]
    y = "y"

    # build and save mojo model
    model = pyunit_utils.build_save_model_GLM(glm_params, x, train, y)

    mojo_name = pyunit_utils.getMojoName(model._id)
    # NOTE(review): '__file__' is a string literal, so this path is resolved
    # against the current working directory; presumably it matches where
    # build_save_model_GLM saved the MOJO -- confirm.
    results_dir = os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name)
    tmp_dir = os.path.normpath(results_dir)

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(tmp_dir, 'in.csv'))
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, tmp_dir, mojo_name)
    h2o.download_csv(pred_h2o, os.path.join(tmp_dir, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(model, tmp_dir, mojo_name)
    # presumably the MOJO output lacks this extra column -- verify
    pred_h2o = pred_h2o.drop(3)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def glrm_mojo():
    """Compare GLRM output from the live model, a reloaded model and the MOJO.

    Trains a seeded GLRM (k=3, max_iterations=10, random transform) on a
    seeded random regression dataset, saves it both as a MOJO and as a binary
    model, and compares: H2O vs. MOJO frames from ``mojo_predict`` with
    ``glrmReconstruct=True``, MOJO vs. prediction of the reloaded binary
    model, and the MOJO X factor vs. the ``GLRMLoading_<frameID>`` frame.
    """
    h2o.remove_all()
    NTESTROWS = 200    # number of test dataset rows
    df = pyunit_utils.random_dataset("regression", seed=1234)       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types)-1)]

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows==train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    # NOTE(review): '__file__' is a string literal, so this path is resolved
    # against the current working directory; presumably it matches where
    # save_GLRM_mojo wrote the MOJO -- confirm.
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True) # save mojo predict

    # Reload from the path save_model reports instead of rebuilding it from
    # MOJONAME, which need not be identical to the file name H2O wrote.
    saved_model_path = h2o.save_model(glrmModel, TMPDIR)   # save GLRM model
    glrmModel2 = h2o.load_model(saved_model_path)
    predict_model = glrmModel2.predict(test)
    # factor columns are converted to numeric in both H2O frames before comparing
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
            predict_model[col] = predict_model[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    print("Comparing mojo predict and h2o predict from saved model...")
    pyunit_utils.compare_frames_local(pred_mojo, predict_model, 1, tol=1e-10)
    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False) # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_"+frameID)   # store the x Factor for new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
def get_glrm_xmatrix(train, test, K=3, compare_predict=True, tol=1e-1):
    """Train a GLRM, export its MOJO and compare the MOJO X factor with the model's.

    :param train: frame used to train the GLRM; all of its columns are used.
    :param test: frame whose columns ``train.names`` are scored by the MOJO.
    :param K: rank passed to the estimator as ``k``.
    :param compare_predict: when true (and the transform is not "NONE") the X
        factors are compared row by row within ``tol``; the reconstruction
        differences of ``predict`` on ``train`` and ``test`` are also printed.
    :param tol: tolerance forwarded to ``compare_data_rows``.
    """
    x = train.names
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen = randint(0, len(transform_types) - 1)
    transform = transform_types[chosen]
    print("dataset transform is {0}.".format(transform))

    # build a GLRM model on the training frame
    model = H2OGeneralizedLowRankEstimator(k=K, transform=transform, max_iterations=1000, seed=12345)
    model.train(x=x, training_frame=train)
    representation_key = model._model_json['output']['representation_name']
    train_x_factor = h2o.get_frame(representation_key)

    # assert glrmTrainFactor.nrows==train.nrows, \
    #     "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    mojo_dir = save_GLRM_mojo(model)  # save mojo model

    mojo_name = pyunit_utils.getMojoName(model._id)
    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(mojo_dir, 'in.csv'))

    # glrmReconstruct=False asks for the X factor
    frame_id, mojo_x_factor = pyunit_utils.mojo_predict(model, mojo_dir, mojo_name, glrmReconstruct=False)
    print("Comparing mojo x Factor and model x Factor ...")

    strict_compare = compare_predict and transform != "NONE"
    if strict_compare:
        row_indices = range(2, mojo_x_factor.nrows - 1)
        pyunit_utils.compare_data_rows(mojo_x_factor, train_x_factor, index_list=row_indices, tol=tol)
    else:
        # bad performance with no transformation on dataset
        pyunit_utils.check_data_rows(mojo_x_factor, train_x_factor, num_rows=mojo_x_factor.nrow)

    if compare_predict:  # only compare reconstructed data frames with numerical data
        test_pred = model.predict(test)  # in-cluster predict on the test frame
        train_pred = model.predict(train)  # predict using the X from A=X*Y from training

        train_diff = pyunit_utils.compute_frame_diff(train, train_pred)
        test_diff = pyunit_utils.compute_frame_diff(train, test_pred)
        print(
            "absolute difference of mojo predict and original frame is {0} and model predict and original frame is {1}"
            .format(test_diff, train_diff))
def glrm_mojo():
    """Check GLRM MOJO scoring with ``glrmIterNumber`` and time two settings.

    Trains a seeded GLRM (k=3, STANDARDIZE, max_iterations=10) on a seeded
    random regression dataset, compares the MOJO output for 100 iterations
    with the ``GLRMLoading_<predID>`` frame, then times MOJO-only scoring with
    8000 and with 2 iterations and prints both durations (no assertion is
    made on the timings).
    """
    h2o.remove_all()
    n_test_rows = 200  # number of test dataset rows
    frame = pyunit_utils.random_dataset("regression", seed=1234)  # generate random dataset
    train = frame[n_test_rows:, :]
    test = frame[:n_test_rows, :]
    x = frame.names

    transform = "STANDARDIZE"

    # build a GLRM model with the random dataset generated above
    model = H2OGeneralizedLowRankEstimator(k=3, transform=transform, max_iterations=10, seed=1234)
    model.train(x=x, training_frame=train)
    train_x_factor = h2o.get_frame(model._model_json['output']['representation_name'])

    assert train_x_factor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(train_x_factor.nrows, train.nrows)
    save_GLRM_mojo(model)  # save mojo model

    mojo_name = pyunit_utils.getMojoName(model._id)
    # NOTE(review): '__file__' is a string literal, so this path is resolved
    # against the current working directory; presumably it matches where
    # save_GLRM_mojo wrote the MOJO -- confirm.
    results_dir = os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name)
    tmp_dir = os.path.normpath(results_dir)
    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(tmp_dir, 'in.csv'))

    # test and make sure setting the iteration number did not change the prediction
    pred_id, mojo_out = pyunit_utils.mojo_predict(model, tmp_dir, mojo_name, glrmIterNumber=100)
    h2o_out = h2o.get_frame("GLRMLoading_" + pred_id)
    print("Comparing mojo x Factor and model x Factor for 100 iterations")
    pyunit_utils.compare_frames_local(h2o_out, mojo_out, 1, tol=1e-10)

    # scoring with 2 iterations should be shorter than scoring with 8000 iterations
    started = time.time()
    runMojoPredictOnly(tmp_dir, mojo_name, glrmIterNumber=8000)
    elapsed_8000_iter = time.time() - started

    started = time.time()
    runMojoPredictOnly(tmp_dir, mojo_name, glrmIterNumber=2)
    elapsed_2_iter = time.time() - started

    print(
        "Time taken for 2 iterations: {0}s.  Time taken for 8000 iterations: {1}s."
        .format(elapsed_2_iter, elapsed_8000_iter))
def drf_leaf_node_assignment_mojo_test():
    """Compare leaf node assignments from a DRF model and its MOJO.

    A problem type is chosen at random from binomial, multinomial and
    regression; a 50-tree DRF is built on the random dataset and the two
    frames returned by ``mojo_predict`` with ``get_leaf_node_assignment=True``
    are compared as string frames.
    """
    problems = ['binomial', 'multinomial', 'regression']
    PROBLEM = problems[randint(0, (len(problems) - 1))]
    TESTROWS = 2000
    df = pyunit_utils.random_dataset(PROBLEM, verbose=False, NTESTROWS=TESTROWS)
    train = df[TESTROWS:, :]
    test = df[:TESTROWS, :]
    # Exclude the response column from the predictors; the previous spelling
    # "respose" matched nothing, so the response stayed in x and in in.csv.
    x = list(set(df.names) - {"response"})
    params = {'ntrees': 50, 'max_depth': 4}
    TMPDIR = tempfile.mkdtemp()
    my_drf = pyunit_utils.build_save_model_generic(params, x, train, "response", "DRF", TMPDIR)
    MOJONAME = pyunit_utils.getMojoName(my_drf._id)

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(my_drf, TMPDIR, MOJONAME, get_leaf_node_assignment=True)  # load model and perform predict
    pyunit_utils.compare_string_frames_local(pred_h2o, pred_mojo, 0.5)
def runComparisonTests(autoEncoder, probleyType):
    """Build a deep learning model and compare H2O, MOJO and POJO predictions.

    :param autoEncoder: forwarded to ``set_params``.
    :param probleyType: problem type passed to ``random_dataset``.

    Relies on the module-level ``NTESTROWS``, ``TMPDIR`` and ``MOJONAME`` names.
    """
    dl_params = set_params(autoEncoder)  # set deeplearning model parameters
    frame = random_dataset(probleyType)  # generate random dataset
    train = frame[NTESTROWS:, :]
    test = frame[:NTESTROWS, :]
    x = list(set(frame.names) - {"response"})

    # build and save mojo model
    model = build_save_model(dl_params, x, train)
    # save test file, h2o predict/mojo use same file
    in_csv = os.path.join(TMPDIR, 'in.csv')
    h2o.download_csv(test[x], in_csv)
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, TMPDIR, MOJONAME)
    pred_pojo = pyunit_utils.pojo_predict(model, TMPDIR, MOJONAME)
    # save model for debugging
    h2o.save_model(model, path=TMPDIR, force=True)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o, pred_mojo, prob=1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo, pred_pojo, prob=1, tol=1e-10)
def gbm_leaf_node_assignment_mojo_test():
    """Compare leaf node assignments from a GBM model and its MOJO.

    A problem type is chosen at random from binomial, multinomial and
    regression; a 50-tree GBM is built on the random dataset and the two
    frames returned by ``mojo_predict`` with ``get_leaf_node_assignment=True``
    are compared as string frames.
    """
    problems = ['binomial', 'multinomial', 'regression']
    PROBLEM = problems[randint(0, (len(problems) - 1))]
    TESTROWS = 2000
    df = pyunit_utils.random_dataset(PROBLEM, verbose=False, NTESTROWS=TESTROWS)
    train = df[TESTROWS:, :]
    test = df[:TESTROWS, :]
    # Exclude the response column from the predictors; the previous spelling
    # "respose" matched nothing, so the response stayed in x and in in.csv.
    x = list(set(df.names) - {"response"})
    params = {'ntrees': 50, 'learn_rate': 0.1, 'max_depth': 4}

    my_gbm = pyunit_utils.build_save_model_GBM(params, x, train, "response")
    MOJONAME = pyunit_utils.getMojoName(my_gbm._id)
    # NOTE(review): '__file__' is a string literal, so this path is resolved
    # against the current working directory; presumably it matches where
    # build_save_model_GBM saved the MOJO -- confirm.
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(my_gbm, TMPDIR, MOJONAME, get_leaf_node_assignment=True)  # load model and perform predict
    pyunit_utils.compare_string_frames_local(pred_h2o, pred_mojo, 0.5)
Ejemplo n.º 20
0
def pca_mojo():
    """Compare H2O and MOJO predictions of a PCA model for every transform.

    Generates one seeded random regression dataset (5000-8000 column bounds,
    0.001 missing fraction) and, for each supported transform, trains a PCA
    model (k=3), saves its MOJO and compares the two frames returned by
    ``mojo_predict`` for the first 200 rows.
    """
    h2o.remove_all()
    n_test_rows = 200  # number of test dataset rows
    frame = pyunit_utils.random_dataset("regression", ncol_upper=8000, ncol_lower=5000,
                                        missing_fraction=0.001, seed=1234)
    train = frame[n_test_rows:, :]
    test = frame[:n_test_rows, :]
    x = frame.names

    # compare H2O predict and mojo predict for all dataset transform types
    for transform in ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]:
        model = H2OPrincipalComponentAnalysisEstimator(k=3, transform=transform, seed=1234,
                                                       impute_missing=True, use_all_factor_levels=False)
        model.train(x=x, training_frame=train)
        pyunit_utils.saveModelMojo(model)  # save mojo model
        mojo_name = pyunit_utils.getMojoName(model._id)
        # NOTE(review): '__file__' is a string literal, so this path is
        # resolved against the current working directory; presumably it
        # matches where saveModelMojo wrote the MOJO -- confirm.
        results_dir = os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name)
        tmp_dir = os.path.normpath(results_dir)
        # save test file, h2o predict/mojo use same file
        h2o.download_csv(test[x], os.path.join(tmp_dir, 'in.csv'))
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(model, tmp_dir, mojo_name)

        # factor columns of the H2O frame are converted to numeric before comparing
        for col_index in range(pred_h2o.ncols):
            if pred_h2o[col_index].isfactor():
                pred_h2o[col_index] = pred_h2o[col_index].asnumeric()

        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
def glm_multinomial_mojo_pojo():
    """Compare H2O, MOJO and POJO predictions for a multinomial GLM.

    A random multinomial dataset is split into a 200-row scoring set and a
    training set.  A GLM is built and saved through
    ``pyunit_utils.build_save_model_GLM``; the scoring set is then predicted
    in-cluster, through the MOJO and through the POJO, and the resulting
    frames are compared pairwise with a tolerance of 1e-10.
    """
    problem_type = "multinomial"
    n_test_rows = 200
    # Model parameters come from set_params(), defined elsewhere in this file.
    model_params = set_params()
    dataset = pyunit_utils.random_dataset(problem_type)
    train_frame = dataset[n_test_rows:, :]
    test_frame = dataset[:n_test_rows, :]
    predictors = list(set(dataset.names).difference({"response"}))

    # Build the model and save its MOJO.
    glm_model = pyunit_utils.build_save_model_GLM(
        model_params, predictors, train_frame, "response")

    mojo_name = pyunit_utils.getMojoName(glm_model._id)
    # NOTE(review): '__file__' is a quoted literal, so the path is relative
    # to the current working directory -- presumably matching the helper that
    # saved the MOJO; confirm.
    base_dir = os.path.dirname(os.path.realpath('__file__'))
    work_dir = os.path.normpath(
        os.path.join(base_dir, "..", "results", mojo_name))

    # H2O predict and MOJO predict both read this same input file.
    h2o.download_csv(test_frame[predictors], os.path.join(work_dir, 'in.csv'))
    h2o_scores, mojo_scores = pyunit_utils.mojo_predict(
        glm_model, work_dir, mojo_name)
    h2o.download_csv(h2o_scores, os.path.join(work_dir, "h2oPred.csv"))
    pojo_scores = pyunit_utils.pojo_predict(glm_model, work_dir, mojo_name)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(h2o_scores, mojo_scores, 0.1, tol=1e-10)
    # NOTE(review): the message below mentions h2o predict, but the frames
    # actually compared are the MOJO and POJO predictions.
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(mojo_scores, pojo_scores, 0.1, tol=1e-10)
def gam_gaussian_mojo():
    """Compare H2O and MOJO predictions for a gaussian GAM model.

    A random gaussian dataset (seed 2, half of the values missing) is
    generated.  Up to three non-response columns of type "real" are chosen as
    GAM columns, each with a scale of 0.001.  The model is built and saved
    into a fresh temporary directory, the first 200 rows are scored both
    in-cluster and through the MOJO, and the two results are compared with a
    tolerance of 1e-10.
    """
    h2o.remove_all()
    n_test_rows = 200  # rows held out for scoring
    family = "gaussian"
    model_params = set_params()
    dataset = pyunit_utils.random_dataset(family, seed=2, missing_fraction=0.5)
    column_names = dataset.names

    # Collect the GAM-specific parameters: the first few real-valued
    # predictors, paired with one scale value each.
    max_gam_cols = 3
    gam_columns = []
    scales = []
    for name in column_names:
        if name == 'response' or str(dataset.type(name)) != "real":
            continue
        gam_columns.append(name)
        scales.append(0.001)
        if len(gam_columns) >= max_gam_cols:
            break
    model_params["gam_columns"] = gam_columns
    model_params["scale"] = scales

    train_frame = dataset[n_test_rows:, :]
    test_frame = dataset[:n_test_rows, :]
    # The response and the first GAM column are left out of the predictors.
    excluded = {"response", model_params["gam_columns"][0]}
    predictors = list(set(dataset.names) - excluded)

    work_dir = tempfile.mkdtemp()
    # Build the model and save its MOJO into work_dir.
    gam_model = pyunit_utils.build_save_model_generic(
        model_params, predictors, train_frame, "response", "gam", work_dir)
    mojo_name = pyunit_utils.getMojoName(gam_model._id)

    # The full scoring frame is written; H2O predict and MOJO predict both
    # read this same file.
    h2o.download_csv(test_frame, os.path.join(work_dir, 'in.csv'))
    h2o_scores, mojo_scores = pyunit_utils.mojo_predict(
        gam_model, work_dir, mojo_name)
    h2o.download_csv(h2o_scores, os.path.join(work_dir, "h2oPred.csv"))

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(h2o_scores, mojo_scores, 1, tol=1e-10)
def test_negativebinomial_GAM_MOJO():
    """Compare H2O and MOJO predictions for a GAM built on prostate data.

    The prostate dataset is imported twice, once for training and once for
    scoring.  A GAM with "GLEASON" as the response is built and saved into a
    fresh temporary directory using the parameters from ``set_params()``
    (presumably configuring the negative binomial family -- confirm there).
    The in-cluster and MOJO predictions are then compared with a tolerance
    of 1e-10.
    """
    dataset_relpath = "smalldata/prostate/prostate_complete.csv.zip"

    print("Read in prostate data.")
    training_data = h2o.import_file(path=pyunit_utils.locate(dataset_relpath))
    scoring_data = h2o.import_file(path=pyunit_utils.locate(dataset_relpath))

    print("Testing for family: Negative Binomial")
    predictors = [
        "ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS",
    ]
    model_params = set_params()
    work_dir = tempfile.mkdtemp()
    # Build the model and save its MOJO into work_dir.
    gam_model = pyunit_utils.build_save_model_generic(
        model_params, predictors, training_data, "GLEASON", "gam", work_dir)
    mojo_name = pyunit_utils.getMojoName(gam_model._id)

    # H2O predict and MOJO predict both read this same input file.
    input_csv = os.path.join(work_dir, 'in.csv')
    h2o.download_csv(scoring_data[predictors], input_csv)
    h2o_scores, mojo_scores = pyunit_utils.mojo_predict(
        gam_model, work_dir, mojo_name)
    h2o.download_csv(h2o_scores, os.path.join(work_dir, "h2oPred.csv"))

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(h2o_scores, mojo_scores, 0.1, tol=1e-10)
def runComparisonTests(autoEncoder, probleyType):
    """Compare H2O, MOJO and POJO predictions for a deeplearning model.

    A random dataset of the requested problem type is split into a scoring
    set (the first ``NTESTROWS`` rows) and a training set.  The model is built
    and saved via ``build_save_model``; the scoring set is then predicted
    in-cluster, through the MOJO and through the POJO, and the results are
    compared with a tolerance of 1e-10.

    ``NTESTROWS``, ``TMPDIR`` and ``MOJONAME`` are read from module scope.

    :param autoEncoder: truthy when an autoencoder model is being tested.  In
        that case a build failure whose message mentions an unstable model is
        tolerated and the comparison is skipped.
    :param probleyType: problem type forwarded to ``random_dataset``.
    :raises Exception: when an autoencoder model fails to build for any
        reason other than model instability.  Build failures of
        non-autoencoder models propagate unchanged.
    """
    params = set_params(autoEncoder)  # set deeplearning model parameters
    df = random_dataset(probleyType)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    try:
        # build and save mojo model
        deeplearningModel = build_save_model(params, x, train)
    except Exception as err:
        if not autoEncoder:
            # Only autoencoder builds get special treatment.
            raise
        # Extract the message defensively: indexing err.args directly would
        # raise IndexError for an argument-less exception, and the substring
        # test would raise TypeError for a non-string first argument, hiding
        # the real build failure.
        message = str(err.args[0]) if err.args else str(err)
        if "Trying to predict with an unstable model" not in message:
            raise Exception(
                'Deeplearning autoencoder model failed to build.  Fix it.')
        # An unstable autoencoder model is an accepted outcome: nothing to
        # compare, so stop here.
        return

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        deeplearningModel, TMPDIR, MOJONAME)
    pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
    # save model for debugging
    h2o.save_model(deeplearningModel, path=TMPDIR, force=True)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o,
                                                   pred_mojo,
                                                   prob=1,
                                                   tol=1e-10)
    # NOTE(review): the message below mentions h2o predict, but the frames
    # actually compared are the MOJO and POJO predictions.
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo,
                                                   pred_pojo,
                                                   prob=1,
                                                   tol=1e-10)