def setup_grid():
    h2o.remove_all()
    hyper_parameters = OrderedDict()
    hyper_parameters["learn_rate"] = [0.1, 0.05, 0.01]
    hyper_parameters["ntrees"] = [1, 3, 5]
    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)
    return gs
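# A minimal usage sketch for setup_grid() (an assumption, not part of the original
# tests; relies on the same h2o / pyunit_utils imports as the surrounding examples):
# train the returned grid on the iris frame used elsewhere in this listing.
def run_setup_grid_example():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    gs = setup_grid()
    gs.train(x=list(range(4)), y=4, training_frame=train)  # 3 learn rates x 3 ntrees = 9 models
    print("Grid built %d models" % len(gs.model_ids))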
def pca_pubdev_4167_OOM():
  """
  This pyunit is written to make sure PCA works with customer data. It is mainly used by the customer to verify
  PCA operations and is not meant to run as a regular test, since we do not want to expose customer data.
  """
  h2o.remove_all()
  transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]   # make sure we check all tranforms
  transformN = transform_types[randint(0, len(transform_types)-1)]
  print("transform used on dataset is {0}.\n".format(transformN))

  training_data = h2o.import_file(path=pyunit_utils.locate("/Users/wendycwong/gitBackup/SDatasets/pubdev_4167_Avkash/m120K.tar"))  # Nidhi: import may not work

  gramSVDPCA = H2OPCA(k=training_data.ncols, transform=transformN)
  gramSVDPCA.train(x=list(range(0, training_data.ncols)), training_frame=training_data)

  powerSVDPCA = H2OPCA(k=training_data.ncols, transform=transformN, pca_method="Power")
  powerSVDPCA.train(x=list(range(0, training_data.ncols)), training_frame=training_data)

  # compare eigenvalues and eigenvectors between the GramSVD and Power methods
  print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
  pyunit_utils.assert_H2OTwoDimTable_equal(gramSVDPCA._model_json["output"]["importance"],
                                           powerSVDPCA._model_json["output"]["importance"],
                                           ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
                                           tolerance=1e-5, check_all=False)
  print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
  # compare singular vectors
  pyunit_utils.assert_H2OTwoDimTable_equal(gramSVDPCA._model_json["output"]["eigenvectors"],
                                           powerSVDPCA._model_json["output"]["eigenvectors"],
                                           powerSVDPCA._model_json["output"]["names"], tolerance=1e-1,
                                           check_sign=True)
Example 3
def grid_resume():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    # Run GBM Grid Search
    ntrees_opts = [1,5]
    hyper_parameters = OrderedDict()
    hyper_parameters["ntrees"] = ntrees_opts
    print("GBM grid with the following hyper_parameters:", hyper_parameters)

    export_dir = pyunit_utils.locate("results")
    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters,
                       export_checkpoints_dir=export_dir)
    gs.train(x=list(range(4)), y=4, training_frame=train)
    grid_id = gs.grid_id
    old_grid_model_count = len(gs.model_ids)
    print("Baseline grid has %d models" % old_grid_model_count)
    h2o.remove_all()

    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    grid = h2o.load_grid(export_dir + "/" + grid_id)
    assert grid is not None
    assert len(grid.model_ids) == old_grid_model_count
    grid.train(x=list(range(4)), y=4, training_frame=train)
    assert len(grid.model_ids) == old_grid_model_count
    print("Newly grained grid has %d models" % len(grid.model_ids))
    
    for model_id in grid.model_ids:
        model = h2o.get_model(model_id)
        assert model is not None
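# The grid above is persisted implicitly through export_checkpoints_dir; a later
# example in this listing exports explicitly with h2o.save_grid instead. A minimal
# sketch of the explicit variant (argument values are assumptions):
def save_and_reload_grid(gs, export_dir):
    saved_path = h2o.save_grid(export_dir, gs.grid_id)  # writes the grid and its member models
    h2o.remove_all()                                    # simulate a cluster restart
    return h2o.load_grid(saved_path)                    # restores the grid and its models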
def deepwater_checkpoint():
  if not H2ODeepWaterEstimator.available(): return

  ## build a model
  #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
  frame.drop(0)  # note: drop() returns a new frame; 'frame' itself is unchanged here
  frame[1] = frame[1].asfactor()
  print(frame.head(5))
  model = H2ODeepWaterEstimator(epochs=50, learning_rate=1e-5, stopping_rounds=0, score_duty_cycle=1, train_samples_per_iteration=-1, score_interval=0)
  model.train(y=1, training_frame=frame)

  ## save the model
  model_path = h2o.save_model(model)

  ## delete everything - simulate cluster shutdown and restart
  h2o.remove_all()

  ## reimport the model and the frame
  model = h2o.load_model(model_path)
  #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
  frame.drop(0)  # note: drop() returns a new frame; 'frame' itself is unchanged here
  frame[1] = frame[1].asfactor()
  
  ## delete the checkpoint file
  os.remove(model_path)

  ## continue training
  model2 = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-5, stopping_rounds=0,score_duty_cycle=1, train_samples_per_iteration=-1, score_interval=0, checkpoint=model.model_id)
  model2.train(y=1, training_frame=frame)
  model2.show()
def test_gam_transformed_frame_serialization():
    h2o_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"
    ))
    h2o_data["C1"] = h2o_data["C1"].asfactor()
    h2o_data["C2"] = h2o_data["C2"].asfactor()
    myX = ["C1", "C2"]
    myY = "C11"
    h2o_data["C11"] = h2o_data["C11"].asfactor()
    h2o_model = H2OGeneralizedAdditiveEstimator(family="multinomial",
                                                gam_columns=["C6", "C7", "C8"],
                                                keep_gam_cols=True,
                                                scale=[1, 1, 1],
                                                num_knots=[5, 5, 5])
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    gam_frame = h2o.get_frame(
        h2o_model._model_json["output"]["gam_transformed_center_key"])
    tmpdir = tempfile.mkdtemp()
    filename = os.path.join(tmpdir, "gamXFrame.csv")
    h2o.download_csv(gam_frame, filename)
    model_path = h2o.save_model(h2o_model, tmpdir)

    h2o.remove_all()
    loaded_model = h2o.load_model(model_path)
    gam_frame_loaded = h2o.get_frame(
        loaded_model._model_json["output"]["gam_transformed_center_key"])
    gam_frame_original = h2o.import_file(filename)
    pyunit_utils.compare_frames_local(gam_frame_loaded[2:15],
                                      gam_frame_original[2:15],
                                      prob=1,
                                      tol=1e-6)
    print("Test completed.")
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200    # number of test dataset rows
    df = pyunit_utils.random_dataset("regression")       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types)-1)]
    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows==train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel) # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True) # save mojo predict
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)

    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False) # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_"+frameID)   # store the x Factor for new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
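# save_GLRM_mojo is defined elsewhere in the test file; a plausible sketch (an
# assumption, the real pyunit helper may differ) that writes the MOJO zip where
# pyunit_utils.mojo_predict expects to find it:
def save_GLRM_mojo(model):
    mojo_name = pyunit_utils.getMojoName(model._id)
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')),
                                           "..", "results", mojo_name))
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)
    model.download_mojo(path=tmpdir)  # saves <model_id>.zip into tmpdir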
Example 7
def grid_resume():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM Grid Search
    ntrees_opts = [1, 3]
    learn_rate_opts = [0.1, .05]
    hyper_parameters = OrderedDict()
    hyper_parameters["learn_rate"] = learn_rate_opts
    hyper_parameters["ntrees"] = ntrees_opts
    print("GBM grid with the following hyper_parameters:", hyper_parameters)
    
    export_dir = pyunit_utils.locate("results")
    gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)
    grid_id = gs.grid_id
    old_grid_model_count = len(gs.model_ids)
    print("Baseline grid has %d models" % old_grid_model_count)
    saved_path = h2o.save_grid(export_dir, grid_id)
    h2o.remove_all()

    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    grid = h2o.load_grid(saved_path)
    assert grid is not None
    assert len(grid.model_ids) == old_grid_model_count
    # Modify the hyperspace - should add new models to the grid
    hyper_parameters["ntrees"] = [2,5]
    grid = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters, grid_id = grid.grid_id)
    grid.train(x=list(range(4)), y=4, training_frame=train)
    print("Newly grained grid has %d models" % len(grid.model_ids))
    assert len(grid.model_ids) == 2 * old_grid_model_count
    
    for model_id in grid.model_ids:
        model = h2o.get_model(model_id)
        assert model is not None
def deeplearning_mojo_pojo():
    h2o.remove_all()

    params = set_params()   # set deeplearning model parameters
    df = random_dataset(PROBLEM)       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    try:
        deeplearningModel = build_save_model(params, x, train) # build and save mojo model
        h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearningModel, TMPDIR, MOJONAME)  # load model and perform predict
        pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
        h2o.save_model(deeplearningModel, path=TMPDIR, force=True)  # save model for debugging
        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_h2o, pred_mojo, 0.1, tol=1e-10)    # make sure operation sequence is preserved from Tomk
        print("Comparing pojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_mojo, pred_pojo, 0.1, tol=1e-10)
    except Exception as ex:
        print("***************  ERROR and type is ")
        print(str(type(ex)))
        print(ex)
        if "AssertionError" in str(type(ex)):   # only care if there is an AssertionError, ignore the others
            sys.exit(1)
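# set_params() and build_save_model() are defined elsewhere in the test file; a
# hypothetical sketch of the parameter helper (names and values are assumptions):
def set_params():
    params = {"hidden": [10, 10],
              "epochs": 2,
              "reproducible": True,  # deterministic scoring so h2o/mojo/pojo predictions agree
              "seed": 1234}
    print(params)
    return params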
Example 9
def glrm_mojo():
    h2o.remove_all()
    train = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_mojo_train.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_mojo_test.csv"))
    predict_10iter = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_predict_10iter.csv"))
    predict_1iter = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_predict_1iter.csv"))

    x = train.names
    transformN = "STANDARDIZE"

    # build a GLRM model with the imported training dataset
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234, init="random")
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows==train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel) # save mojo model
    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    # test and make sure setting the iteration number did not screw up the prediction
    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=100) # save mojo predict
    pred_h2o = h2o.get_frame("GLRMLoading_"+predID)
    print("Comparing mojo x Factor and model x Factor for 100 iterations")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=1) # save mojo predict
    print("Comparing mojo x Factor and model x Factor for 1 iterations")
    pyunit_utils.compare_frames_local(predict_1iter, pred_mojo, 1, tol=1e-10)
    predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=10) # save mojo predict
    print("Comparing mojo x Factor and model x Factor for 10 iterations")
    pyunit_utils.compare_frames_local(predict_10iter, pred_mojo, 1, tol=1e-10)
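# Note: glrmIterNumber appears to set how many iterations the MOJO scorer runs when
# solving for the X factor of new rows: the 100-iteration result is checked against
# the X factor h2o itself computes during predict, while the 1- and 10-iteration
# results are checked against pre-computed reference frames.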
def test_modelselection_backward_serialization():
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "GLEASON"
    x = ["ID","AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS"]
    # make sure duplicate runs produce same results
    model_backward = modelSelection(seed=12345, mode="backward", family='negativebinomial', link="log",alpha=0.5,
                                      lambda_=0, theta=0.01)
    model_backward.train(training_frame=d, x=x, y=y)
    model_backward2 = modelSelection(seed=12345, mode="backward", family='negativebinomial', link="log",alpha=0.5,
                                    lambda_=0, theta=0.01)
    model_backward2.train(training_frame=d, x=x, y=y)
    result = model_backward.result()    # get result frame
    result2 = model_backward2.result()    # get result frame from the second model
    pyunit_utils.compare_frames_local(result[2:5], result2[2:5], prob=1.0) # compare results from both models; they should be the same

    num_models = result.nrows           # number of models built
    one_model = h2o.get_model(result["model_id"][num_models-1, 0])
    predict_frame = one_model.predict(d)
    tmpdir = tempfile.mkdtemp()
    file_dir = os.path.join(tmpdir, "predict.csv")
    h2o.download_csv(predict_frame, file_dir) # save one scoring frame
    model_path_backward = model_backward.download_model(tmpdir) # store the model

    h2o.remove_all()
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    loaded_backward_model = h2o.load_model(model_path_backward)    
    result_frame_backward = loaded_backward_model.result()

    model_from_frame_backward = h2o.get_model(result_frame_backward["model_id"][num_models-1, 0])
    pred_frame_backward = model_from_frame_backward.predict(d)
    pred_frame_model = h2o.import_file(file_dir)
    pyunit_utils.compare_frames_local(pred_frame_backward, pred_frame_model, prob=1.0)
Example 11
def check_big_merge():
    h2o.remove_all()
    nrow = 1000000
    ncol = 2
    iRange = 100000
    frame1 = h2o.create_frame(rows=nrow,
                              cols=ncol,
                              integer_fraction=1,
                              seed=12345,
                              integer_range=iRange,
                              missing_fraction=0.0)
    frame2 = h2o.create_frame(rows=nrow,
                              cols=ncol,
                              integer_fraction=1,
                              seed=54321,
                              integer_range=iRange,
                              missing_fraction=0.0)

    frame1.set_names(["C1", "C2"])
    frame2.set_names(["C1", "C3"])

    mergedExact = frame1.merge(frame2,
                               by_x=["C1"],
                               by_y=["C1"],
                               all_x=False,
                               all_y=False)
    mergedLeft = frame1.merge(frame2, by_x=["C1"], by_y=["C1"], all_x=True)

    assert mergedExact.nrow < mergedLeft.nrow, "Expected the inner merge to return fewer rows than the left merge"
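# The assertion holds because the inner merge keeps only keys present in both
# frames, while all_x=True keeps every frame1 row and pads C3 with NAs. A toy
# illustration (the literal frames below are assumptions, not test data):
def merge_semantics_example():
    left = h2o.H2OFrame({"C1": [1, 2, 3], "C2": [10, 20, 30]})
    right = h2o.H2OFrame({"C1": [2, 3, 4], "C3": [200, 300, 400]})
    inner = left.merge(right, by_x=["C1"], by_y=["C1"], all_x=False, all_y=False)
    left_join = left.merge(right, by_x=["C1"], by_y=["C1"], all_x=True)
    print(inner.nrow, left_join.nrow)  # 2 vs 3; C3 is NA on the row where C1 == 1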
Example 12
def glm_binomial_mojo_pojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    PROBLEM = "binomial"
    params = set_params()  # set GLM model parameters
    df = pyunit_utils.random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})
    TMPDIR = tempfile.mkdtemp()
    glmBinomialModel = pyunit_utils.build_save_model_generic(
        params, x, train, "response", "glm",
        TMPDIR)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glmBinomialModel._id)

    h2o.download_csv(test[x], os.path.join(
        TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        glmBinomialModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmBinomialModel, TMPDIR, MOJONAME)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(
        pred_h2o, pred_mojo, 0.1, tol=1e-10
    )  # make sure operation sequence is preserved from Tomk
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
Example 13
def test_anovaglm_serialization():
    train = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/prostate/prostate_complete.csv.zip"))
    y = 'CAPSULE'
    x = ['AGE', 'VOL', 'DCAPS']
    train[y] = train[y].asfactor()
    anovaglm_model = anovaglm(family='binomial',
                              lambda_=0,
                              missing_values_handling="skip")
    anovaglm_model.train(x=x, y=y, training_frame=train)

    tmpdir = tempfile.mkdtemp()
    model_path = anovaglm_model.download_model(tmpdir)
    result_frame_filename = os.path.join(tmpdir, "result_frame.csv")
    h2o.download_csv(anovaglm_model.result(), result_frame_filename)

    h2o.remove_all()
    result_frame_original = h2o.import_file(result_frame_filename)
    loaded_anovaglm_model = h2o.load_model(model_path)
    result_frame_loaded = loaded_anovaglm_model.result()
    for cind in list(range(0, result_frame_original.ncols)):
        for rind in list(range(0, result_frame_original.nrows)):
            if result_frame_original.type(cind) == 'real':
                assert abs(result_frame_original[rind, cind]-result_frame_loaded[rind, cind]) < 1e-6, \
                    "Expected: {0}. Actual: {1}".format(result_frame_original[rind, cind], result_frame_loaded[rind, cind])
            else:
                assert result_frame_original[rind, cind]==result_frame_loaded[rind, cind], \
                    "Expected: {0}. Actual: {1}".format(result_frame_original[rind, cind], result_frame_loaded[rind, cind])
Example 14
def grid_export_with_cv():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Run GBM Grid Search
    hyper_parameters = OrderedDict()
    hyper_parameters["ntrees"] = [1, 2]

    # train with CV
    gs = H2OGridSearch(H2OGradientBoostingEstimator(nfolds=2, keep_cross_validation_predictions=True, seed=42),
                       hyper_params=hyper_parameters)
    gs.train(x=list(range(4)), y=4, training_frame=train)

    holdout_frame_ids = list(map(lambda m: m.cross_validation_holdout_predictions().frame_id, gs.models))  # materialize before remove_all()

    export_dir = pyunit_utils.locate("results")
    saved_path = h2o.save_grid(export_dir, gs.grid_id, export_cross_validation_predictions=True)

    h2o.remove_all()

    grid = h2o.load_grid(saved_path)

    assert grid is not None
    for holdout_frame_id in holdout_frame_ids:
        assert h2o.get_frame(holdout_frame_id) is not None

    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    stack = H2OStackedEnsembleEstimator(base_models=grid.model_ids)
    stack.train(x=list(range(4)), y=4, training_frame=train)

    predicted = stack.predict(train)
    assert predicted.nrow == train.nrow
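# Stacking after a reload works here only because the base models were trained with
# keep_cross_validation_predictions=True and the grid was saved with
# export_cross_validation_predictions=True; without both, the reloaded ensemble
# would have no holdout predictions to build its level-one training frame from.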
def gam_gaussian_mojo():
    h2o.remove_all()
    NTESTROWS = 200    # number of test dataset rows
    PROBLEM="gaussian"
    params = set_params()   # set deeplearning model parameters
    df = pyunit_utils.random_dataset(PROBLEM, missing_fraction=0.001)   # generate random dataset
    dfnames = df.names
    # add GAM specific parameters
    params["gam_columns"] = []
    params["scale"] = []
    count = 0
    num_gam_cols = 3    # maximum number of gam columns
    for cname in dfnames:
        if cname != 'response' and str(df.type(cname)) == "real":
            params["gam_columns"].append(cname)
            params["scale"].append(0.001)
            count = count+1
            if (count >= num_gam_cols):
                break
    
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    TMPDIR = tempfile.mkdtemp()
    gamGaussianModel = pyunit_utils.build_save_model_generic(params, x, train, "response", "gam", TMPDIR) # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(gamGaussianModel._id)
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(gamGaussianModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)    # make sure operation sequence is preserved from Tomk
Example 16
def test_frame_reload():
    work_dir = tempfile.mkdtemp()
    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    df_key = iris.key
    df_pd_orig = iris.as_data_frame()
    iris.save(work_dir)
    try:
        iris.save(work_dir, force=False)  # fails because file exists
    except H2OResponseError as e:
        assert e.args[0].exception_msg.startswith("File already exists")
    try:
        h2o.load_frame(df_key, work_dir,
                       force=False)  # fails because frame exists
    except H2OResponseError as e:
        assert e.args[
            0].exception_msg == "Frame Key<Frame> iris_wheader.hex already exists."
    df_loaded_force = h2o.load_frame(df_key, work_dir)
    h2o.remove(iris)
    df_loaded = h2o.load_frame(df_key, work_dir, force=False)
    df_pd_loaded_force = df_loaded_force.as_data_frame()
    df_pd_loaded = df_loaded.as_data_frame()
    assert df_pd_orig.equals(df_pd_loaded_force)
    assert df_pd_orig.equals(df_pd_loaded)

    # try running grid search on the frame
    h2o.remove_all()
    df_loaded = h2o.load_frame(df_key, work_dir)
    hyper_parameters = OrderedDict()
    hyper_parameters["ntrees"] = [5, 10, 20, 30]
    grid_small = H2OGridSearch(H2OGradientBoostingEstimator,
                               hyper_params=hyper_parameters)
    grid_small.train(x=list(range(4)), y=4, training_frame=df_loaded)
    assert len(grid_small.models) == 4
Example 17
def check_story(story_name, paragraphs):
    h2o.remove_all()

    h2o.log_and_echo("------------------------------------------------------------")
    h2o.log_and_echo("")
    h2o.log_and_echo("CHECKING: {0}".format(story_name))
    h2o.log_and_echo("")
    h2o.log_and_echo("------------------------------------------------------------")

    # 1. Combine the related, individual code paragraphs into a single, coherent python story
    story = []
    for p in paragraphs:
        with open(p, "r") as f: story = story + f.readlines()

    # 2. Execute the story

    # first, remove any h2o.init calls
    remove_lines = []
    for idx, l in enumerate(story):
        if "h2o.init" in l: remove_lines.append(idx)
    story = [i for j, i in enumerate(story) if j not in remove_lines]

    # write the story that will be executed to the results directory for future reference
    story_file = os.path.join(results_dir(), test_name()+"."+story_name+".code")
    with open(story_file, 'w') as f: f.writelines(story)

    # run it
    with open(story_file, "r") as s: booklet = s.read()
    booklet_c = compile(booklet, '<string>', 'exec')
    p = {}
    exec(booklet_c, p)
Example 18
def save_load_model_with_cv():
    prostate = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    prostate_gbm = H2OGradientBoostingEstimator(
        nfolds=2, keep_cross_validation_predictions=True)
    prostate_gbm.train(x=["AGE", "RACE", "PSA", "DCAPS"],
                       y="CAPSULE",
                       training_frame=prostate)
    path = pyunit_utils.locate("results")

    model_path = h2o.save_model(prostate_gbm,
                                path=path,
                                force=True,
                                export_cross_validation_predictions=True)
    assert os.path.isfile(
        model_path
    ), "Expected model artifact {0} to exist, but it does not.".format(
        model_path)

    h2o.remove_all()

    prostate_gbm_reloaded = h2o.load_model(model_path)
    assert isinstance(prostate_gbm_reloaded, H2OGradientBoostingEstimator), \
        "Expected H2OGradientBoostingEstimator, but got {0}".format(prostate_gbm_reloaded)

    holdout_frame_id = prostate_gbm.cross_validation_holdout_predictions(
    ).frame_id
    assert h2o.get_frame(holdout_frame_id) is not None
Example 19
def deeplearning_mojo_pojo():
    h2o.remove_all()

    params = set_params()  # set deeplearning model parameters
    df = random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    try:
        deeplearningModel = build_save_model(
            params, x, train)  # build and save mojo model
        h2o.download_csv(test[x], os.path.join(
            TMPDIR,
            'in.csv'))  # save test file, h2o predict/mojo use same file
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
            deeplearningModel, TMPDIR,
            MOJONAME)  # load model and perform predict
        # pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
        h2o.save_model(deeplearningModel, path=TMPDIR,
                       force=True)  # save model for debugging
        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_h2o, pred_mojo, 0.1, tol=1e-6)
    #  print("Comparing pojo predict and h2o predict...")
    #  pyunit_utils.compare_numeric_frames(pred_mojo, pred_pojo, 0.1, tol=1e-6)
    except Exception as ex:
        print("***************  ERROR and type is ")
        print(str(type(ex)))
        print(ex)
        if "AssertionError" in str(
                type(ex)
        ):  # only care if there is an AssertionError, ignore the others
            sys.exit(1)
Example 21
def train_with_h2o_full_data(time):
    import h2o
    from h2o.automl import H2OAutoML
    h2o.init()
    h2o.remove_all()
    raw_train_df = h2o.import_file(
        path='data/train_dataset/train_dataset_temp.csv')
    raw_test_df = h2o.import_file(
        path='data/test_dataset/test_dataset_temp.csv')
    train_df = raw_train_df[:, 1:]
    y = 'var_29'
    col_list = train_df.columns
    print(col_list)
    x = col_list[:-1]
    print('x:', x)
    print('y:', y)
    splits = train_df.split_frame(ratios=[0.9], seed=1)
    train = splits[0]
    test = splits[1]
    # partial-data run: 50,000 rows
    # aml1 = H2OAutoML(max_runtime_secs=time,balance_classes=False,stopping_tolerance=0.005,stopping_rounds=50,sort_metric='MAE',stopping_metric='MAE',seed=2019, project_name="part_data_train")
    # aml1.train(x=x,y=y,training_frame=train,leaderboard_frame=test)
    #full data train
    aml2 = H2OAutoML(max_runtime_secs=time,
                     balance_classes=False,
                     stopping_tolerance=0.005,
                     stopping_rounds=50,
                     sort_metric='MAE',
                     stopping_metric='MAE',
                     seed=2019,
                     project_name="full_data_train")
    aml2.train(x=x, y=y, training_frame=train_df)
    # path1=h2o.save_model(model=aml1,path='models',force=True)
    # path2=h2o.save_model(model=aml2,path='models',force=True)
    # print(aml1.leaderboard)
    print('++++++++++++++++++++++')
    print(aml2.leaderboard)
    # ans1=aml1.predict(raw_test_df[:,1:])
    ans2 = aml2.predict(raw_test_df[:, 1:])
    # print(ans1)
    print(ans2)
    # ans1=ans1.as_data_frame()
    ans2 = ans2.as_data_frame()
    # ans1.to_csv('data/ans1.csv',index=False)
    ans2.to_csv('data/ans2_time_{}.csv'.format(str(time)), index=False)

    # res1=pd.DataFrame()
    temp = pd.read_csv('data/test_dataset/test_dataset_temp.csv')
    # res1['id']=temp['var_0']
    # res1['score']=ans1.values
    # res1.to_csv('data/h2o_pred_submission_v1.csv',index=False)

    res2 = pd.DataFrame()
    res2['id'] = temp['var_0']
    res2['score'] = ans2.values
    res2['score'] = res2['score'].apply(lambda x: int(x)
                                        if (x - int(x)) < 0.5 else int(x) + 1)
    res2.to_csv('data/h2o_pred_submission_int_v2_time_{}.csv'.format(time),
                index=False)
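# Note: the commented-out h2o.save_model(model=aml1, ...) calls above would not work
# as written, since h2o.save_model expects a model rather than an H2OAutoML object;
# saving the leader is the usual pattern (a sketch, the path is an assumption):
#
#   path2 = h2o.save_model(model=aml2.leader, path='models', force=True)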
Example 23
def impute_missing_values(data_frame, columns):
    if not isinstance(data_frame, pd.DataFrame):
        return
    if not isinstance(columns, list):
        return

    # Impute summary
    impute_summary = {}

    # result frame
    result_frame = pd.DataFrame(data_frame)

    # Start h2o server
    h2o.init(max_mem_size_GB=5)
    for column in columns:
        print "Processing :", column

        # Defining columns
        response_column = column
        training_columns = list(data_frame.columns)
        training_columns.remove(response_column)

        # Creating h2o frame
        training_frame = h2o.H2OFrame(data_frame)
        training_frame.set_names(list(data_frame.columns))

        # Defining model
        model = H2ORandomForestEstimator(ntrees=75,
                                         max_depth=25,
                                         nbins=25,
                                         binomial_double_trees=True,
                                         nfolds=10)
        model.train(x=training_columns,
                    y=response_column,
                    training_frame=training_frame)

        # Predict values
        predictions = model.predict(test_data=training_frame)
        predictions = list(map(float, h2OColumnToList(predictions)))

        # Add predictions to the result frame
        result_frame[column] = predictions

        actual = data_frame[column]
        predicted = result_frame[column]

        rmse = sqrt(mean_squared_error(actual, predicted))
        impute_summary[column] = ('RMSE', rmse)

    # Removing all processes
    h2o.remove_all()

    # Displaying impute summary
    for key in impute_summary:
        print(impute_summary[key], key)

    return result_frame
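# A minimal usage sketch for impute_missing_values (the frame below is an
# assumption; relies on the helpers imported by the surrounding example): one
# random forest is trained per listed column, with the remaining columns as
# predictors, and the predictions replace that column in the result frame.
def impute_example():
    df = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0],
                       "b": [0.5, 0.7, 0.6, 0.8],
                       "c": [1.0, 2.0, 3.0, 4.0]})
    return impute_missing_values(df, ["a", "b"])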
def test_stacked_ensemble_is_able_to_use_imported_base_models():
    import tempfile, shutil, glob
    train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"))
    x = train.columns
    y = "species"
    x.remove(y)

    nfolds = 2
    gbm = H2OGradientBoostingEstimator(nfolds=nfolds,
                                       fold_assignment="Modulo",
                                       keep_cross_validation_predictions=True)
    gbm.train(x=x, y=y, training_frame=train)
    drf = H2ORandomForestEstimator(nfolds=nfolds,
                                   fold_assignment="Modulo",
                                   keep_cross_validation_predictions=True)
    drf.train(x=x, y=y, training_frame=train)

    se = H2OStackedEnsembleEstimator(training_frame=train,
                                     validation_frame=test,
                                     base_models=[gbm.model_id, drf.model_id])
    se.train(x=x, y=y, training_frame=train)

    assert len(se.base_models) == 2

    TMP_DIR = tempfile.mkdtemp()
    try:
        h2o.save_model(gbm, TMP_DIR + "/gbm.model")
        h2o.save_model(drf, TMP_DIR + "/drf.model")

        gbm_holdout_id = gbm.cross_validation_holdout_predictions().frame_id
        drf_holdout_id = drf.cross_validation_holdout_predictions().frame_id
        h2o.export_file(gbm.cross_validation_holdout_predictions(), TMP_DIR + "/gbm.holdout")
        h2o.export_file(drf.cross_validation_holdout_predictions(), TMP_DIR + "/drf.holdout")

        h2o.remove_all()

        h2o.import_file(TMP_DIR + "/gbm.holdout", gbm_holdout_id)
        h2o.import_file(TMP_DIR + "/drf.holdout", drf_holdout_id)

        gbm = h2o.upload_model(glob.glob(TMP_DIR + "/gbm.model/*")[0])
        drf = h2o.upload_model(glob.glob(TMP_DIR + "/drf.model/*")[0])

        train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"), "some_other_name_of_training_frame")
        test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"), "some_other_name_of_test_frame")
        x = train.columns
        y = "species"
        x.remove(y)

        se_loaded = H2OStackedEnsembleEstimator(training_frame=train,
                                                validation_frame=test,
                                                base_models=[gbm.model_id, drf.model_id])
        se_loaded.train(x=x, y=y, training_frame=train)

        assert len(se_loaded.base_models) == 2
    finally:
        shutil.rmtree(TMP_DIR)
def validacion_r(modelo, hyper_parameters, datos, variables, semilla=1234):    
    h2o.init(max_mem_size=14) 
    train = h2o.H2OFrame(datos[0])
    tipificar_h2o(train)
    splits = train.split_frame(ratios=[0.7], seed=semilla)
    gs = H2OGridSearch(modelo, hyper_params=hyper_parameters)
    gs.train(x=variables, y="Tendencia", training_frame=splits[0])
    resultados = procesamiento_resultados_binario(gs, splits, datos)
    h2o.remove_all()
    return resultados
def deeplearning_mojo_pojo():
    h2o.remove_all()
    problemtypes = ["regression", "binomial", "multinomial"]
    autoEncoderOn = [True, False]

    for encoderOn in autoEncoderOn:
        for problem in problemtypes:
            print("AutoEncoderOn is: {0} and problem type is: {1}".format(encoderOn, problem))
            random.seed(9876) # set python random seed
            runComparisonTests(encoderOn, problem)
def h2o_test_setup(sys_args):
    h2o_py_dir = os.path.realpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), ".."))
    h2o_docs_dir = os.path.realpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..",
                     "h2o-docs"))

    parse_args(sys_args)

    sys.path.insert(1, h2o_py_dir)
    import h2o
    from tests import pyunit_utils, pydemo_utils, pybooklet_utils

    for pkg in (pyunit_utils, pybooklet_utils):
        setattr(pkg, '__on_hadoop__', _ON_HADOOP_)
        setattr(pkg, '__hadoop_namenode__', _HADOOP_NAMENODE_)
        setattr(pkg, '__test_name__', _TEST_NAME_)
        setattr(pkg, '__results_dir__', _RESULTS_DIR_)

    if not (_IS_PYUNIT_ or _IS_IPYNB_ or _IS_PYBOOKLET_ or _IS_PYDEMO_):
        raise EnvironmentError(
            "Unrecognized test type. Must be of type ipynb, pydemo, pyunit, or pybooklet, but got: "
            "{0}".format(_TEST_NAME_))

    print("[{0}] {1}\n".format(
        strftime("%Y-%m-%d %H:%M:%S", gmtime()),
        "Connect to h2o on IP: {0} PORT: {1}".format(_H2O_IP_, _H2O_PORT_)))
    auth = None
    if _LDAP_USER_NAME_ is not None and _LDAP_PASSWORD_ is not None:
        auth = (_LDAP_USER_NAME_, _LDAP_PASSWORD_)
    h2o.connect(ip=_H2O_IP_, port=_H2O_PORT_, verbose=False, auth=auth)
    h2o.utils.config.H2OConfigReader.get_config(
    )["general.allow_breaking_changes"] = True

    #rest_log = os.path.join(_RESULTS_DIR_, "rest.log")
    #h2o.start_logging(rest_log)
    #print "[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Started rest logging in: {0}".format(rest_log))

    h2o.log_and_echo(
        "------------------------------------------------------------")
    h2o.log_and_echo("")
    h2o.log_and_echo("STARTING TEST: " + _TEST_NAME_)
    h2o.log_and_echo("")
    h2o.log_and_echo(
        "------------------------------------------------------------")

    h2o.remove_all()

    if _IS_IPYNB_: pydemo_utils.ipy_notebook_exec(_TEST_NAME_)
    elif _IS_PYUNIT_: pyunit_utils.pyunit_exec(_TEST_NAME_)
    elif _IS_PYBOOKLET_: pybooklet_utils.pybooklet_exec(_TEST_NAME_)
    elif _IS_PYDEMO_: pydemo_utils.pydemo_exec(_TEST_NAME_)
Example 28
def standalone_test(test):
    h2o.init(strict_version_check=False)

    h2o.remove_all()

    h2o.log_and_echo("------------------------------------------------------------")
    h2o.log_and_echo("")
    h2o.log_and_echo("STARTING TEST")
    h2o.log_and_echo("")
    h2o.log_and_echo("------------------------------------------------------------")
    test()
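# A minimal usage sketch for standalone_test (the wrapped callable is an
# assumption): any zero-argument pyunit function can be run through it to get its
# own h2o.init, cleanup, and log banner.
def my_smoke_test():
    frame = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    assert frame.nrow == 150

# standalone_test(my_smoke_test)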
Example 29
def sort():
    h2o.remove_all()
    df = h2o.import_file(
        pyunit_utils.locate(
            "bigdata/laptop/jira/PUBDEV_6829_srot_bug_bigKey_part.csv.zip"))
    t1 = time.time()
    df1 = df.sort([1])
    assert df1[0, 1] <= df1[1, 1], "Test failed: Sort bug."
    print("Time taken to perform sort is {0}".format(time.time() - t1))
    pyunit_utils.check_sorted_1_column(df1, 1, prob=0.00001,
                                       ascending=True)  # check some rows
Example 31
def deepwater_lenet():
  if not H2ODeepWaterEstimator.available(): return

  frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  print(frame.head(5))
  model = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-3, network='lenet', score_interval=0, train_samples_per_iteration=1000)
  model.train(x=[0],y=1, training_frame=frame)
  model.show()
  error = model.model_performance(train=True).mean_per_class_error()
  h2o.remove_all()
  assert error < 0.1, "mean classification error is too high : " + str(error)
def test_rbind_summary():
    h2o.remove_all()
    df = h2o.H2OFrame([1, 2, 5.5], destination_frame="df")  # original frame
    dfr = h2o.H2OFrame([5.5, 1, 2],
                       destination_frame="dfr")  # reversed row content
    df1 = df[2, :]
    df2 = df[:2, :]
    summary = df1.summary(return_data=True)
    df3 = df1.rbind(df2)  # fixed
    df3r = df2.rbind(df1)

    compareFramesLocal(dfr, df3)  # should contain 5.5, 1, 2
    compareFramesLocal(df, df3r)  # should contain 1,2,5.5

    df1 = df[3, :]  # this will result in an NA since we do not have 4 rows in df.
    dfr[0, 0] = float('nan')
    df4 = df1.rbind(df2)
    compareFramesLocal(df4, dfr)  # should contain NA, 1, 2

    # performing the same test with an additional categorical column, per Michalk's request.
    h2o.remove_all()
    df = h2o.H2OFrame([[1, "a"], [2, "b"], [5.5, "c"]],
                      destination_frame="dfc")  # original frame
    df[1] = df[1].asfactor()
    dfr = h2o.H2OFrame([[5.5, "c"], [1, "a"], [2, "b"]],
                       destination_frame="dfrc")  # reversed row content
    dfr[1] = df[1].asfactor()  # this somehow switches the factor column's row content to alphabetical order
    dfr[0, 1] = 'c'
    dfr[1, 1] = 'a'
    dfr[2, 1] = 'b'
    df1 = df[2, :]
    df2 = df[:2, :]
    summary = df1.summary(return_data=True)
    df3 = df1.rbind(df2)  # fixed
    df3r = df2.rbind(df1)
    compareFramesLocal(dfr, df3)  # should contain 5.5, 1, 2
    compareFramesLocal(df, df3r)  # should contain 1,2,5.5

    # copying test from Michalk
    df1 = h2o.H2OFrame([[1, "a"], [2, "b"]])
    df1[1] = df1[1].asfactor()

    df2 = h2o.H2OFrame([[2.2, "b"], [1.1, "a"]])
    df2[1] = df2[1].asfactor()

    print(df1.summary())
    print(df2.summary())

    df3 = df1.rbind(df2)
    assert df3.nrow==(df1.nrow+df2.nrow), "Expected rbind rows: {0}, actual rows: " \
                                          "{1}".format(df1.nrow+df2.nrow, df3.nrow)
Example 33
def h2o_test_setup(sys_args):
    h2o_py_dir = os.path.realpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), ".."))
    h2o_docs_dir = os.path.realpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..",
                     "h2o-docs"))

    parse_args(sys_args)

    sys.path.insert(1, h2o_py_dir)
    import h2o
    from tests import pyunit_utils, pydemo_utils, pybooklet_utils

    set_pyunit_pkg_attrs(pyunit_utils)
    set_pybooklet_pkg_attrs(pybooklet_utils)

    if not (_IS_PYUNIT_ or _IS_IPYNB_ or _IS_PYBOOKLET_ or _IS_PYDEMO_):
        raise EnvironmentError(
            "Unrecognized test type. Must be of type ipynb, pydemo, pyunit, or pybooklet, but got: "
            "{0}".format(_TEST_NAME_))

    print("[{0}] {1}\n".format(
        strftime("%Y-%m-%d %H:%M:%S", gmtime()),
        "Connect to h2o on IP: {0} PORT: {1}".format(_H2O_IP_, _H2O_PORT_)))
    h2o.init(ip=_H2O_IP_,
             port=_H2O_PORT_,
             strict_version_check=False,
             force_connect=_FORCE_CONNECT_)
    h2o.utils.config.H2OConfigReader.get_config(
    )["general.allow_breaking_changes"] = True

    #rest_log = os.path.join(_RESULTS_DIR_, "rest.log")
    #h2o.start_logging(rest_log)
    #print "[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Started rest logging in: {0}".format(rest_log))

    h2o.log_and_echo(
        "------------------------------------------------------------")
    h2o.log_and_echo("")
    h2o.log_and_echo("STARTING TEST: " + _TEST_NAME_)
    h2o.log_and_echo("")
    h2o.log_and_echo(
        "------------------------------------------------------------")

    h2o.remove_all()

    if _IS_IPYNB_: pydemo_utils.ipy_notebook_exec(_TEST_NAME_)
    elif _IS_PYUNIT_: pyunit_utils.pyunit_exec(_TEST_NAME_)
    elif _IS_PYBOOKLET_: pybooklet_utils.pybooklet_exec(_TEST_NAME_)
    elif _IS_PYDEMO_: pydemo_utils.pydemo_exec(_TEST_NAME_)
def javamunge_assembly():
    h2o.remove_all()
    train = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3a.csv")
    test = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3b.csv")

    # lending-club munging assembly
    print("Import and Parse data")
    # Add "earliest_cr_line" and "issue_d" and cast as strings to aide Cliff's PR on 7/13
    types = {"int_rate": "string", "revol_util": "string", "emp_length": "string", "earliest_cr_line": "string",
             "issue_d": "string", "last_credit_pull_d": "factor"}

    data = h2o.import_file(path=train, col_types=types)
    test = h2o.import_file(path=test,  col_types=data.types) ## use the same data types as the training set for the test set
    test = test[[1,5,19,23,45,66,99,590,8903,9999,10001,23892,23893,50123],:]
    test = h2o.assign(test,"test")

    assembly = H2OAssembly(
      steps=[
        # munge int_rate column in place
        # strip %, trim ws, convert to double
        ("intrate_rm_junk_char", H2OColOp(op=H2OFrame.gsub,      col="int_rate", inplace=True, pattern="%", replacement="")),  # strip %
        ("intrate_trim_ws",      H2OColOp(op=H2OFrame.trim,      col="int_rate", inplace=True)),                               # trim ws
        ("intrate_as_numeric",   H2OColOp(op=H2OFrame.asnumeric, col="int_rate", inplace=True)),                               # string -> double

        # munge the revol_util in the same way as the int_rate column
        ("revol_rm_junk_char", H2OColOp(op=H2OFrame.gsub,      col="revol_util", inplace=True, pattern="%", replacement="")),  # strip %
        ("revol_trim_ws",      H2OColOp(op=H2OFrame.trim,      col="revol_util", inplace=True)),                               # trim ws
        ("revol_as_numeric",   H2OColOp(op=H2OFrame.asnumeric, col="revol_util", inplace=True)),                               # string -> double

        # munge earliest_cr_line column (mm-YYYY format)
        # split into Month and Year columns
        ("earliest_cr_line_split", H2OColOp(H2OFrame.strsplit, col="earliest_cr_line", inplace=False, new_col_name=["earliest_cr_line_Month","earliest_cr_line_Year"], pattern="-")),  # split on '-'
        ("earliest_cr_line_Year_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="earliest_cr_line_Year", inplace=True)),                                                              # string -> double

        # munge issue_d column in same way as earliest_cr_line column
        ("issue_date_split", H2OColOp(op=H2OFrame.strsplit, col="issue_d", inplace=False, new_col_name=["issue_d_Month", "issue_d_Year"], pattern="-")),                               # split on '-'
        ("issue_d_Year_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="issue_d_Year", inplace=True)),                                                                                # string -> double

        # do some munging of the emp_length column
        ("emp_length_rm_years",  H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="([ ]*+[a-zA-Z].*)|(n/a)", replacement="")),  # remove " year" and " years", also translate n/a to ""
        ("emp_length_trim",      H2OColOp(op=H2OFrame.trim, col="emp_length", inplace=True)),                                                     # trim all the WS off
        ("emp_length_lt1_point5",H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="< 1",    replacement="0.5")),                # translate < 1 => 0.5
        ("emp_length_10plus",    H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="10\\+",    replacement="10")),               # translate 10+ to 10
        ("emp_length_as_numeric",H2OColOp(op=H2OFrame.asnumeric, col="emp_length", inplace=True)),                                                # string -> double

        # compute credit length
        ("credit_length", H2OBinaryOp(op=H2OAssembly.minus, col="issue_d_Year",inplace=False, new_col_name="longest_credit_length",right=H2OCol("earliest_cr_line_Year")))

      ])

    res = assembly.fit(data)
    pyunit_utils.javamunge(assembly, "AssemblyMungingDemoPojo", test)
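# pyunit_utils.javamunge compiles the fitted assembly to a POJO and replays it on
# the held-out rows. A sketch of exporting the same assembly by hand (the output
# path is an assumption) would use H2OAssembly.to_pojo:
#
#   assembly.to_pojo(pojo_name="AssemblyMungingDemoPojo", path="/tmp", get_jar=True)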
Example 35
def __ml_train(X, extra_crispr_df, y, train_index, test_index):

    logger.debug("Creating h2o working environment")
    # ### Start H2O
    # Start up a 1-node H2O cloud on your local machine, and allow it to use all CPU cores and up to 2GB of memory:
    h2o.init(max_mem_size="2G")
    h2o.remove_all()
    logger.debug("Created h2o working environment successfully")

    from h2o.estimators import H2ORandomForestEstimator

    rf_crispr = H2ORandomForestEstimator(model_id="rf_crispr",
                                         categorical_encoding="enum",
                                         nfolds=5,
                                         ntrees=30,
                                         stopping_rounds=30,
                                         score_each_iteration=True,
                                         seed=10)

    seq_data = X.iloc[:, :config.seq_len]
    seq_data.columns = ['pos_' + str(i) for i in range(len(seq_data.columns))]
    pre_h2o_df = pd.concat([seq_data, extra_crispr_df, y], axis=1)
    h2o_crispr_df_train = h2o.H2OFrame(pre_h2o_df.loc[train_index, :])
    h2o_crispr_df_test = h2o.H2OFrame(pre_h2o_df.loc[test_index, :])

    logger.debug("Training machine learning model")
    rf_crispr.train(x=h2o_crispr_df_train.col_names[:-1],
                    y=h2o_crispr_df_train.col_names[-1],
                    training_frame=h2o_crispr_df_train)
    logger.debug("Trained successfully. Output feature importance")
    feature_importance = rf_crispr._model_json['output'][
        'variable_importances'].as_data_frame()[['variable', 'percentage']]
    feature_importance.to_csv(config.feature_importance_path, index=False)

    logger.debug("Predicting training data")
    test_prediction_train = rf_crispr.predict(h2o_crispr_df_train[:-1])
    performance = spearmanr(test_prediction_train.as_data_frame()['predict'],
                            h2o_crispr_df_train.as_data_frame()['log2fc'])[0]
    logger.debug(
        "spearman correlation coefficient for training dataset is: %f" %
        performance)

    logger.debug("Predicting test data")
    test_prediction = rf_crispr.predict(h2o_crispr_df_test[:-1])
    performance = spearmanr(test_prediction.as_data_frame()['predict'],
                            h2o_crispr_df_test.as_data_frame()['log2fc'])[0]
    logger.debug("spearman correlation coefficient for test dataset is: %f" %
                 performance)

    logger.debug("Saving model")
    h2o.save_model(rf_crispr, config.ml_model_path)
    logger.debug("Saved model to disk")
Example 36
def init():
    global sr, fr, share_cols

    # <hack> regarding output to make h2o work in IDLE
    class PseudoTTY(object):
        def __init__(self, underlying):
            underlying.encoding = 'cp437'
            self.__underlying = underlying

        def __getattr__(self, name):
            return getattr(self.__underlying, name)

        def isatty(self):
            return True

    import sys
    sys.stdout = PseudoTTY(sys.stdout)
    # </hack>

    h2o.init(nthreads=-1, max_mem_size="58G")
    h2o.remove_all()

    femq12 = pd.read_csv(
        r"H:\Ashwin\dta\features\All_return_features_sample.csv")
    # femq12['fold'] = (femq12['TIN_hash_byte']/32).astype(int)

    fr = h2o.H2OFrame(python_obj=femq12)
    print('setting factors...')
    fr = set_return_factors(fr)
    fr = set_profile_factors(fr)
    fr = set_match_factors(fr)
    fr = set_transaction_factors(fr)
    fr = set_purchasenetwork_factors(fr)
    fr = set_salenetwork_factors(fr)
    fr = set_downstream_factors(fr)

    fr['Missing_SalesDSUnTaxProp'] = fr['Missing_SalesDSUnTaxProp'].asfactor()
    fr['Missing_SalesDSCreditRatio'] = fr[
        'Missing_SalesDSCreditRatio'].asfactor()
    fr['Missing_SalesDSVatRatio'] = fr['Missing_SalesDSVatRatio'].asfactor()
    fr['Missing_MaxSalesProp'] = fr['Missing_MaxSalesProp'].asfactor()
    fr['Missing_MaxPurchaseProp'] = fr['Missing_MaxPurchaseProp'].asfactor()
    fr['Missing_PurchaseDSUnTaxProp'] = fr[
        'Missing_PurchaseDSUnTaxProp'].asfactor()
    fr['Missing_PurchaseDSCreditRatio'] = fr[
        'Missing_PurchaseDSCreditRatio'].asfactor()
    fr['Missing_PurchaseDSVatRatio'] = fr[
        'Missing_PurchaseDSVatRatio'].asfactor()
    return fr  # return after all factor columns are set
def __init__(self,
             model,
             X_test,
             feature_names,
             max_depth,
             model_id='surrogate_mojo'):
    self.model = model
    self.X_test = np.array(X_test)
    self.y_test = np.array(model.predict(X_test))
    self.feature_names = feature_names
    self.max_depth = max_depth
    self.model_id = model_id
    h2o.init(max_mem_size='2G')  # start h2o
    h2o.remove_all()  # remove any existing data structures from h2o memory
Example 38
def javamunge_assembly():
    h2o.remove_all()
    train = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3a.csv")
    test  = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3b.csv")

    # lending-club munging assembly
    print("Import and Parse data")
    types = {"int_rate":"String", "revol_util":"String", "emp_length":"String"}
    data = h2o.import_file(path=train, col_types=types)
    test = h2o.import_file(path=test,  col_types=types)
    test = test[[1,5,19,23,45,66,99,590,8903,9999,10001,23892,23893,50123],:]
    test = h2o.assign(test,"test")

    assembly = H2OAssembly(
      steps=[
        # munge int_rate column in place
        # strip %, trim ws, convert to double
        ("intrate_rm_junk_char", H2OColOp(op=H2OFrame.gsub,      col="int_rate", inplace=True, pattern="%", replacement="")),  # strip %
        ("intrate_trim_ws",      H2OColOp(op=H2OFrame.trim,      col="int_rate", inplace=True)),                               # trim ws
        ("intrate_as_numeric",   H2OColOp(op=H2OFrame.asnumeric, col="int_rate", inplace=True)),                               # string -> double

        # munge the revol_util in the same way as the int_rate column
        ("revol_rm_junk_char", H2OColOp(op=H2OFrame.gsub,      col="revol_util", inplace=True, pattern="%", replacement="")),  # strip %
        ("revol_trim_ws",      H2OColOp(op=H2OFrame.trim,      col="revol_util", inplace=True)),                               # trim ws
        ("revol_as_numeric",   H2OColOp(op=H2OFrame.asnumeric, col="revol_util", inplace=True)),                               # string -> double

        # munge earliest_cr_line column (mm-YYYY format)
        # split into Month and Year columns
        ("earliest_cr_line_split", H2OColOp(H2OFrame.strsplit, col="earliest_cr_line", inplace=False, new_col_name=["earliest_cr_line_Month","earliest_cr_line_Year"], pattern="-")),  # split on '-'
        ("earliest_cr_line_Year_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="earliest_cr_line_Year", inplace=True)),                                                              # string -> double

        # munge issue_d column in same way as earliest_cr_line column
        ("issue_date_split", H2OColOp(op=H2OFrame.strsplit, col="issue_d", inplace=False, new_col_name=["issue_d_Month", "issue_d_Year"], pattern="-")),                               # split on '-'
        ("issue_d_Year_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="issue_d_Year", inplace=True)),                                                                                # string -> double

        # do some munging of the emp_length column
        ("emp_length_rm_years",  H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="([ ]*+[a-zA-Z].*)|(n/a)", replacement="")),  # remove " year" and " years", also translate n/a to ""
        ("emp_length_trim",      H2OColOp(op=H2OFrame.trim, col="emp_length", inplace=True)),                                                     # trim all the WS off
        ("emp_length_lt1_point5",H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="< 1",    replacement="0.5")),                # translate < 1 => 0.5
        ("emp_length_10plus",    H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="10\\+",    replacement="10")),               # translate 10+ to 10
        ("emp_length_as_numeric",H2OColOp(op=H2OFrame.asnumeric, col="emp_length", inplace=True)),                                                # string -> double

        # compute credit length
        ("credit_length", H2OBinaryOp(op=H2OAssembly.minus, col="issue_d_Year",inplace=False, new_col_name="longest_credit_length",right=H2OCol("earliest_cr_line_Year")))

      ])

    res = assembly.fit(data)
    pyunit_utils.javamunge(assembly, "AssemblyMungingDemoPojo", test)
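
# A fitted H2OAssembly can also be exported for reuse outside this test; a
# minimal sketch using the real H2OAssembly.to_pojo API (the POJO name and
# output path below are illustrative placeholders).
def export_assembly_sketch(assembly, test):
    # write the generated Java munging class, plus the h2o-genmodel jar
    assembly.to_pojo(pojo_name="LendingClubMungingDemo", path="/tmp/munging_pojo", get_jar=True)
    # H2OAssembly has no separate transform(); fit() re-applies the steps to a frame
    munged_test = assembly.fit(test)
    print(munged_test.head(5))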
    def setup_and_train(param_enabled=None):
        h2o.remove_all()
        target, train, _, _ = prepare_data()
        state = 'enabled' if param_enabled is True else 'disabled' if param_enabled is False else 'default'
        if param_enabled is None:
            aml = H2OAutoML(project_name='keep_cross_validation_predictions_'+state,
                            nfolds=nfolds, max_models=3, seed=1)
        else:
            aml = H2OAutoML(project_name='keep_cross_validation_predictions_'+state,
                            nfolds=nfolds, max_models=8, seed=1,
                            keep_cross_validation_predictions=param_enabled)

        aml.train(y=target, training_frame=train)
        # print(aml.leaderboard)
        return aml
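
# A short sketch of how the helper above is typically exercised: one AutoML run
# per setting of keep_cross_validation_predictions (prepare_data() and nfolds
# are assumed to come from the enclosing test module).
def compare_cv_predictions_sketch():
    aml_default = setup_and_train()          # parameter left at its default
    aml_enabled = setup_and_train(True)      # CV predictions kept on the models
    aml_disabled = setup_and_train(False)    # CV predictions discarded
    for aml in (aml_default, aml_enabled, aml_disabled):
        print(aml.project_name, aml.leader.model_id)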
def run_test(sys_args, test_to_run):
    global _IPYNB_
    parse_args(sys_args)
    h2o.init(ip=_H2O_IP_, port=_H2O_PORT_, strict_version_check=False)
    h2o.log_and_echo("------------------------------------------------------------")
    h2o.log_and_echo("")
    h2o.log_and_echo("STARTING TEST: "+str(h2o.ou()))
    h2o.log_and_echo("")
    h2o.log_and_echo("------------------------------------------------------------")
    # num_keys = h2o.store_size()
    try:
        if _IPYNB_: utils.ipy_notebook_exec(_IPYNB_, save_and_norun=False)
        else: test_to_run()
    finally:
        h2o.remove_all()
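
# A minimal driver sketch for run_test; my_test is a hypothetical placeholder
# for the calling module's own test function.
import sys

def my_test():
    pass  # the module's actual assertions would live here

if __name__ == "__main__":
    run_test(sys.argv, my_test)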
Esempio n. 41
def h2o_test_setup(sys_args):
    h2o_py_dir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),".."))
    h2o_docs_dir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..","h2o-docs"))

    parse_args(sys_args)

    sys.path.insert(1, h2o_py_dir)
    import h2o
    from tests import pyunit_utils, pydemo_utils, pybooklet_utils

    for pkg in (pyunit_utils, pybooklet_utils):
        setattr(pkg, '__on_hadoop__', _ON_HADOOP_)
        setattr(pkg, '__hadoop_namenode__', _HADOOP_NAMENODE_)
        setattr(pkg, '__test_name__', _TEST_NAME_)
        setattr(pkg, '__results_dir__', _RESULTS_DIR_)

    if not (_IS_PYUNIT_ or _IS_IPYNB_ or _IS_PYBOOKLET_ or _IS_PYDEMO_):
        raise EnvironmentError("Unrecognized test type. Must be of type ipynb, pydemo, pyunit, or pybooklet, "
                               "but got: {0}".format(_TEST_NAME_))

    print("[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Connect to h2o on IP: {0} PORT: {1}".format(_H2O_IP_, _H2O_PORT_)))
    auth = None
    if _LDAP_USER_NAME_ is not None and _LDAP_PASSWORD_ is not None:
        auth = (_LDAP_USER_NAME_, _LDAP_PASSWORD_)
    elif _KERB_PRINCIPAL_ is not None:
        from h2o.auth import SpnegoAuth
        auth = SpnegoAuth(service_principal=_KERB_PRINCIPAL_)
    h2o.connect(ip=_H2O_IP_, port=_H2O_PORT_, verbose=False, auth=auth, **_H2O_EXTRA_CONNECT_ARGS_)
    h2o.utils.config.H2OConfigReader.get_config()["general.allow_breaking_changes"] = True

    #rest_log = os.path.join(_RESULTS_DIR_, "rest.log")
    #h2o.start_logging(rest_log)
    #print "[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Started rest logging in: {0}".format(rest_log))

    h2o.log_and_echo("------------------------------------------------------------")
    h2o.log_and_echo("")
    h2o.log_and_echo("STARTING TEST: " + _TEST_NAME_)
    h2o.log_and_echo("")
    h2o.log_and_echo("------------------------------------------------------------")

    h2o.remove_all()

    if _IS_IPYNB_:       pydemo_utils.ipy_notebook_exec(_TEST_NAME_)
    elif _IS_PYUNIT_:    pyunit_utils.pyunit_exec(_TEST_NAME_)
    elif _IS_PYBOOKLET_: pybooklet_utils.pybooklet_exec(_TEST_NAME_)
    elif _IS_PYDEMO_:    pydemo_utils.pydemo_exec(_TEST_NAME_)
Esempio n. 42
def sort():
    df = h2o.create_frame(rows=10,
                          cols=3,
                          factors=10,
                          categorical_fraction=1.0/3,
                          time_fraction=1.0/3,
                          real_fraction=1.0/3,
                          real_range=100,
                          missing_fraction=0.0,
                          seed=123)
    df1 = df.sort("C1")
    assert df1[0,0] == 433225652950 # 1983-09-24 04:27:32
    assert df1[9,0] == 1532907020199 # 2018-07-29 23:30:20
    df2 = df.sort("C2")
    assert df2[0,1] == "c1.l1"
    assert df2[9,1] == "c1.l9"
    h2o.remove_all()
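
# For reference, H2OFrame.sort also accepts a list of columns; a small sketch
# (assuming a frame created as above -- the multi-column form is standard
# h2o-py, though worth verifying on older releases):
def sort_multi_sketch(df):
    # sort by C2 first, breaking ties with C1
    return df.sort(["C2", "C1"])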
    def start_h2o(self, thread_count=-1, gb_ram_count=26):
        """Initializes a connection to H2O instance and clear it out, if needed.

        :param thread_count: the number of threads that H2O may use or -1 if all available  (Default value = -1)
        :param gb_ram_count: the number of gigabytes of RAM that H2O may use (Default value = 26)
        """
        h2o.init(nthreads=thread_count, max_mem_size=gb_ram_count)
        # clear out cluster
        h2o.remove_all()
Esempio n. 44
def run_test(sys_args, test_to_run):
    # import pkg_resources
    # ver = pkg_resources.get_distribution("h2o").version
    # print "H2O PYTHON PACKAGE VERSION: " + str(ver)
    ip, port = sys_args[2].split(":")
    h2o.init(ip,port,strict_version_check=False)
    h2o.log_and_echo("------------------------------------------------------------")
    h2o.log_and_echo("")
    h2o.log_and_echo("STARTING TEST: "+str(h2o.ou()))
    h2o.log_and_echo("")
    h2o.log_and_echo("------------------------------------------------------------")
    num_keys = h2o.store_size()
    try:
        if len(sys_args) > 3 and sys_args[3] == "--ipynb": utils.ipy_notebook_exec(sys_args[4],save_and_norun=False)
        else: test_to_run(ip, port)
    finally:
        h2o.remove_all()
        if h2o.keys_leaked(num_keys): print("Leaked Keys!")
def deepwater_lenet():
  if not H2ODeepWaterEstimator.available(): return

  frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  print(frame.head(5))
  model = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-3, network='lenet', score_interval=0, train_samples_per_iteration=1000)
  model.train(x=[0],y=1, training_frame=frame)

  extracted = model.deepfeatures(frame, "pooling1_output")
  #print(extracted.describe())
  print(extracted.ncols)
  assert extracted.ncols == 800, "extracted frame doesn't have 800 columns"

  extracted = model.deepfeatures(frame, "activation2_output")
  #print(extracted.describe())
  print(extracted.ncols)
  assert extracted.ncols == 500, "extracted frame doesn't have 500 columns"

  h2o.remove_all()
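
# The deepfeatures frames above are ordinary H2OFrames, so a common follow-up
# is transfer learning: train a simpler model on the extracted features. A
# minimal sketch, assuming a frame/model pair shaped like deepwater_lenet()'s
# (column 0 = image path, column 1 = class label):
def train_on_deepfeatures_sketch(model, frame):
    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    features = model.deepfeatures(frame, "pooling1_output")
    features = features.cbind(frame[1].asfactor())   # append the class label
    gbm = H2OGradientBoostingEstimator(ntrees=10)
    gbm.train(x=list(range(features.ncols - 1)), y=features.ncols - 1, training_frame=features)
    return gbm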
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200    # number of test dataset rows
    df = pyunit_utils.random_dataset("regression", seed=1234)       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types)-1)]

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows==train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True) # save mojo predict

    h2o.save_model(glrmModel, TMPDIR)   # save GLRM model
    glrmModel2 = h2o.load_model(os.path.join(TMPDIR,MOJONAME))
    predict_model = glrmModel2.predict(test)
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
            predict_model[col] = predict_model[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    print("Comparing mojo predict and h2o predict from saved model...")
    pyunit_utils.compare_frames_local(pred_mojo, predict_model, 1, tol=1e-10)
    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False) # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_"+frameID)   # store the x Factor for new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
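
# save_GLRM_mojo comes from the surrounding test module; a hedged sketch of the
# pattern it presumably follows, built on the real download_mojo API (the
# results directory mirrors TMPDIR above):
def save_GLRM_mojo_sketch(model):
    mojo_dir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results"))
    model.download_mojo(path=mojo_dir)   # writes <model_id>.zip for later mojo scoring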
Esempio n. 47
def h2o_test_setup(sys_args):
    h2o_py_dir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),".."))
    h2o_docs_dir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..","h2o-docs"))

    parse_args(sys_args)

    sys.path.insert(1, h2o_py_dir)
    import h2o
    from tests import pyunit_utils, pydemo_utils, pybooklet_utils

    set_pyunit_pkg_attrs(pyunit_utils)
    set_pybooklet_pkg_attrs(pybooklet_utils)

    if _IS_PYUNIT_ or _IS_IPYNB_ or _IS_PYBOOKLET_:
        pass
    elif _IS_PYDEMO_:
        raise NotImplementedError("pydemos are not supported at this time")
    else:
        raise EnvironmentError("Unrecognized test type. Must be of type ipynb, pydemo, pyunit, or pybooklet, but got: "
                               "{0}".format(_TEST_NAME_))

    print("[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()),
                               "Connect to h2o on IP: {0} PORT: {1}".format(_H2O_IP_, _H2O_PORT_)))
    h2o.init(ip=_H2O_IP_, port=_H2O_PORT_, strict_version_check=False)

    #rest_log = os.path.join(_RESULTS_DIR_, "rest.log")
    #h2o.start_logging(rest_log)
    #print "[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Started rest logging in: {0}".format(rest_log))

    h2o.log_and_echo("------------------------------------------------------------")
    h2o.log_and_echo("")
    h2o.log_and_echo("STARTING TEST: " + _TEST_NAME_)
    h2o.log_and_echo("")
    h2o.log_and_echo("------------------------------------------------------------")

    h2o.remove_all()

    if _IS_IPYNB_:       pydemo_utils.ipy_notebook_exec(_TEST_NAME_)
    elif _IS_PYUNIT_:    pyunit_utils.pyunit_exec(_TEST_NAME_)
    elif _IS_PYBOOKLET_: pybooklet_utils.pybooklet_exec(_TEST_NAME_)
def glm_binomial_mojo_pojo():
    h2o.remove_all()
    NTESTROWS = 200    # number of test dataset rows
    PROBLEM="binomial"
    params = set_params()   # set GLM model parameters
    df = pyunit_utils.random_dataset(PROBLEM)       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    glmBinomialModel = pyunit_utils.build_save_model_GLM(params, x, train, "response") # build and save mojo model

    MOJONAME = pyunit_utils.getMojoName(glmBinomialModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glmBinomialModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmBinomialModel, TMPDIR, MOJONAME)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
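
# set_params() is defined in the surrounding test module; a hypothetical sketch
# of what it might return here (the keys are real H2O GLM constructor
# arguments, the values purely illustrative):
def set_params_sketch():
    return {"family": "binomial", "alpha": 0.5, "missing_values_handling": "MeanImputation"}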
def pca_wideDataset_rotterdam():
    h2o.remove_all()
    print("Importing Rotterdam.csv data...")
    rotterdamH2O = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    y = set(["relapse"])
    x = list(set(rotterdamH2O.names)-y)

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types)-1)]
    print("transform used on dataset is {0}.\n".format(transformN))
    buildModel = [False, False, False]      # randomly enable exactly one of the three experiments below
    buildModel[randint(0, len(buildModel)-1)] = True

    expNum = 0
    if buildModel[expNum]:
        # special test with GLRM; use_all_factor_levels must be True
        print("------  Testing GLRM PCA --------")
        gramSVD = H2OPCA(k=8, impute_missing=True, transform=transformN, seed=12345, use_all_factor_levels=True)
        gramSVD.train(x=x, training_frame=rotterdamH2O)

        glrmPCA = H2OGeneralizedLowRankEstimator(k=8, transform=transformN, seed=12345, init="Random",
                                                 max_iterations=10, recover_svd=True, regularization_x="None",
                                                 regularization_y="None")
        glrmPCA.train(x=x, training_frame=rotterdamH2O)

        # compare eigenvalues (importance table) with GramSVD
        print("@@@@@@  Comparing eigenvalues between GramSVD and GLRM...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                                 glrmPCA._model_json["output"]["importance"],
                                                 ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
                                                 tolerance=1, check_all=False)

        # compare eigenvectors (singular vectors)
        print("@@@@@@  Comparing eigenvectors between GramSVD and GLRM...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                                 glrmPCA._model_json["output"]["eigenvectors"],
                                                 glrmPCA._model_json["output"]["names"], tolerance=1e-6,
                                                 check_sign=True, check_all=False)
        h2o.remove(gramSVD)
        h2o.remove(glrmPCA)

    expNum += 1
    if buildModel[expNum]:
        print("------  Testing Power PCA --------")
        gramSVD = H2OPCA(k=8, impute_missing=True, transform=transformN, seed=12345)
        gramSVD.train(x=x, training_frame=rotterdamH2O)
        powerPCA = H2OPCA(k=8, impute_missing=True, transform=transformN, pca_method="Power", seed=12345)
        powerPCA.train(x=x, training_frame=rotterdamH2O)
        # compare singular values and stuff with GramSVD
        print("@@@@@@  Comparing eigenvalues between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                             powerPCA._model_json["output"]["importance"],
                                             ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
                                             tolerance=1e-6, check_all=False)
        print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
        # compare singular vectors

        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                             powerPCA._model_json["output"]["eigenvectors"],
                                             powerPCA._model_json["output"]["names"], tolerance=1e-6, check_sign=True,
                                             check_all=False)

    expNum += 1
    if buildModel[expNum]:
        print("------  Testing Randomized PCA --------")
        gramSVD = H2OPCA(k=8, impute_missing=True, transform=transformN, seed=12345)
        gramSVD.train(x=x, training_frame=rotterdamH2O)
        randomizedPCA = H2OPCA(k=8, impute_missing=True, transform=transformN, pca_method="Randomized", seed=12345,
                               max_iterations=5)
        randomizedPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print("@@@@@@  Comparing eigenvalues between GramSVD and Randomized...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                                 randomizedPCA._model_json["output"]["importance"],
                                                 ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
                                                 tolerance=1e-1, check_all=False)

        print("@@@@@@  Comparing eigenvectors between GramSVD and Power...\n")
        # compare singular vectors
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                                 randomizedPCA._model_json["output"]["eigenvectors"],
                                                 randomizedPCA._model_json["output"]["names"], tolerance=1e-6,
                                                 check_sign=True, check_all=False)
    h2o.remove_all()
def setup_dataset():
    h2o.remove_all()
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    return train
Esempio n. 51
# coding: utf-8

# In[ ]:

import h2o
import numpy as np
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator
import os


# In[ ]:

h2o.init()
h2o.remove_all() # Clean slate - just in case the cluster was already running


# In[ ]:

from h2o.h2o import _locate # private function. used to find files within h2o git project directory.

# Import walking gait data
gait = h2o.import_file(path=os.path.realpath("../data/subject01_walk1.csv"))
gait.describe()


# In[ ]:

# Plot a single row of data (row index 1) on x- vs. y-coordinate features
gait_row = gait[1,:].drop("Time")
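
# A plotting sketch for the row extracted above, assuming the demo's layout of
# (x, y) coordinate pairs per body marker -- the column pairing is an
# assumption, not taken from the original notebook.

# In[ ]:

n_markers = gait_row.ncols // 2
x_coords = [gait_row[0, 2*i] for i in range(n_markers)]      # even columns: x
y_coords = [gait_row[0, 2*i + 1] for i in range(n_markers)]  # odd columns: y
plt.scatter(x_coords, y_coords)
plt.xlabel("x-coordinate")
plt.ylabel("y-coordinate")
plt.show()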
def s3timings(ip, port):
  t = time.time()
  # connect to cluster
  h2o.init(ip, port)

  # defining timers
  air_run = timeit.Timer(stmt = 'h2o.import_frame("s3n://h2o-airlines-unpacked/allyears.1987.2013.csv")',
    setup = 'import h2o')
  bigx_run = timeit.Timer(stmt = 'h2o.import_frame("s3://h2o-public-test-data/bigdata/server/flow-tests/BigCross.data")',
    setup = 'import h2o')
  higg_run = timeit.Timer(stmt = 'h2o.import_frame("s3://h2o-public-test-data/bigdata/server/HIGGS.csv")',
    setup = 'import h2o')
  citi_run = timeit.Timer(stmt = 'h2o.import_frame(path = big_citi)',
    setup = 'import h2o;\
             big_citi = ["s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-07.csv",\
                          "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-08.csv",\
                          "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-09.csv",\
                          "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-10.csv",\
                          "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-11.csv",\
                          "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-12.csv",\
                          "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-01.csv",\
                          "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-02.csv",\
                          "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-03.csv",\
                          "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-04.csv",\
                          "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-05.csv",\
                          "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-06.csv",\
                          "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-07.csv",\
                          "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-08.csv"]')
  mils_run = timeit.Timer(stmt = 'h2o.import_frame(path = mill_songs)',
    setup = 'import h2o;\
             mill_songs = ["s3://h2o-public-test-data/bigdata/server/milsongs/milsongs-test.csv",\
                           "s3://h2o-public-test-data/bigdata/server/milsongs/milsongs-train.csv"]')
  cup_run = timeit.Timer(stmt = 'h2o.import_frame(path = cup98)',
    setup = 'import h2o;\
             cup98 = ["s3://h2o-public-test-data/bigdata/laptop/usecases/cup98LRN_z.csv",\
                      "s3://h2o-public-test-data/bigdata/laptop/usecases/cup98VAL_z.csv"]')
  mnist_run = timeit.Timer(stmt  = 'h2o.import_frame(path = mnist)',
    setup = 'import h2o;\
             mnist = ["s3://h2o-public-test-data/bigdata/laptop/mnist/test.csv.gz",\
                      "s3://h2o-public-test-data/bigdata/laptop/mnist/train.csv.gz"]')
  arc_run = timeit.Timer(stmt = 'h2o.import_frame(path = arcene)',
    setup = 'import h2o;\
             arcene = ["s3://h2o-public-test-data/smalldata/arcene/arcene_test.data",\
                       "s3://h2o-public-test-data/smalldata/arcene/arcene_train.data",\
                       "s3://h2o-public-test-data/smalldata/arcene/arcene_valid.data"]')

  # Running with timers
  air_first   =   air_run.timeit(number=1)
  bigx_first  =  bigx_run.timeit(number=1)
  higg_first  =  higg_run.timeit(number=1)
  citi_first  =  citi_run.timeit(number=1)
  mils_first  =  mils_run.timeit(number=1)
  cup_first   =   cup_run.timeit(number=1)
  mnist_first = mnist_run.timeit(number=1)
  arc_first   =   arc_run.timeit(number=1)

  # Clear kvstore and run again
  s = time.time()
  h2o.remove_all()
  print "Elapsed Time for RemoveAll: " + str(time.time() - s) + " (s)."
  air_second   =   air_run.timeit(number=1)
  bigx_second  =  bigx_run.timeit(number=1)
  higg_second  =  higg_run.timeit(number=1)
  citi_second  =  citi_run.timeit(number=1)
  mils_second  =  mils_run.timeit(number=1)
  cup_second   =   cup_run.timeit(number=1)
  mnist_second = mnist_run.timeit(number=1)
  arc_second   =   arc_run.timeit(number=1)
  print("Airlines: " + str(air_first) + " vs " + str(air_second))
  print("BigCross: " + str(bigx_first) + " vs " + str(bigx_second))
  print("Higgs: " + str(higg_first) + " vs " + str(higg_second))
  print("Citi_bikes: " + str(citi_first) + " vs " + str(citi_second))
  print("Million Songs: " + str(mils_first) + " vs " + str(mils_second))
  print("KDD Cup98: " + str(cup_first) + " vs " + str(cup_second))
  print("Mnist: " + str(mnist_first) + " vs " + str(mnist_second))
  print("Arcene: " + str(arc_first) + " vs " + str(arc_second))
  s = time.time()
  h2o.remove_all()
  print "Elapsed Time for RemoveAll: " + str(time.time() - s) + " (s)."
  print "Exiting scope... Test elapsed time: " + str(time.time() - t) + " (s)."
# 
# Load the H2O Python module.

# In[ ]:

import h2o
import os


# ### Start H2O
# Start up a 1-node H2O cloud on your local machine, and allow it to use all CPU cores and up to 2GB of memory:

# In[ ]:

h2o.init(max_mem_size_GB=2)  # uses all cores by default
h2o.remove_all()                          #clean slate, in case cluster was already running


# To learn more about the h2o package itself, we can use Python's builtin help() function.

# In[ ]:

help(h2o)


# help() can be used on H2O functions and models. Jupyter's builtin shift-tab functionality also works

# In[ ]:

from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
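
# As a quick smoke test of the estimator just imported (synthetic data via
# h2o.create_frame; illustrative only, not part of the original demo flow):

# In[ ]:

df = h2o.create_frame(rows=100, cols=4, categorical_fraction=0.0, missing_fraction=0.0, seed=42)
gbm = H2OGradientBoostingEstimator(ntrees=5)
gbm.train(x=list(range(3)), y=3, training_frame=df)
print(gbm.model_performance(df))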