Example #1
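This snippet assumes the usual h2o-3 pyunit test preamble. A minimal sketch of the imports it relies on (the H2OSegmentModels import path is an assumption and may differ across h2o-3 versions):

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.model.segment_models import H2OSegmentModels  # assumption: exact module path varies by h2o-3 version
from tests import pyunit_utils  # test-harness helper module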
def test_gbm_bulk_cv():
    response = "survived"
    titanic = h2o.import_file(path=pyunit_utils.locate("smalldata/gbm_test/titanic.csv"))
    titanic[response] = titanic[response].asfactor()
    predictors = ["survived", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin"]
    train, valid = titanic.split_frame(ratios=[.8], seed=1234)
    titanic_gbm = H2OGradientBoostingEstimator(seed=1234, nfolds=2, build_tree_one_node=True)
    titanic_gbm.train_segments(segments=["pclass"],
                               x=predictors,
                               y=response,
                               training_frame=train,
                               validation_frame=valid,
                               segment_models_id="titanic_by_pclass")

    train_cl1 = train[train["pclass"] == 1]
    valid_cl1 = valid[valid["pclass"] == 1]
    titanic_cl1_gbm = H2OGradientBoostingEstimator(seed=1234, nfolds=2)
    titanic_cl1_gbm.train(x=predictors,
                          y=response,
                          training_frame=train_cl1,
                          validation_frame=valid_cl1)

    titanic_models = H2OSegmentModels(segment_models_id="titanic_by_pclass")
    bulk_models = titanic_models.as_frame()
    titanic_bulk_cl1_gbm_id = (bulk_models[bulk_models["pclass"] == 1]["model"])
    titanic_bulk_cl1_gbm = h2o.get_model(titanic_bulk_cl1_gbm_id.flatten())

    pyunit_utils.check_models(titanic_cl1_gbm, titanic_bulk_cl1_gbm, use_cross_validation=True)
Example #2
def demo_xval_with_validation_frame():
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate[1] = prostate[1].asfactor()
    print(prostate.summary())

    # invert the response
    prostate_inverse = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    resp = (1 - prostate_inverse[1])
    prostate_inverse[1] = resp.asfactor()
    print(prostate_inverse.summary())

    # 50 is the default, but let's be explicit
    ntrees = 50
    X = list(range(2,9))
    y = 1

    # 1. Train a model with 5-fold xval, no validation frame
    prostate_gbm = H2OGradientBoostingEstimator(nfolds=5,
                                                ntrees=ntrees,
                                                distribution="bernoulli",
                                                seed=1,
                                                score_each_iteration=True,
                                                stopping_rounds=3)
    prostate_gbm.train(x=X, y=y, training_frame=prostate)
    prostate_gbm.show()
    # stopped early
    assert get_ntrees(prostate_gbm) < ntrees

    # 2. Show that training without xval but with the inverted validation frame triggers early stopping sooner
    # The validation frame contradicts the training frame, so training should stop almost right away
    prostate_gbm_noxval = H2OGradientBoostingEstimator(ntrees=ntrees,
                                                       distribution="bernoulli",
                                                       seed=1,
                                                       score_each_iteration=True,
                                                       stopping_rounds=3)
    prostate_gbm_noxval.train(x=X, y=y, training_frame=prostate, validation_frame=prostate_inverse)
    prostate_gbm_noxval.show()
    # stopped almost immediately
    assert get_ntrees(prostate_gbm_noxval) == 6
    # earlier than in case (1)
    assert get_ntrees(prostate_gbm_noxval) < get_ntrees(prostate_gbm)

    # 3. Train a model with 5-fold xval this time with inverted frame as the validation frame
    prostate_gbm_v = H2OGradientBoostingEstimator(nfolds=5,
                                                  ntrees=ntrees,
                                                  distribution="bernoulli",
                                                  seed=1,
                                                  score_each_iteration=True,
                                                  stopping_rounds=3)
    prostate_gbm_v.train(x=X, y=y, training_frame=prostate, validation_frame=prostate_inverse)
    prostate_gbm_v.show()

    # Models (1) and (3) are the same => the validation frame is not used for early stopping in the xval setting
    # Otherwise we would have stopped earlier, as seen in (2)
    pyunit_utils.check_models(prostate_gbm, prostate_gbm_v)
    # Stopped early with the same number of trees built as in (1)
    assert get_ntrees(prostate_gbm) == get_ntrees(prostate_gbm_v)
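This test leans on a get_ntrees helper defined elsewhere in the test module. A minimal sketch, assuming the realized tree count is exposed in the model summary table:

def get_ntrees(model):
    # Assumption: the GBM model summary reports the actual number of trees
    # built (which can be below ntrees when early stopping kicks in).
    return model._model_json["output"]["model_summary"]["number_of_trees"][0]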
Example #3
def model_comparator(frame1, frame2, col_ind, rows1, numElements):
    assert numElements == 0
    models1 = frame1[col_ind].as_data_frame()
    models2 = frame2[col_ind].as_data_frame()
    for i in range(rows1):
        model_id_1 = str(models1.iloc[i][0])
        model_1 = h2o.get_model(model_id_1)
        model_id_2 = str(models2.iloc[i][0])
        model_2 = h2o.get_model(model_id_2)
        print("###### Comparing model {0} and model {1}.".format(
            model_1.model_id, model_2.model_id))
        pyunit_utils.check_models(model_1, model_2)
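A hypothetical call site for this helper, assuming frame1 and frame2 are the as_frame() listings of two segmented training runs and col_ind points at the model-id column (all names below are illustrative):

frame1 = segment_models_a.as_frame()  # hypothetical H2OSegmentModels handles
frame2 = segment_models_b.as_frame()
model_comparator(frame1, frame2, col_ind=1, rows1=frame1.nrow, numElements=0)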
Example #4
def test_if_train_segments():
    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    # model will be built for each segment
    segment_col = "RACE"
    # segment 0 is too small, will not produce a model
    bad_segment = 0

    segments = prostate[segment_col].unique()
    segments.rename({'C1': segment_col})

    params = {"min_rows": 2, "ntrees": 4, "seed": 42, "sample_size": 10}
    prostate_if = H2OIsolationForestEstimator(**params)
    models = prostate_if.train_segments(ignored_columns=["ID"],
                                        training_frame=prostate,
                                        segments=segments)
    models_list = models.as_frame()
    print(models_list)

    assert models_list.names == [
        u'RACE', u'model', u'status', u'errors', u'warnings'
    ]
    assert models_list.nrow == 3

    # Check failed models
    expected_error = 'ERRR on field: _min_rows: The dataset size is too small to split for min_rows=2.0: ' \
                     'must have at least 4.0 (weighted) rows, but have only 3.0.\n'
    assert (models_list["errors"][models_list[segment_col] == bad_segment]
            ).as_data_frame()["errors"][0] == expected_error

    mp = models_list.as_data_frame()
    # Check built models
    for i in range(mp.shape[0]):
        segment = int(mp.iloc[i][segment_col])
        status = str(mp.iloc[i]["status"])
        if segment != bad_segment:
            assert status == "SUCCEEDED"
            model_id = mp.iloc[i]["model"]
            model = h2o.get_model(model_id)
            prostate_segment = prostate[prostate[segment_col] == segment]
            prostate_if_segment = H2OIsolationForestEstimator(**params)
            prostate_if_segment.train(ignored_columns=["ID", segment_col],
                                      training_frame=prostate_segment)
            pyunit_utils.check_models(model, prostate_if_segment)
            preds_actual = model.predict(prostate_segment)
            preds_expected = prostate_if_segment.predict(prostate_segment)
            assert_frame_equal(preds_actual.as_data_frame(True),
                               preds_expected.as_data_frame(True))
        else:
            assert status == "FAILED"
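A minimal sketch of the imports this test assumes (note that assert_frame_equal moved between pandas releases):

import h2o
from h2o.estimators.isolation_forest import H2OIsolationForestEstimator
from pandas.testing import assert_frame_equal  # pandas >= 1.0; older releases: pandas.util.testing
from tests import pyunit_utils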
Example #5
def test_gbm_bulk_train():
    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    # model will be built for each segment
    segment_col = "RACE"
    # segment 0 is too small, will not produce a model
    bad_segment = 0

    segments = prostate[segment_col].unique()
    segments.rename({'C1': segment_col})

    params = {"min_rows": 2, "ntrees": 4, "seed": 42}
    prostate_gbm = H2OGradientBoostingEstimator(**params)
    models = prostate_gbm.bulk_train(y="CAPSULE",
                                     ignored_columns=["ID"],
                                     training_frame=prostate,
                                     segments=segments)
    models_list = models.as_frame()

    assert models_list.names == [
        u'RACE', u'Status', u'Model', u'Errors', u'Warnings'
    ]
    assert models_list.nrow == 3

    # Check failed models
    expected_error = 'ERRR on field: _min_rows: The dataset size is too small to split for min_rows=2.0: ' \
                     'must have at least 4.0 (weighted) rows, but have only 3.0.\n'
    assert (models_list["Errors"][models_list[segment_col] == bad_segment]
            ).as_data_frame()["Errors"][0] == expected_error

    mp = models_list.as_data_frame()
    # Check built models
    for i in range(mp.shape[0]):
        segment = int(mp.iloc[i][segment_col])
        if segment != bad_segment:
            model_id = mp.iloc[i]["Model"]
            model = h2o.get_model(model_id)
            prostate_segment = prostate[prostate[segment_col] == segment]
            prostate_gbm_segment = H2OGradientBoostingEstimator(**params)
            prostate_gbm_segment.train(y="CAPSULE",
                                       ignored_columns=["ID"],
                                       training_frame=prostate_segment)
            pyunit_utils.check_models(model, prostate_gbm_segment)
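bulk_train here appears to be an earlier name for segmented training; on newer h2o-3 releases the equivalent call would presumably be train_segments, as used in Example #4. A sketch under that assumption:

# Assumption: bulk_train was later renamed to train_segments.
models = prostate_gbm.train_segments(y="CAPSULE",
                                     ignored_columns=["ID"],
                                     training_frame=prostate,
                                     segments=segments)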
Example #6
def cv_carsRF():

  # read in the dataset and construct training set (and validation set)
  cars =  h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

  # choose the type of model-building exercise (regression, binomial, or multinomial classification).
  # 0:regression, 1:binomial, 2:multinomial
  problem = random.sample(list(range(3)),1)[0]
  problem = 2  # override: pin this run to multinomial classification
  # pick the predictors and the correct response column
  predictors = ["displacement","power","weight","acceleration","year"]
  if problem == 1   :
    response_col = "economy_20mpg"
    cars[response_col] = cars[response_col].asfactor()
  elif problem == 2 :
    response_col = "cylinders"
    cars[response_col] = cars[response_col].asfactor()
  else              :
    response_col = "economy"

  print("Response column: {0}".format(response_col))




## cross-validation
  # 1. check that cv metrics are the same over repeated seeded "Modulo" runs
  nfolds = random.randint(3,10)
  rf1 = H2ORandomForestEstimator(nfolds=nfolds, fold_assignment="Modulo", seed=1234)
  rf1.train(x=predictors, y=response_col, training_frame=cars)
  rf2 = H2ORandomForestEstimator(nfolds=nfolds, fold_assignment="Modulo", seed=1234)
  rf2.train(x=predictors, y=response_col, training_frame=cars)
  pyunit_utils.check_models(rf1, rf2, True)

  # 2. check that cv metrics are different over repeated "Random" runs
  nfolds = random.randint(3,10)
  rf1 = H2ORandomForestEstimator(nfolds=nfolds, fold_assignment="Random")
  rf1.train(x=predictors, y=response_col, training_frame=cars)
  rf2 = H2ORandomForestEstimator(nfolds=nfolds, fold_assignment="Random")
  rf2.train(x=predictors, y=response_col, training_frame=cars)
  # check_models raises AssertionError when the models differ
  try:
    pyunit_utils.check_models(rf1, rf2, True)
    models_differ = False
  except AssertionError:
    models_differ = True
  assert models_differ, "Expected models to be different over repeated Random runs"

  # 3. folds_column
  num_folds = random.randint(2,5)
  fold_assignments = h2o.H2OFrame([[random.randint(0,num_folds-1)] for f in range(cars.nrow)])
  fold_assignments.set_names(["fold_assignments"])
  cars = cars.cbind(fold_assignments)
  rf = H2ORandomForestEstimator(keep_cross_validation_predictions=True)
  rf.train(y=response_col, x=predictors, training_frame=cars, fold_column="fold_assignments")

  num_cv_models = len(rf._model_json['output']['cross_validation_models'])
  assert num_cv_models==num_folds, "Expected {0} cross-validation models, but got " \
                                   "{1}".format(num_folds, num_cv_models)
  cv_model1 = h2o.get_model(rf._model_json['output']['cross_validation_models'][0]['name'])
  cv_model2 = h2o.get_model(rf._model_json['output']['cross_validation_models'][1]['name'])

  # 4. keep_cross_validation_predictions
  cv_predictions = rf1._model_json['output']['cross_validation_predictions']
  assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions)

  cv_predictions = rf._model_json['output']['cross_validation_predictions']
  assert len(cv_predictions)==num_folds, "Expected the same number of cross-validation predictions " \
                                         "as folds, but got {0}".format(len(cv_predictions))


  ## boundary cases
  # 1. nfolds = number of observations (leave-one-out cross-validation)
  rf = H2ORandomForestEstimator(nfolds=cars.nrow, fold_assignment="Modulo")
  rf.train(y=response_col, x=predictors, training_frame=cars)

  # 2. nfolds = 0
  rf1 = H2ORandomForestEstimator(nfolds=0, seed=1234)
  rf1.train(y=response_col, x=predictors, training_frame=cars)

  # check that this is equivalent to no nfolds
  rf2 = H2ORandomForestEstimator(seed=1234)
  rf2.train(y=response_col, x=predictors, training_frame=cars)
  pyunit_utils.check_models(rf1, rf2)

  # 3. cross-validation and regular validation attempted
  rf = H2ORandomForestEstimator(nfolds=random.randint(3,10))
  rf.train(y=response_col, x=predictors, training_frame=cars, validation_frame=cars)


  ## error cases
  # 1. nfolds == 1 or < 0
  try:
    rf = H2ORandomForestEstimator(nfolds=random.sample([-1,1], 1)[0])
    rf.train(y=response_col, x=predictors, training_frame=cars)
    assert False, "Expected model-build to fail when nfolds is 1 or < 0"
  except EnvironmentError:
    assert True

  # 2. more folds than observations
  try:
    rf = H2ORandomForestEstimator(nfolds=cars.nrow+1, fold_assignment="Modulo")
    rf.train(y=response_col, x=predictors, training_frame=cars)
    assert False, "Expected model-build to fail when nfolds > nobs"
  except EnvironmentError:
    assert True

  # 3. fold_column and nfolds both specified
  try:
    rf = H2ORandomForestEstimator(nfolds=3)
    rf.train(y=response_col, x=predictors, fold_column="fold_assignments", training_frame=cars)
    assert False, "Expected model-build to fail when fold_column and nfolds both specified"
  except EnvironmentError:
    assert True
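The repeated-"Modulo" check in step 1 works because Modulo fold assignment is deterministic: row i lands in fold i % nfolds, so two runs over the same frame always build identical folds. A sketch of that assignment:

def modulo_folds(n_rows, nfolds):
    # Deterministic: the same data always yields the same folds, which is
    # why repeated "Modulo" runs produce identical cross-validation metrics.
    return [i % nfolds for i in range(n_rows)]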
Example #7
def cv_cars_glm():

  # read in the dataset and construct training set (and validation set)
  cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

  # choose the type of model-building exercise (regression, binomial classification, or poisson regression).
  # 0:regression, 1:binomial, 2:poisson
  problem = random.sample(list(range(3)),1)[0]
  # pick the predictors and response column, along with the correct family
  predictors = ["displacement","power","weight","acceleration","year"]
  if problem == 1   :
    response_col = "economy_20mpg"
    family = "binomial"
    cars[response_col] = cars[response_col].asfactor()
  elif problem == 2 :
    family = "poisson"
    response_col = "cylinders"
  else              :
    family = "gaussian"
    response_col = "economy"

  print("Distribution: {0}".format(family))
  print("Response column: {0}".format(response_col))

  ## cross-validation
  # 1. check that cv metrics are the same over repeated "Modulo" runs
  nfolds = random.randint(3,10)
  glm1 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Modulo")
  glm1.train(x=predictors, y=response_col, training_frame=cars)

  glm2 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Modulo")
  glm2.train(x=predictors, y=response_col, training_frame=cars)
  pyunit_utils.check_models(glm1, glm2, True)

  # 2. check that cv metrics are different over repeated "Random" runs
  nfolds = random.randint(3,10)
  glm1 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Random")
  glm1.train(x=predictors, y=response_col, training_frame=cars)

  glm2 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Random")
  glm2.train(x=predictors, y=response_col, training_frame=cars)
  try:
    pyunit_utils.check_models(glm1, glm2, True)
    models_differ = False
  except AssertionError:
    models_differ = True
  assert models_differ, "Expected models to be different over repeated Random runs"

  # 3. folds_column
  num_folds = random.randint(2,5)
  fold_assignments = h2o.H2OFrame([[random.randint(0,num_folds-1)] for f in range(cars.nrow)])
  fold_assignments.set_names(["fold_assignments"])
  cars = cars.cbind(fold_assignments)
  glm = H2OGeneralizedLinearEstimator(family=family, keep_cross_validation_predictions=True)
  glm.train(x=predictors, y=response_col, training_frame=cars, fold_column="fold_assignments")
  num_cv_models = len(glm._model_json['output']['cross_validation_models'])
  assert num_cv_models==num_folds, "Expected {0} cross-validation models, but got " \
                                   "{1}".format(num_folds, num_cv_models)
  cv_model1 = h2o.get_model(glm._model_json['output']['cross_validation_models'][0]['name'])
  cv_model2 = h2o.get_model(glm._model_json['output']['cross_validation_models'][1]['name'])

  # 4. keep_cross_validation_predictions
  cv_predictions = glm1._model_json['output']['cross_validation_predictions']
  assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions)

  cv_predictions = glm._model_json['output']['cross_validation_predictions']
  assert len(cv_predictions)==num_folds, "Expected the same number of cross-validation predictions " \
                                         "as folds, but got {0}".format(len(cv_predictions))


  ## boundary cases
  # 2. nfolds = 0
  glm1 = H2OGeneralizedLinearEstimator(nfolds=0, family=family)
  glm1.train(x=predictors, y=response_col, training_frame=cars)
  # check that this is equivalent to no nfolds
  glm2 = H2OGeneralizedLinearEstimator(family=family)
  glm2.train(x=predictors, y=response_col, training_frame=cars)
  pyunit_utils.check_models(glm1, glm2)

  # 3. cross-validation and regular validation attempted
  glm = H2OGeneralizedLinearEstimator(nfolds=random.randint(3,10), family=family)
  glm.train(x=predictors, y=response_col, training_frame=cars, validation_frame=cars)


  ## error cases
  # 1. nfolds == 1 or < 0
  try:
    glm = H2OGeneralizedLinearEstimator(nfolds=random.sample([-1,1], 1)[0], family=family)
    glm.train(x=predictors, y=response_col, training_frame=cars)
    assert False, "Expected model-build to fail when nfolds is 1 or < 0"
  except EnvironmentError:
    assert True

  # 2. more folds than observations
  try:
    glm = H2OGeneralizedLinearEstimator(nfolds=cars.nrow+1, family=family, fold_assignment="Modulo")
    glm.train(x=predictors, y=response_col, training_frame=cars)
    assert False, "Expected model-build to fail when nfolds > nobs"
  except EnvironmentError:
    assert True

  # 3. fold_column and nfolds both specified
  try:
    glm = H2OGeneralizedLinearEstimator(nfolds=3, family=family)
    glm.train(x=predictors, y=response_col, training_frame=cars, fold_column="fold_assignments")
    assert False, "Expected model-build to fail when fold_column and nfolds both specified"
  except EnvironmentError:
    assert True
Example #8
def cv_carsDL():

    # read in the dataset and construct training set (and validation set)
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type of model-building exercise (regression, binomial, or multinomial classification).
    # 0:regression, 1:binomial, 2:multinomial
    problem = random.sample(range(3), 1)[0]

    # pick the predictors and the correct response column
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"

    print "Response column: {0}".format(response_col)

    ## cross-validation
    # 1. basic
    dl = h2o.deeplearning(
        y=cars[response_col], x=cars[predictors], nfolds=random.randint(3, 10), fold_assignment="Modulo"
    )

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    dl1 = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=nfolds, fold_assignment="Random")
    dl2 = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=nfolds, fold_assignment="Random")
    try:
        pyunit_utils.check_models(dl1, dl2, True)
        models_differ = False
    except AssertionError:
        models_differ = True
    assert models_differ, "Expected models to be different over repeated Random runs"

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_assignments = h2o.H2OFrame(python_obj=list(zip(*[[random.randint(0, num_folds - 1)] for f in range(cars.nrow)])))  # list() for Python 3
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    dl = h2o.deeplearning(
        y=cars[response_col],
        x=cars[predictors],
        training_frame=cars,
        fold_column="fold_assignments",
        keep_cross_validation_predictions=True,
    )
    num_cv_models = len(dl._model_json["output"]["cross_validation_models"])
    assert num_cv_models == num_folds, "Expected {0} cross-validation models, but got " "{1}".format(
        num_folds, num_cv_models
    )
    cv_model1 = h2o.get_model(dl._model_json["output"]["cross_validation_models"][0]["name"])
    cv_model2 = h2o.get_model(dl._model_json["output"]["cross_validation_models"][1]["name"])
    assert isinstance(cv_model1, type(dl)), (
        "Expected cross-validation model to be the same model type as the "
        "constructed model, but got {0} and {1}".format(type(cv_model1), type(dl))
    )
    assert isinstance(cv_model2, type(dl)), (
        "Expected cross-validation model to be the same model type as the "
        "constructed model, but got {0} and {1}".format(type(cv_model2), type(dl))
    )

    # 4. keep_cross_validation_predictions
    cv_predictions = dl1._model_json["output"]["cross_validation_predictions"]
    assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(
        cv_predictions
    )

    cv_predictions = dl._model_json["output"]["cross_validation_predictions"]
    assert (
        len(cv_predictions) == num_folds
    ), "Expected the same number of cross-validation predictions " "as folds, but got {0}".format(len(cv_predictions))

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow, fold_assignment="Modulo")

    # 2. nfolds = 0
    dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=0)

    # 3. cross-validation and regular validation attempted
    dl = h2o.deeplearning(
        y=cars[response_col],
        x=cars[predictors],
        nfolds=random.randint(3, 10),
        validation_y=cars[response_col],
        validation_x=cars[predictors],
    )

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=random.sample([-1, 1], 1)[0])
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"
    except EnvironmentError:
        assert True

    # 2. more folds than observations
    try:
        dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow + 1, fold_assignment="Modulo")
        assert False, "Expected model-build to fail when nfolds > nobs"
    except EnvironmentError:
        assert True

    # 3. fold_column and nfolds both specified
    try:
        dl = h2o.deeplearning(
            y=cars[response_col], x=cars[predictors], nfolds=3, fold_column="fold_assignments", training_frame=cars
        )
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
    except EnvironmentError:
        assert True
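This example uses the legacy h2o.deeplearning wrapper and Python 2 era H2OFrame semantics. On current h2o-3 the same basic CV setup would presumably go through the estimator API; a rough sketch under that assumption:

from h2o.estimators.deeplearning import H2ODeepLearningEstimator

# Assumption: rough modern equivalent of the legacy basic-CV call above.
dl = H2ODeepLearningEstimator(nfolds=5, fold_assignment="Modulo")
dl.train(x=predictors, y=response_col, training_frame=cars)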
Example #9
def cv_cars_dw():
  if not H2ODeepWaterEstimator.available(): return

  # read in the dataset and construct training set (and validation set)
  cars =  h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

  # choose the type of model-building exercise (regression, binomial, or multinomial classification).
  # 0:regression, 1:binomial, 2:multinomial
  problem = random.sample(list(range(2)),1)[0] + 1  # only do classification

  # pick the predictors and the correct response column
  predictors = ["displacement","power","weight","acceleration","year"]
  if problem == 1   :
    response_col = "economy_20mpg"
    cars[response_col] = cars[response_col].asfactor()
  elif problem == 2 :
    response_col = "cylinders"
    cars[response_col] = cars[response_col].asfactor()
  else              :
    response_col = "economy"

  print("Response column: {0}".format(response_col))

  ## cross-validation
  # 1. basic

  dl = H2ODeepWaterEstimator(nfolds=random.randint(3,10),fold_assignment="Modulo",hidden=[20,20],epochs=10)
  dl.train(x=predictors, y=response_col, training_frame=cars)

  # 2. check that cv metrics are different over repeated "Random" runs
  nfolds = random.randint(3,10)
  dl1 = H2ODeepWaterEstimator(nfolds=nfolds,fold_assignment="Random",hidden=[20,20],epochs=10)
  dl1.train(x=predictors,y=response_col,training_frame=cars)
  dl2 = H2ODeepWaterEstimator(nfolds=nfolds, fold_assignment="Random", hidden=[20,20], epochs=10)
  # the second model must actually be trained before the comparison
  dl2.train(x=predictors, y=response_col, training_frame=cars)
  try:
    pyunit_utils.check_models(dl1, dl2, True)
    models_differ = False
  except AssertionError:
    models_differ = True
  assert models_differ, "Expected models to be different over repeated Random runs"

  # 3. folds_column
  num_folds = random.randint(2,5)
  fold_assignments = h2o.H2OFrame([[random.randint(0,num_folds-1)] for _ in range(cars.nrow)])
  fold_assignments.set_names(["fold_assignments"])
  cars = cars.cbind(fold_assignments)

  dl = H2ODeepWaterEstimator(keep_cross_validation_predictions=True,hidden=[20,20],epochs=10)
  dl.train(x=predictors,y=response_col,training_frame=cars,fold_column="fold_assignments")

  num_cv_models = len(dl._model_json['output']['cross_validation_models'])
  assert num_cv_models==num_folds, "Expected {0} cross-validation models, but got " \
                                   "{1}".format(num_folds, num_cv_models)
  cv_model1 = h2o.get_model(dl._model_json['output']['cross_validation_models'][0]['name'])
  cv_model2 = h2o.get_model(dl._model_json['output']['cross_validation_models'][1]['name'])


  # 4. keep_cross_validation_predictions
  cv_predictions = dl1._model_json['output']['cross_validation_predictions']
  assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions)

  ## boundary cases
  # 1. nfolds = number of observations (leave-one-out cross-validation)
  dl = H2ODeepWaterEstimator(nfolds=cars.nrow, fold_assignment="Modulo",hidden=[20,20],epochs=10)
  dl.train(x=predictors,y=response_col,training_frame=cars)

  # 2. nfolds = 0
  dl = H2ODeepWaterEstimator(nfolds=0,hidden=[20,20],epochs=10)
  dl.train(x=predictors,y=response_col,training_frame=cars)

  # 3. cross-validation and regular validation attempted
  dl = H2ODeepWaterEstimator(nfolds=random.randint(3,10),hidden=[20,20],epochs=10)
  dl.train(x=predictors, y=response_col, training_frame=cars, validation_frame=cars)


  ## error cases
  # 1. nfolds == 1 or < 0
  try:
    dl = H2ODeepWaterEstimator(nfolds=random.sample([-1,1], 1)[0],hidden=[20,20],epochs=10)
    dl.train(x=predictors, y=response_col, training_frame=cars)
    assert False, "Expected model-build to fail when nfolds is 1 or < 0"
  except EnvironmentError:
    assert True

  # 2. more folds than observations
  try:
    dl = H2ODeepWaterEstimator(nfolds=cars.nrow+1,fold_assignment="Modulo",hidden=[20,20],epochs=10)
    dl.train(x=predictors, y=response_col, training_frame=cars)
    assert False, "Expected model-build to fail when nfolds > nobs"
  except EnvironmentError:
    assert True

  # 3. fold_column and nfolds both specified
  try:
    dl = H2ODeepWaterEstimator(nfolds=3, hidden=[20, 20], epochs=10)
    dl.train(x=predictors, y=response_col, fold_column="fold_assignments", training_frame=cars)
    assert False, "Expected model-build to fail when fold_column and nfolds both specified"
  except EnvironmentError:
    assert True
Example #10
def cv_carsGBM():

    # read in the dataset and construct training set (and validation set)
    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type of model-building exercise (regression, binomial, or multinomial classification).
    # 0:regression, 1:binomial, 2:multinomial
    problem = random.sample(list(range(3)), 1)[0]

    # pick the predictors and response column, along with the correct distribution
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        distribution = "bernoulli"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        distribution = "multinomial"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"
        distribution = "gaussian"

    print("Distribution: {0}".format(distribution))
    print("Response column: {0}".format(response_col))

    ## cross-validation
    # 1. check that cv metrics are the same over repeated "Modulo" runs
    nfolds = random.randint(3, 10)
    gbm1 = h2o.gbm(y=cars[response_col],
                   x=cars[predictors],
                   nfolds=nfolds,
                   distribution=distribution,
                   ntrees=5,
                   fold_assignment="Modulo")
    gbm2 = h2o.gbm(y=cars[response_col],
                   x=cars[predictors],
                   nfolds=nfolds,
                   distribution=distribution,
                   ntrees=5,
                   fold_assignment="Modulo")
    pyunit_utils.check_models(gbm1, gbm2, True)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    gbm1 = h2o.gbm(y=cars[response_col],
                   x=cars[predictors],
                   nfolds=nfolds,
                   distribution=distribution,
                   ntrees=5,
                   fold_assignment="Random")
    gbm2 = h2o.gbm(y=cars[response_col],
                   x=cars[predictors],
                   nfolds=nfolds,
                   distribution=distribution,
                   ntrees=5,
                   fold_assignment="Random")
    try:
        pyunit_utils.check_models(gbm1, gbm2, True)
        models_differ = False
    except AssertionError:
        models_differ = True
    assert models_differ, "Expected models to be different over repeated Random runs"

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_assignments = h2o.H2OFrame([[random.randint(0, num_folds - 1)]
                                     for f in range(cars.nrow)])
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    gbm = h2o.gbm(y=cars[response_col],
                  x=cars[predictors],
                  training_frame=cars,
                  distribution=distribution,
                  ntrees=5,
                  fold_column="fold_assignments",
                  keep_cross_validation_predictions=True)
    num_cv_models = len(gbm._model_json['output']['cross_validation_models'])
    assert num_cv_models==num_folds, "Expected {0} cross-validation models, but got " \
                                                    "{1}".format(num_folds, num_cv_models)
    cv_model1 = h2o.get_model(
        gbm._model_json['output']['cross_validation_models'][0]['name'])
    cv_model2 = h2o.get_model(
        gbm._model_json['output']['cross_validation_models'][1]['name'])

    # 4. keep_cross_validation_predictions
    cv_predictions = gbm1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(
        cv_predictions)

    cv_predictions = gbm._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions)==num_folds, "Expected the same number of cross-validation predictions " \
                                                          "as folds, but got {0}".format(len(cv_predictions))

    # # 5. manually construct models
    # fold1 = cars[cars["fold_assignments"]==0]
    # fold2 = cars[cars["fold_assignments"]==1]
    # manual_model1 = h2o.gbm(y=fold2[response_col],
    #                         x=fold2[predictors],
    #                         validation_y=fold1[response_col],
    #                         validation_x=fold1[predictors], ntrees=5,
    #                         distribution=distribution)
    # manual_model2 = h2o.gbm(y=fold1[response_col],
    #                         x=fold1[predictors],
    #                         validation_y=fold2[response_col],
    #                         validation_x=fold2[predictors], ntrees=5,
    #                         distribution=distribution)

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    gbm = h2o.gbm(y=cars[response_col],
                  x=cars[predictors],
                  nfolds=cars.nrow,
                  distribution=distribution,
                  ntrees=5,
                  fold_assignment="Modulo")

    # 2. nfolds = 0
    gbm1 = h2o.gbm(y=cars[response_col],
                   x=cars[predictors],
                   nfolds=0,
                   distribution=distribution,
                   ntrees=5)
    # check that this is equivalent to no nfolds
    gbm2 = h2o.gbm(y=cars[response_col],
                   x=cars[predictors],
                   distribution=distribution,
                   ntrees=5)
    pyunit_utils.check_models(gbm1, gbm2)

    # 3. cross-validation and regular validation attempted
    gbm = h2o.gbm(y=cars[response_col],
                  x=cars[predictors],
                  nfolds=random.randint(3, 10),
                  validation_y=cars[response_col],
                  ntrees=5,
                  validation_x=cars[predictors],
                  distribution=distribution)

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        gbm = h2o.gbm(y=cars[response_col],
                      x=cars[predictors],
                      nfolds=random.sample([-1, 1], 1)[0],
                      ntrees=5,
                      distribution=distribution)
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"
    except EnvironmentError:
        assert True

    # 2. more folds than observations
    try:
        gbm = h2o.gbm(y=cars[response_col],
                      x=cars[predictors],
                      nfolds=cars.nrow + 1,
                      distribution=distribution,
                      ntrees=5,
                      fold_assignment="Modulo")
        assert False, "Expected model-build to fail when nfolds > nobs"
    except EnvironmentError:
        assert True

    # 3. fold_column and nfolds both specified
    try:
        gbm = h2o.gbm(y=cars[response_col],
                      x=cars[predictors],
                      nfolds=3,
                      fold_column="fold_assignments",
                      ntrees=5,
                      distribution=distribution,
                      training_frame=cars)
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
    except EnvironmentError:
        assert True

    # 4. fold_column and fold_assignment both specified
    try:
        gbm = h2o.gbm(y=cars[response_col],
                      x=cars[predictors],
                      fold_assignment="Random",
                      fold_column="fold_assignments",
                      ntrees=5,
                      distribution=distribution,
                      training_frame=cars)
        assert False, "Expected model-build to fail when fold_column and fold_assignment both specified"
    except EnvironmentError:
        assert True
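Like the deep learning variant, this example uses the legacy h2o.gbm wrapper. A rough modern equivalent of the repeated-"Modulo" check, assuming the estimator API:

from h2o.estimators.gbm import H2OGradientBoostingEstimator

gbm1 = H2OGradientBoostingEstimator(nfolds=5, distribution=distribution, ntrees=5, fold_assignment="Modulo")
gbm1.train(x=predictors, y=response_col, training_frame=cars)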
Example #11
def cv_cars_glm():

  # read in the dataset and construct training set (and validation set)
  cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

  # choose the type of model-building exercise (regression, binomial classification, or poisson regression).
  # 0:regression, 1:binomial, 2:poisson
  problem = random.sample(range(3),1)[0]
  # pick the predictors and response column, along with the correct family
  predictors = ["displacement","power","weight","acceleration","year"]
  if problem == 1   :
    response_col = "economy_20mpg"
    family = "binomial"
    cars[response_col] = cars[response_col].asfactor()
  elif problem == 2 :
    family = "poisson"
    response_col = "cylinders"
  else              :
    family = "gaussian"
    response_col = "economy"

  print "Distribution: {0}".format(family)
  print "Response column: {0}".format(response_col)

  ## cross-validation
  # 1. check that cv metrics are the same over repeated "Modulo" runs
  nfolds = random.randint(3,10)
  glm1 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Modulo")
  glm1.train(x=predictors, y=response_col, training_frame=cars)

  glm2 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Modulo")
  glm2.train(x=predictors, y=response_col, training_frame=cars)
  pyunit_utils.check_models(glm1, glm2, True)

  # 2. check that cv metrics are different over repeated "Random" runs
  nfolds = random.randint(3,10)
  glm1 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Random")
  glm1.train(x=predictors, y=response_col, training_frame=cars)

  glm2 = H2OGeneralizedLinearEstimator(nfolds=nfolds, family=family, fold_assignment="Random")
  glm2.train(x=predictors, y=response_col, training_frame=cars)
  try:
    pyunit_utils.check_models(glm1, glm2, True)
    models_differ = False
  except AssertionError:
    models_differ = True
  assert models_differ, "Expected models to be different over repeated Random runs"

  # 3. folds_column
  num_folds = random.randint(2,5)
  fold_assignments = h2o.H2OFrame(python_obj=[[random.randint(0,num_folds-1)] for f in range(cars.nrow)])  # one row per observation
  fold_assignments.set_names(["fold_assignments"])
  cars = cars.cbind(fold_assignments)
  glm = H2OGeneralizedLinearEstimator(family=family, keep_cross_validation_predictions=True)
  glm.train(x=predictors, y=response_col, training_frame=cars, fold_column="fold_assignments")
  num_cv_models = len(glm._model_json['output']['cross_validation_models'])
  assert num_cv_models==num_folds, "Expected {0} cross-validation models, but got " \
                                   "{1}".format(num_folds, num_cv_models)
  cv_model1 = h2o.get_model(glm._model_json['output']['cross_validation_models'][0]['name'])
  cv_model2 = h2o.get_model(glm._model_json['output']['cross_validation_models'][1]['name'])

  # 4. keep_cross_validation_predictions
  cv_predictions = glm1._model_json['output']['cross_validation_predictions']
  assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions)

  cv_predictions = glm._model_json['output']['cross_validation_predictions']
  assert len(cv_predictions)==num_folds, "Expected the same number of cross-validation predictions " \
                                         "as folds, but got {0}".format(len(cv_predictions))


  ## boundary cases
  # 2. nfolds = 0
  glm1 = H2OGeneralizedLinearEstimator(nfolds=0, family=family)
  glm1.train(x=predictors, y=response_col, training_frame=cars)
  # check that this is equivalent to no nfolds
  glm2 = H2OGeneralizedLinearEstimator(family=family)
  glm2.train(x=predictors, y=response_col, training_frame=cars)
  pyunit_utils.check_models(glm1, glm2)

  # 3. cross-validation and regular validation attempted
  glm = H2OGeneralizedLinearEstimator(nfolds=random.randint(3,10), family=family)
  glm.train(x=predictors, y=response_col, training_frame=cars, validation_frame=cars)


  ## error cases
  # 1. nfolds == 1 or < 0
  try:
    glm = H2OGeneralizedLinearEstimator(nfolds=random.sample([-1,1], 1)[0], family=family)
    glm.train(x=predictors, y=response_col, training_frame=cars)
    assert False, "Expected model-build to fail when nfolds is 1 or < 0"
  except EnvironmentError:
    assert True

  # 2. more folds than observations
  try:
    glm = H2OGeneralizedLinearEstimator(nfolds=cars.nrow+1, family=family, fold_assignment="Modulo")
    glm.train(x=predictors, y=response_col, training_frame=cars)
    assert False, "Expected model-build to fail when nfolds > nobs"
  except EnvironmentError:
    assert True

  # 3. fold_column and nfolds both specified
  try:
    glm = H2OGeneralizedLinearEstimator(nfolds=3, family=family)
    glm.train(x=predictors, y=response_col, training_frame=cars, fold_column="fold_assignments")
    assert False, "Expected model-build to fail when fold_column and nfolds both specified"
  except EnvironmentError:
    assert True
Example #12
def cv_carsGBM():

    # read in the dataset and construct training set (and validation set)
    cars =  h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type of model-building exercise (regression, binomial, or multinomial classification).
    # 0:regression, 1:binomial, 2:multinomial
    problem = random.sample(range(3),1)[0]

    # pick the predictors and response column, along with the correct distribution
    predictors = ["displacement","power","weight","acceleration","year"]
    if problem == 1   :
        response_col = "economy_20mpg"
        distribution = "bernoulli"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2 :
        response_col = "cylinders"
        distribution = "multinomial"
        cars[response_col] = cars[response_col].asfactor()
    else              :
        response_col = "economy"
        distribution = "gaussian"

    print "Distribution: {0}".format(distribution)
    print "Response column: {0}".format(response_col)

    ## cross-validation
    # 1. check that cv metrics are the same over repeated "Modulo" runs
    nfolds = random.randint(3,10)
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, distribution=distribution, ntrees=5,
                   fold_assignment="Modulo")
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, distribution=distribution, ntrees=5,
                   fold_assignment="Modulo")
    pyunit_utils.check_models(gbm1, gbm2, True)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3,10)
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, distribution=distribution, ntrees=5,
                   fold_assignment="Random")
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, distribution=distribution, ntrees=5,
                   fold_assignment="Random")
    try:
        pyunit_utils.check_models(gbm1, gbm2, True)
        models_differ = False
    except AssertionError:
        models_differ = True
    assert models_differ, "Expected models to be different over repeated Random runs"

    # 3. folds_column
    num_folds = random.randint(2,5)
    fold_assignments = h2o.H2OFrame(python_obj=[[random.randint(0,num_folds-1)] for f in range(cars.nrow)])
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], training_frame=cars, distribution=distribution, ntrees=5,
                  fold_column="fold_assignments", keep_cross_validation_predictions=True)
    num_cv_models = len(gbm._model_json['output']['cross_validation_models'])
    assert num_cv_models==num_folds, "Expected {0} cross-validation models, but got " \
                                                    "{1}".format(num_folds, num_cv_models)
    cv_model1 = h2o.get_model(gbm._model_json['output']['cross_validation_models'][0]['name'])
    cv_model2 = h2o.get_model(gbm._model_json['output']['cross_validation_models'][1]['name'])
    assert isinstance(cv_model1, type(gbm)), "Expected cross-validation model to be the same model type as the " \
                                             "constructed model, but got {0} and {1}".format(type(cv_model1),type(gbm))
    assert isinstance(cv_model2, type(gbm)), "Expected cross-validation model to be the same model type as the " \
                                             "constructed model, but got {0} and {1}".format(type(cv_model2),type(gbm))

    # 4. keep_cross_validation_predictions
    cv_predictions = gbm1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions)

    cv_predictions = gbm._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions)==num_folds, "Expected the same number of cross-validation predictions " \
                                                          "as folds, but got {0}".format(len(cv_predictions))

    # # 5. manually construct models
    # fold1 = cars[cars["fold_assignments"]==0]
    # fold2 = cars[cars["fold_assignments"]==1]
    # manual_model1 = h2o.gbm(y=fold2[response_col],
    #                         x=fold2[predictors],
    #                         validation_y=fold1[response_col],
    #                         validation_x=fold1[predictors], ntrees=5,
    #                         distribution=distribution)
    # manual_model2 = h2o.gbm(y=fold1[response_col],
    #                         x=fold1[predictors],
    #                         validation_y=fold2[response_col],
    #                         validation_x=fold2[predictors], ntrees=5,
    #                         distribution=distribution)


    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow, distribution=distribution, ntrees=5,
                  fold_assignment="Modulo")

    # 2. nfolds = 0
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=0, distribution=distribution, ntrees=5)
    # check that this is equivalent to no nfolds
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], distribution=distribution, ntrees=5)
    pyunit_utils.check_models(gbm1, gbm2)

    # 3. cross-validation and regular validation attempted
    gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=random.randint(3,10), validation_y=cars[response_col], ntrees=5,
                  validation_x=cars[predictors], distribution=distribution)


    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=random.sample([-1,1], 1)[0], ntrees=5,
                      distribution=distribution)
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"
    except EnvironmentError:
        assert True

    # 2. more folds than observations
    try:
        gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow+1, distribution=distribution, ntrees=5,
                      fold_assignment="Modulo")
        assert False, "Expected model-build to fail when nfolds > nobs"
    except EnvironmentError:
        assert True

    # 3. fold_column and nfolds both specified
    try:
        gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=3, fold_column="fold_assignments", ntrees=5,
                      distribution=distribution, training_frame=cars)
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
    except EnvironmentError:
        assert True

    # 4. fold_column and fold_assignment both specified
    try:
        gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], fold_assignment="Random", fold_column="fold_assignments", ntrees=5,
                      distribution=distribution, training_frame=cars)
        assert False, "Expected model-build to fail when fold_column and fold_assignment both specified"
    except EnvironmentError:
        assert True
Example #13
def cv_carsDL():

    # read in the dataset and construct training set (and validation set)
    cars = h2o.import_file(
        path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type of model-building exercise (regression, binomial, or multinomial classification).
    # 0:regression, 1:binomial, 2:multinomial
    problem = random.sample(range(3), 1)[0]

    # pick the predictors and the correct response column
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"

    print "Response column: {0}".format(response_col)

    ## cross-validation
    # 1. basic
    dl = h2o.deeplearning(y=cars[response_col],
                          x=cars[predictors],
                          nfolds=random.randint(3, 10),
                          fold_assignment="Modulo")

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    dl1 = h2o.deeplearning(y=cars[response_col],
                           x=cars[predictors],
                           nfolds=nfolds,
                           fold_assignment="Random")
    dl2 = h2o.deeplearning(y=cars[response_col],
                           x=cars[predictors],
                           nfolds=nfolds,
                           fold_assignment="Random")
    try:
        pyunit_utils.check_models(dl1, dl2, True)
        models_differ = False
    except AssertionError:
        models_differ = True
    assert models_differ, "Expected models to be different over repeated Random runs"

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_assignments = h2o.H2OFrame(
        list(zip(*[[random.randint(0, num_folds - 1)] for f in range(cars.nrow)])))  # list() for Python 3
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    dl = h2o.deeplearning(y=cars[response_col],
                          x=cars[predictors],
                          training_frame=cars,
                          fold_column="fold_assignments",
                          keep_cross_validation_predictions=True)
    num_cv_models = len(dl._model_json['output']['cross_validation_models'])
    assert num_cv_models==num_folds, "Expected {0} cross-validation models, but got " \
                                     "{1}".format(num_folds, num_cv_models)
    cv_model1 = h2o.get_model(
        dl._model_json['output']['cross_validation_models'][0]['name'])
    cv_model2 = h2o.get_model(
        dl._model_json['output']['cross_validation_models'][1]['name'])
    assert isinstance(cv_model1, type(dl)), "Expected cross-validation model to be the same model type as the " \
                                            "constructed model, but got {0} and {1}".format(type(cv_model1),type(dl))
    assert isinstance(cv_model2, type(dl)), "Expected cross-validation model to be the same model type as the " \
                                            "constructed model, but got {0} and {1}".format(type(cv_model2),type(dl))

    # 4. keep_cross_validation_predictions
    cv_predictions = dl1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(
        cv_predictions)

    cv_predictions = dl._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions)==num_folds, "Expected the same number of cross-validation predictions " \
                                           "as folds, but got {0}".format(len(cv_predictions))

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    dl = h2o.deeplearning(y=cars[response_col],
                          x=cars[predictors],
                          nfolds=cars.nrow,
                          fold_assignment="Modulo")

    # 2. nfolds = 0
    dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=0)

    # 3. cross-validation and regular validation attempted
    dl = h2o.deeplearning(y=cars[response_col],
                          x=cars[predictors],
                          nfolds=random.randint(3, 10),
                          validation_y=cars[response_col],
                          validation_x=cars[predictors])

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        dl = h2o.deeplearning(y=cars[response_col],
                              x=cars[predictors],
                              nfolds=random.sample([-1, 1], 1)[0])
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"
    except EnvironmentError:
        assert True

    # 2. more folds than observations
    try:
        dl = h2o.deeplearning(y=cars[response_col],
                              x=cars[predictors],
                              nfolds=cars.nrow + 1,
                              fold_assignment="Modulo")
        assert False, "Expected model-build to fail when nfolds > nobs"
    except EnvironmentError:
        assert True

    # 3. fold_column and nfolds both specified
    try:
        dl = h2o.deeplearning(y=cars[response_col],
                              x=cars[predictors],
                              nfolds=3,
                              fold_column="fold_assignments",
                              training_frame=cars)
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
    except EnvironmentError:
        assert True