def cv_carsGLM(ip,port):
    """Exercise the cross-validation options of ``h2o.glm`` on the cars dataset.

    A problem type (gaussian, binomial or poisson family) is drawn at random,
    then the function checks seeded repeated runs, boundary values of
    ``nfolds`` and argument combinations that are expected to be rejected.

    :param ip: address handed to ``h2o.init``.
    :param port: port handed to ``h2o.init``.
    :raises AssertionError: when a model build that is expected to fail
        succeeds. The comparisons rely on ``h2o.check_models`` to flag
        mismatching models (presumably via AssertionError -- confirm).
    """
    # Connect to h2o
    h2o.init(ip, port)

    # read in the dataset
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type of model-building exercise. 0:regression (gaussian), 1:binomial, 2:poisson
    problem = random.sample(range(3), 1)[0]

    # pick the predictors and the correct response column
    # TODO: add more families
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        family = "binomial"
        response_col = "economy_20mpg"
        # the binomial response is converted to a factor before modelling
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        family = "poisson"
        response_col = "cylinders"
    else:
        family = "gaussian"
        response_col = "economy"

    # single-argument print(...) behaves the same as the print statement on Python 2
    print("Family: {0}".format(family))
    print("Response column: {0}".format(response_col))

    # TODO: add 'seed' to GLM. PUBDEV-1705
    ## cross-validation
    ## check that cv metrics are the same over (seeded) repeated runs
    nfolds = random.randint(3, 10)
    glm1 = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, family=family, seed=1234)
    glm2 = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, family=family, seed=1234)
    h2o.check_models(glm1, glm2)

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    glm_loo = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow(), family=family, seed=1234)
    # TODO: manually construct the cross-validation metrics and compare
    # TODO: PUBDEV-1697

    # 2. nfolds = 0
    glm1 = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=0, family=family)
    # check that this is equivalent to no nfolds
    glm2 = h2o.glm(y=cars[response_col], x=cars[predictors], family=family)
    h2o.check_models(glm1, glm2)

    # 3. more folds than observations equivalent to (seeded) leave-one-out
    glm3 = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow()+1, family=family, seed=1234)
    h2o.check_models(glm_loo, glm3)

    ## error cases
    # 1. nfolds == 1 or < 0
    # TODO: PUBDEV-1696
    # Each invalid value gets its own try block: when both builds shared one
    # block, the expected failure of the negative value meant nfolds=1 was
    # never actually attempted.
    for bad_nfolds in (random.randint(-10000, -1), 1):
        try:
            h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=bad_nfolds, family=family)
        except EnvironmentError:
            pass
        else:
            raise AssertionError("Expected model-build to fail when nfolds is 1 or < 0")

    # 2. cross-validation and regular validation attempted
    r = cars[0].runif()
    train = cars[r > .2]
    valid = cars[r <= .2]
    try:
        # The validation response is the same column as the training response
        # (previously a hard-coded column index), so a failure here is not
        # caused by a mismatched response column.
        h2o.glm(y=train[response_col], x=train[predictors], nfolds=random.randint(3, 10),
                validation_y=valid[response_col], validation_x=valid[predictors], family=family)
    except EnvironmentError:
        pass
    else:
        raise AssertionError("Expected model-build to fail when both cross-validation and regular validation "
                             "is attempted")
Beispiel #2
0
def cv_carsGBM(ip,port):
    """Exercise the cross-validation options of ``h2o.gbm`` on the cars dataset.

    A problem type (gaussian, bernoulli or multinomial distribution) is drawn
    at random, then the function checks repeated "Modulo"/"Random" runs, a
    user-supplied fold column, ``keep_cross_validation_predictions``, boundary
    values of ``nfolds`` and argument combinations expected to be rejected.

    :param ip: not used by this function.
    :param port: not used by this function.
    :raises AssertionError: when one of the checks fails. The comparisons rely
        on ``h2o.check_models`` raising AssertionError for differing models.
    """
    # read in the dataset
    cars = h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type model-building exercise (multinomial classification or regression). 0:regression, 1:binomial,
    # 2:multinomial
    problem = random.sample(range(3), 1)[0]

    # pick the predictors and response column, along with the correct distribution
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        distribution = "bernoulli"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        distribution = "multinomial"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"
        distribution = "gaussian"

    # single-argument print(...) behaves the same as the print statement on Python 2
    print("Distribution: {0}".format(distribution))
    print("Response column: {0}".format(response_col))

    ## cross-validation
    # 1. check that cv metrics are the same over repeated "Modulo" runs
    nfolds = random.randint(3, 10)
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, distribution=distribution, ntrees=5,
                   fold_assignment="Modulo")
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, distribution=distribution, ntrees=5,
                   fold_assignment="Modulo")
    h2o.check_models(gbm1, gbm2, True)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, distribution=distribution, ntrees=5,
                   fold_assignment="Random")
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, distribution=distribution, ntrees=5,
                   fold_assignment="Random")
    # The failure is raised from the else branch: raising it inside the try
    # body would be caught by the AssertionError handler and the check could
    # never fail.
    try:
        h2o.check_models(gbm1, gbm2, True)
    except AssertionError:
        pass
    else:
        raise AssertionError("Expected models to be different over repeated Random runs")

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_assignments = h2o.H2OFrame(python_obj=[[random.randint(0, num_folds-1)] for _ in range(cars.nrow())])
    fold_assignments.setNames(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], training_frame=cars, distribution=distribution, ntrees=5,
                  fold_column="fold_assignments", keep_cross_validation_predictions=True)
    cv_models_json = gbm._model_json['output']['cross_validation_models']
    num_cv_models = len(cv_models_json)
    assert num_cv_models == num_folds, (
        "Expected {0} cross-validation models, but got {1}".format(num_folds, num_cv_models))
    # num_folds is at least 2, so the first two cross-validation models exist
    for idx in (0, 1):
        cv_model = h2o.get_model(cv_models_json[idx]['name'])
        assert isinstance(cv_model, type(gbm)), (
            "Expected cross-validation model to be the same model type as the "
            "constructed model, but got {0} and {1}".format(type(cv_model), type(gbm)))

    # 4. keep_cross_validation_predictions
    # gbm1 was built without keep_cross_validation_predictions, gbm with it
    cv_predictions = gbm1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions)

    cv_predictions = gbm._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions) == num_folds, (
        "Expected the same number of cross-validation predictions "
        "as folds, but got {0}".format(len(cv_predictions)))

    # # 5. manually construct models
    # fold1 = cars[cars["fold_assignments"]==0]
    # fold2 = cars[cars["fold_assignments"]==1]
    # manual_model1 = h2o.gbm(y=fold2[response_col],
    #                         x=fold2[predictors],
    #                         validation_y=fold1[response_col],
    #                         validation_x=fold1[predictors], ntrees=5,
    #                         distribution=distribution)
    # manual_model2 = h2o.gbm(y=fold1[response_col],
    #                         x=fold1[predictors],
    #                         validation_y=fold2[response_col],
    #                         validation_x=fold2[predictors], ntrees=5,
    #                         distribution=distribution)


    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow(), distribution=distribution, ntrees=5,
                  fold_assignment="Modulo")

    # 2. nfolds = 0
    gbm1 = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=0, distribution=distribution, ntrees=5)
    # check that this is equivalent to no nfolds
    gbm2 = h2o.gbm(y=cars[response_col], x=cars[predictors], distribution=distribution, ntrees=5)
    h2o.check_models(gbm1, gbm2)

    # 3. cross-validation and regular validation attempted
    gbm = h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=random.randint(3, 10),
                  validation_y=cars[response_col], ntrees=5, validation_x=cars[predictors],
                  distribution=distribution)


    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=random.sample([-1, 1], 1)[0], ntrees=5,
                distribution=distribution)
    except EnvironmentError:
        pass
    else:
        raise AssertionError("Expected model-build to fail when nfolds is 1 or < 0")

    # 2. more folds than observations
    try:
        h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow()+1, distribution=distribution, ntrees=5,
                fold_assignment="Modulo")
    except EnvironmentError:
        pass
    else:
        raise AssertionError("Expected model-build to fail when nfolds > nobs")

    # 3. fold_column and nfolds both specified
    try:
        h2o.gbm(y=cars[response_col], x=cars[predictors], nfolds=3, fold_column="fold_assignments", ntrees=5,
                distribution=distribution, training_frame=cars)
    except EnvironmentError:
        pass
    else:
        raise AssertionError("Expected model-build to fail when fold_column and nfolds both specified")

    # 4. fold_column and fold_assignment both specified
    try:
        h2o.gbm(y=cars[response_col], x=cars[predictors], fold_assignment="Random", fold_column="fold_assignments",
                ntrees=5, distribution=distribution, training_frame=cars)
    except EnvironmentError:
        pass
    else:
        raise AssertionError("Expected model-build to fail when fold_column and fold_assignment both specified")
def cv_carsDL(ip,port):
    """Exercise the cross-validation options of ``h2o.deeplearning`` on the cars dataset.

    A response column is drawn at random (numeric ``economy``, or
    ``economy_20mpg`` / ``cylinders`` converted to factors), then the function
    checks repeated "Random" runs, boundary values of ``nfolds`` and
    ``nfolds`` values expected to be rejected.

    :param ip: not used by this function.
    :param port: not used by this function.
    :raises AssertionError: when one of the checks fails. The comparison relies
        on ``h2o.check_models`` raising AssertionError for differing models.
    """
    # read in the dataset
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type model-building exercise (multinomial classification or regression). 0:regression, 1:binomial,
    # 2:multinomial
    problem = random.sample(range(3), 1)[0]

    # pick the predictors and the correct response column
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"

    # single-argument print(...) behaves the same as the print statement on Python 2
    print("Response column: {0}".format(response_col))

    ## cross-validation
    ## basic
    dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=random.randint(3, 10),
                          fold_assignment="Modulo")

    ## check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    dl1 = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=nfolds, fold_assignment="Random")
    dl2 = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=nfolds, fold_assignment="Random")
    # The failure is raised from the else branch: raising it inside the try
    # body would be caught by the AssertionError handler and the check could
    # never fail.
    try:
        h2o.check_models(dl1, dl2, True)
    except AssertionError:
        pass
    else:
        raise AssertionError("Expected models to be different over repeated Random runs")


    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    # TODO: manually construct the cross-validation metrics and compare
    dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow(), fold_assignment="Modulo")

    # 2. nfolds = 0
    dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=0)

    # 3. cross-validation and regular validation attempted
    dl = h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=random.randint(3, 10),
                          validation_y=cars[response_col], validation_x=cars[predictors])


    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=random.sample([-1, 1], 1)[0])
    except EnvironmentError:
        pass
    else:
        raise AssertionError("Expected model-build to fail when nfolds is 1 or < 0")

    # 2. more folds than observations
    try:
        h2o.deeplearning(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow()+1, fold_assignment="Modulo")
    except EnvironmentError:
        pass
    else:
        raise AssertionError("Expected model-build to fail when nfolds > nobs")
Beispiel #4
0
def cv_carsRF(ip,port):
    """Exercise the cross-validation options of ``h2o.random_forest`` on the cars dataset.

    A response column is drawn at random (numeric ``economy``, or
    ``economy_20mpg`` / ``cylinders`` converted to factors), then the function
    checks seeded "Modulo" runs, unseeded "Random" runs, a user-supplied fold
    column, ``keep_cross_validation_predictions``, boundary values of
    ``nfolds`` and argument combinations expected to be rejected.

    :param ip: not used by this function.
    :param port: not used by this function.
    :raises AssertionError: when one of the checks fails. The comparisons rely
        on ``h2o.check_models`` raising AssertionError for differing models.
    """
    # read in the dataset
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type model-building exercise (multinomial classification or regression). 0:regression, 1:binomial,
    # 2:multinomial
    problem = random.sample(range(3), 1)[0]

    # pick the predictors and the correct response column
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"

    # single-argument print(...) behaves the same as the print statement on Python 2
    print("Response column: {0}".format(response_col))

    ## cross-validation
    # 1. check that cv metrics are the same over repeated seeded "Modulo" runs
    nfolds = random.randint(3, 10)
    rf1 = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=nfolds, fold_assignment="Modulo", seed=1234)
    rf2 = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=nfolds, fold_assignment="Modulo", seed=1234)
    h2o.check_models(rf1, rf2, True)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    rf1 = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=nfolds, fold_assignment="Random")
    rf2 = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=nfolds, fold_assignment="Random")
    # The failure is raised from the else branch: raising it inside the try
    # body would be caught by the AssertionError handler and the check could
    # never fail.
    try:
        h2o.check_models(rf1, rf2, True)
    except AssertionError:
        pass
    else:
        raise AssertionError("Expected models to be different over repeated Random runs")

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_assignments = h2o.H2OFrame(python_obj=[[random.randint(0, num_folds-1)] for _ in range(cars.nrow())])
    fold_assignments.setNames(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    rf = h2o.random_forest(y=cars[response_col], x=cars[predictors], training_frame=cars,
                           fold_column="fold_assignments", keep_cross_validation_predictions=True)
    cv_models_json = rf._model_json['output']['cross_validation_models']
    num_cv_models = len(cv_models_json)
    assert num_cv_models == num_folds, (
        "Expected {0} cross-validation models, but got {1}".format(num_folds, num_cv_models))
    # num_folds is at least 2, so the first two cross-validation models exist
    for idx in (0, 1):
        cv_model = h2o.get_model(cv_models_json[idx]['name'])
        assert isinstance(cv_model, type(rf)), (
            "Expected cross-validation model to be the same model type as the "
            "constructed model, but got {0} and {1}".format(type(cv_model), type(rf)))

    # 4. keep_cross_validation_predictions
    # rf1 was built without keep_cross_validation_predictions, rf with it
    cv_predictions = rf1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions)

    cv_predictions = rf._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions) == num_folds, (
        "Expected the same number of cross-validation predictions "
        "as folds, but got {0}".format(len(cv_predictions)))


    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    rf = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow(), fold_assignment="Modulo")

    # 2. nfolds = 0
    rf1 = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=0, seed=1234)
    # check that this is equivalent to no nfolds
    rf2 = h2o.random_forest(y=cars[response_col], x=cars[predictors], seed=1234)
    h2o.check_models(rf1, rf2)

    # 3. cross-validation and regular validation attempted
    rf = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=random.randint(3, 10),
                           validation_y=cars[response_col], validation_x=cars[predictors])


    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=random.sample([-1, 1], 1)[0])
    except EnvironmentError:
        pass
    else:
        raise AssertionError("Expected model-build to fail when nfolds is 1 or < 0")

    # 2. more folds than observations
    try:
        h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow()+1, fold_assignment="Modulo")
    except EnvironmentError:
        pass
    else:
        raise AssertionError("Expected model-build to fail when nfolds > nobs")

    # 3. fold_column and nfolds both specified
    try:
        h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=3, fold_column="fold_assignments",
                          training_frame=cars)
    except EnvironmentError:
        pass
    else:
        raise AssertionError("Expected model-build to fail when fold_column and nfolds both specified")
Beispiel #5
0
def cv_carsRF(ip, port):
    """Exercise the cross-validation options of ``h2o.random_forest`` on the cars dataset.

    A response column is drawn at random (numeric ``economy``, or
    ``economy_20mpg`` / ``cylinders`` converted to factors), then the function
    checks seeded "Modulo" runs, unseeded "Random" runs, a user-supplied fold
    column, ``keep_cross_validation_predictions``, boundary values of
    ``nfolds`` and argument combinations expected to be rejected.

    :param ip: not used by this function.
    :param port: not used by this function.
    :raises AssertionError: when one of the checks fails. The comparisons rely
        on ``h2o.check_models`` raising AssertionError for differing models.
    """
    # read in the dataset
    cars = h2o.import_frame(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type model-building exercise (multinomial classification or regression). 0:regression, 1:binomial,
    # 2:multinomial
    problem = random.sample(range(3), 1)[0]

    # pick the predictors and the correct response column
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"

    # single-argument print(...) behaves the same as the print statement on Python 2
    print("Response column: {0}".format(response_col))

    ## cross-validation
    # 1. check that cv metrics are the same over repeated seeded "Modulo" runs
    nfolds = random.randint(3, 10)
    rf1 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            nfolds=nfolds,
                            fold_assignment="Modulo",
                            seed=1234)
    rf2 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            nfolds=nfolds,
                            fold_assignment="Modulo",
                            seed=1234)
    h2o.check_models(rf1, rf2, True)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    rf1 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            nfolds=nfolds,
                            fold_assignment="Random")
    rf2 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            nfolds=nfolds,
                            fold_assignment="Random")
    # The failure is raised from the else branch: raising it inside the try
    # body would be caught by the AssertionError handler and the check could
    # never fail.
    try:
        h2o.check_models(rf1, rf2, True)
    except AssertionError:
        pass
    else:
        raise AssertionError("Expected models to be different over repeated Random runs")

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_assignments = h2o.H2OFrame(
        python_obj=[[random.randint(0, num_folds - 1)]
                    for _ in range(cars.nrow())])
    fold_assignments.setNames(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    rf = h2o.random_forest(y=cars[response_col],
                           x=cars[predictors],
                           training_frame=cars,
                           fold_column="fold_assignments",
                           keep_cross_validation_predictions=True)
    cv_models_json = rf._model_json['output']['cross_validation_models']
    num_cv_models = len(cv_models_json)
    assert num_cv_models == num_folds, (
        "Expected {0} cross-validation models, but got {1}".format(num_folds, num_cv_models))
    # num_folds is at least 2, so the first two cross-validation models exist
    for idx in (0, 1):
        cv_model = h2o.get_model(cv_models_json[idx]['name'])
        assert isinstance(cv_model, type(rf)), (
            "Expected cross-validation model to be the same model type as the "
            "constructed model, but got {0} and {1}".format(type(cv_model), type(rf)))

    # 4. keep_cross_validation_predictions
    # rf1 was built without keep_cross_validation_predictions, rf with it
    cv_predictions = rf1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(
        cv_predictions)

    cv_predictions = rf._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions) == num_folds, (
        "Expected the same number of cross-validation predictions "
        "as folds, but got {0}".format(len(cv_predictions)))

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    rf = h2o.random_forest(y=cars[response_col],
                           x=cars[predictors],
                           nfolds=cars.nrow(),
                           fold_assignment="Modulo")

    # 2. nfolds = 0
    rf1 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            nfolds=0,
                            seed=1234)
    # check that this is equivalent to no nfolds
    rf2 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            seed=1234)
    h2o.check_models(rf1, rf2)

    # 3. cross-validation and regular validation attempted
    rf = h2o.random_forest(y=cars[response_col],
                           x=cars[predictors],
                           nfolds=random.randint(3, 10),
                           validation_y=cars[response_col],
                           validation_x=cars[predictors])

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        h2o.random_forest(y=cars[response_col],
                          x=cars[predictors],
                          nfolds=random.sample([-1, 1], 1)[0])
    except EnvironmentError:
        pass
    else:
        raise AssertionError("Expected model-build to fail when nfolds is 1 or < 0")

    # 2. more folds than observations
    try:
        h2o.random_forest(y=cars[response_col],
                          x=cars[predictors],
                          nfolds=cars.nrow() + 1,
                          fold_assignment="Modulo")
    except EnvironmentError:
        pass
    else:
        raise AssertionError("Expected model-build to fail when nfolds > nobs")

    # 3. fold_column and nfolds both specified
    try:
        h2o.random_forest(y=cars[response_col],
                          x=cars[predictors],
                          nfolds=3,
                          fold_column="fold_assignments",
                          training_frame=cars)
    except EnvironmentError:
        pass
    else:
        raise AssertionError("Expected model-build to fail when fold_column and nfolds both specified")