Ejemplo n.º 1
0
def cv_carsRF():

    # read in the dataset and construct training set (and validation set)
    cars =  h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type model-building exercise (multinomial classification or regression). 0:regression, 1:binomial,
    # 2:multinomial
    problem = random.sample(range(3),1)[0]

    # pick the predictors and the correct response column
    predictors = ["displacement","power","weight","acceleration","year"]
    if problem == 1   :
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2 :
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else              :
        response_col = "economy"

    print "Response column: {0}".format(response_col)

    ## cross-validation
    # 1. check that cv metrics are the same over repeated seeded "Modulo" runs
    nfolds = random.randint(3,10)
    rf1 = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=nfolds, fold_assignment="Modulo", seed=1234)
    rf2 = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=nfolds, fold_assignment="Modulo", seed=1234)
    tests.check_models(rf1, rf2, True)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3,10)
    rf1 = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=nfolds, fold_assignment="Random")
    rf2 = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=nfolds, fold_assignment="Random")
    try:
        tests.check_models(rf1, rf2, True)
        assert False, "Expected models to be different over repeated Random runs"
    except AssertionError:
        assert True

    # 3. folds_column
    num_folds = random.randint(2,5)
    fold_assignments = h2o.H2OFrame(python_obj=[[random.randint(0,num_folds-1)] for f in range(cars.nrow)])
    fold_assignments.setNames(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    rf = h2o.random_forest(y=cars[response_col], x=cars[predictors], training_frame=cars,
                           fold_column="fold_assignments", keep_cross_validation_predictions=True)
    num_cv_models = len(rf._model_json['output']['cross_validation_models'])
    assert num_cv_models==num_folds, "Expected {0} cross-validation models, but got " \
                                                    "{1}".format(num_folds, num_cv_models)
    cv_model1 = h2o.get_model(rf._model_json['output']['cross_validation_models'][0]['name'])
    cv_model2 = h2o.get_model(rf._model_json['output']['cross_validation_models'][1]['name'])
    assert isinstance(cv_model1, type(rf)), "Expected cross-validation model to be the same model type as the " \
                                             "constructed model, but got {0} and {1}".format(type(cv_model1),type(rf))
    assert isinstance(cv_model2, type(rf)), "Expected cross-validation model to be the same model type as the " \
                                             "constructed model, but got {0} and {1}".format(type(cv_model2),type(rf))

    # 4. keep_cross_validation_predictions
    cv_predictions = rf1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions)

    cv_predictions = rf._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions)==num_folds, "Expected the same number of cross-validation predictions " \
                                                          "as folds, but got {0}".format(len(cv_predictions))


    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    rf = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow, fold_assignment="Modulo")

    # 2. nfolds = 0
    rf1 = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=0, seed=1234)
    # check that this is equivalent to no nfolds
    rf2 = h2o.random_forest(y=cars[response_col], x=cars[predictors], seed=1234)
    tests.check_models(rf1, rf2)

    # 3. cross-validation and regular validation attempted
    rf = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=random.randint(3,10),
                           validation_y=cars[response_col], validation_x=cars[predictors])


    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        rf = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=random.sample([-1,1], 1)[0])
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"
    except EnvironmentError:
        assert True

    # 2. more folds than observations
    try:
        rf = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow+1, fold_assignment="Modulo")
        assert False, "Expected model-build to fail when nfolds > nobs"
    except EnvironmentError:
        assert True

    # 3. fold_column and nfolds both specified
    try:
        rf = h2o.random_forest(y=cars[response_col], x=cars[predictors], nfolds=3, fold_column="fold_assignments",
                               training_frame=cars)
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
    except EnvironmentError:
        assert True
Ejemplo n.º 2
0
def cv_carsRF():

    # read in the dataset and construct training set (and validation set)
    cars = h2o.import_file(path=tests.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type model-building exercise (multinomial classification or regression). 0:regression, 1:binomial,
    # 2:multinomial
    problem = random.sample(range(3), 1)[0]

    # pick the predictors and the correct response column
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"

    print "Response column: {0}".format(response_col)

    ## cross-validation
    # 1. check that cv metrics are the same over repeated seeded "Modulo" runs
    nfolds = random.randint(3, 10)
    rf1 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            nfolds=nfolds,
                            fold_assignment="Modulo",
                            seed=1234)
    rf2 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            nfolds=nfolds,
                            fold_assignment="Modulo",
                            seed=1234)
    tests.check_models(rf1, rf2, True)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    rf1 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            nfolds=nfolds,
                            fold_assignment="Random")
    rf2 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            nfolds=nfolds,
                            fold_assignment="Random")
    try:
        tests.check_models(rf1, rf2, True)
        assert False, "Expected models to be different over repeated Random runs"
    except AssertionError:
        assert True

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_assignments = h2o.H2OFrame(
        python_obj=[[random.randint(0, num_folds - 1)]
                    for f in range(cars.nrow)])
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    rf = h2o.random_forest(y=cars[response_col],
                           x=cars[predictors],
                           training_frame=cars,
                           fold_column="fold_assignments",
                           keep_cross_validation_predictions=True)
    num_cv_models = len(rf._model_json['output']['cross_validation_models'])
    assert num_cv_models==num_folds, "Expected {0} cross-validation models, but got " \
                                                    "{1}".format(num_folds, num_cv_models)
    cv_model1 = h2o.get_model(
        rf._model_json['output']['cross_validation_models'][0]['name'])
    cv_model2 = h2o.get_model(
        rf._model_json['output']['cross_validation_models'][1]['name'])
    assert isinstance(cv_model1, type(rf)), "Expected cross-validation model to be the same model type as the " \
                                             "constructed model, but got {0} and {1}".format(type(cv_model1),type(rf))
    assert isinstance(cv_model2, type(rf)), "Expected cross-validation model to be the same model type as the " \
                                             "constructed model, but got {0} and {1}".format(type(cv_model2),type(rf))

    # 4. keep_cross_validation_predictions
    cv_predictions = rf1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(
        cv_predictions)

    cv_predictions = rf._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions)==num_folds, "Expected the same number of cross-validation predictions " \
                                                          "as folds, but got {0}".format(len(cv_predictions))

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    rf = h2o.random_forest(y=cars[response_col],
                           x=cars[predictors],
                           nfolds=cars.nrow,
                           fold_assignment="Modulo")

    # 2. nfolds = 0
    rf1 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            nfolds=0,
                            seed=1234)
    # check that this is equivalent to no nfolds
    rf2 = h2o.random_forest(y=cars[response_col],
                            x=cars[predictors],
                            seed=1234)
    tests.check_models(rf1, rf2)

    # 3. cross-validation and regular validation attempted
    rf = h2o.random_forest(y=cars[response_col],
                           x=cars[predictors],
                           nfolds=random.randint(3, 10),
                           validation_y=cars[response_col],
                           validation_x=cars[predictors])

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        rf = h2o.random_forest(y=cars[response_col],
                               x=cars[predictors],
                               nfolds=random.sample([-1, 1], 1)[0])
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"
    except EnvironmentError:
        assert True

    # 2. more folds than observations
    try:
        rf = h2o.random_forest(y=cars[response_col],
                               x=cars[predictors],
                               nfolds=cars.nrow + 1,
                               fold_assignment="Modulo")
        assert False, "Expected model-build to fail when nfolds > nobs"
    except EnvironmentError:
        assert True

    # 3. fold_column and nfolds both specified
    try:
        rf = h2o.random_forest(y=cars[response_col],
                               x=cars[predictors],
                               nfolds=3,
                               fold_column="fold_assignments",
                               training_frame=cars)
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
    except EnvironmentError:
        assert True
Ejemplo n.º 3
0
def cv_carsGLM():

    # read in the dataset and construct training set (and validation set)
    cars =  h2o.import_file(path=h2o.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type model-building exercise (multinomial classification or regression). 0:regression, 1:binomial,
    # 2:poisson
    problem = random.sample(range(3),1)[0]
    # pick the predictors and response column, along with the correct family
    predictors = ["displacement","power","weight","acceleration","year"]
    if problem == 1   :
        response_col = "economy_20mpg"
        family = "binomial"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2 :
        family = "poisson"
        response_col = "cylinders"
    else              :
        family = "gaussian"
        response_col = "economy"

    print "Distribution: {0}".format(family)
    print "Response column: {0}".format(response_col)

    ## cross-validation
    # 1. check that cv metrics are the same over repeated "Modulo" runs
    nfolds = random.randint(3,10)
    glm1 = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, family=family, fold_assignment="Modulo")
    glm2 = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, family=family, fold_assignment="Modulo")
    tests.check_models(glm1, glm2, True)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3,10)
    glm1 = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, family=family, fold_assignment="Random")
    glm2 = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=nfolds, family=family, fold_assignment="Random")
    try:
        tests.check_models(glm1, glm2, True)
        assert False, "Expected models to be different over repeated Random runs"
    except AssertionError:
        assert True

    # 3. folds_column
    num_folds = random.randint(2,5)
    fold_assignments = h2o.H2OFrame(python_obj=[[random.randint(0,num_folds-1)] for f in range(cars.nrow)])
    fold_assignments.setNames(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    glm = h2o.glm(y=cars[response_col], x=cars[predictors], training_frame=cars, family=family,
                  fold_column="fold_assignments", keep_cross_validation_predictions=True)
    num_cv_models = len(glm._model_json['output']['cross_validation_models'])
    assert num_cv_models==num_folds, "Expected {0} cross-validation models, but got " \
                                     "{1}".format(num_folds, num_cv_models)
    cv_model1 = h2o.get_model(glm._model_json['output']['cross_validation_models'][0]['name'])
    cv_model2 = h2o.get_model(glm._model_json['output']['cross_validation_models'][1]['name'])
    assert isinstance(cv_model1, type(glm)), "Expected cross-validation model to be the same model type as the " \
                                             "constructed model, but got {0} and {1}".format(type(cv_model1),type(glm))
    assert isinstance(cv_model2, type(glm)), "Expected cross-validation model to be the same model type as the " \
                                             "constructed model, but got {0} and {1}".format(type(cv_model2),type(glm))

    # 4. keep_cross_validation_predictions
    cv_predictions = glm1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(cv_predictions)

    cv_predictions = glm._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions)==num_folds, "Expected the same number of cross-validation predictions " \
                                           "as folds, but got {0}".format(len(cv_predictions))

    # # 5. manually construct models
    # fold1 = cars[cars["fold_assignments"]==0]
    # fold2 = cars[cars["fold_assignments"]==1]
    # manual_model1 = h2o.glm(y=fold2[response_col],
    #                         x=fold2[predictors],
    #                         validation_y=fold1[response_col],
    #                         validation_x=fold1[predictors],
    #                         family=family)
    # manual_model2 = h2o.glm(y=fold1[response_col],
    #                         x=fold1[predictors],
    #                         validation_y=fold2[response_col],
    #                         validation_x=fold2[predictors],
    #                         family=family)


    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    # TODO: PUBDEV-1776
    #glm = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow, family=family,
    #              fold_assignment="Modulo")

    # 2. nfolds = 0
    glm1 = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=0, family=family)
    # check that this is equivalent to no nfolds
    glm2 = h2o.glm(y=cars[response_col], x=cars[predictors], family=family)
    tests.check_models(glm1, glm2)

    # 3. cross-validation and regular validation attempted
    glm = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=random.randint(3,10), validation_y=cars[response_col],
                  validation_x=cars[predictors], family=family)


    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        glm = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=random.sample([-1,1], 1)[0],
                      family=family)
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"
    except EnvironmentError:
        assert True

    # 2. more folds than observations
    try:
        glm = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow+1, family=family,
                      fold_assignment="Modulo")
        assert False, "Expected model-build to fail when nfolds > nobs"
    except EnvironmentError:
        assert True

    # 3. fold_column and nfolds both specified
    try:
        glm = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=3, fold_column="fold_assignments",
                      family=family, training_frame=cars)
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
    except EnvironmentError:
        assert True
Ejemplo n.º 4
0
def cv_carsGLM():

    # read in the dataset and construct training set (and validation set)
    cars = h2o.import_file(path=tests.locate("smalldata/junit/cars_20mpg.csv"))

    # choose the type model-building exercise (multinomial classification or regression). 0:regression, 1:binomial,
    # 2:poisson
    problem = random.sample(range(3), 1)[0]
    # pick the predictors and response column, along with the correct family
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    if problem == 1:
        response_col = "economy_20mpg"
        family = "binomial"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        family = "poisson"
        response_col = "cylinders"
    else:
        family = "gaussian"
        response_col = "economy"

    print "Distribution: {0}".format(family)
    print "Response column: {0}".format(response_col)

    ## cross-validation
    # 1. check that cv metrics are the same over repeated "Modulo" runs
    nfolds = random.randint(3, 10)
    glm1 = h2o.glm(y=cars[response_col],
                   x=cars[predictors],
                   nfolds=nfolds,
                   family=family,
                   fold_assignment="Modulo")
    glm2 = h2o.glm(y=cars[response_col],
                   x=cars[predictors],
                   nfolds=nfolds,
                   family=family,
                   fold_assignment="Modulo")
    tests.check_models(glm1, glm2, True)

    # 2. check that cv metrics are different over repeated "Random" runs
    nfolds = random.randint(3, 10)
    glm1 = h2o.glm(y=cars[response_col],
                   x=cars[predictors],
                   nfolds=nfolds,
                   family=family,
                   fold_assignment="Random")
    glm2 = h2o.glm(y=cars[response_col],
                   x=cars[predictors],
                   nfolds=nfolds,
                   family=family,
                   fold_assignment="Random")
    try:
        tests.check_models(glm1, glm2, True)
        assert False, "Expected models to be different over repeated Random runs"
    except AssertionError:
        assert True

    # 3. folds_column
    num_folds = random.randint(2, 5)
    fold_assignments = h2o.H2OFrame(
        python_obj=[[random.randint(0, num_folds - 1)]
                    for f in range(cars.nrow)])
    fold_assignments.set_names(["fold_assignments"])
    cars = cars.cbind(fold_assignments)
    glm = h2o.glm(y=cars[response_col],
                  x=cars[predictors],
                  training_frame=cars,
                  family=family,
                  fold_column="fold_assignments",
                  keep_cross_validation_predictions=True)
    num_cv_models = len(glm._model_json['output']['cross_validation_models'])
    assert num_cv_models==num_folds, "Expected {0} cross-validation models, but got " \
                                     "{1}".format(num_folds, num_cv_models)
    cv_model1 = h2o.get_model(
        glm._model_json['output']['cross_validation_models'][0]['name'])
    cv_model2 = h2o.get_model(
        glm._model_json['output']['cross_validation_models'][1]['name'])
    assert isinstance(cv_model1, type(glm)), "Expected cross-validation model to be the same model type as the " \
                                             "constructed model, but got {0} and {1}".format(type(cv_model1),type(glm))
    assert isinstance(cv_model2, type(glm)), "Expected cross-validation model to be the same model type as the " \
                                             "constructed model, but got {0} and {1}".format(type(cv_model2),type(glm))

    # 4. keep_cross_validation_predictions
    cv_predictions = glm1._model_json['output']['cross_validation_predictions']
    assert cv_predictions is None, "Expected cross-validation predictions to be None, but got {0}".format(
        cv_predictions)

    cv_predictions = glm._model_json['output']['cross_validation_predictions']
    assert len(cv_predictions)==num_folds, "Expected the same number of cross-validation predictions " \
                                           "as folds, but got {0}".format(len(cv_predictions))

    # # 5. manually construct models
    # fold1 = cars[cars["fold_assignments"]==0]
    # fold2 = cars[cars["fold_assignments"]==1]
    # manual_model1 = h2o.glm(y=fold2[response_col],
    #                         x=fold2[predictors],
    #                         validation_y=fold1[response_col],
    #                         validation_x=fold1[predictors],
    #                         family=family)
    # manual_model2 = h2o.glm(y=fold1[response_col],
    #                         x=fold1[predictors],
    #                         validation_y=fold2[response_col],
    #                         validation_x=fold2[predictors],
    #                         family=family)

    ## boundary cases
    # 1. nfolds = number of observations (leave-one-out cross-validation)
    # TODO: PUBDEV-1776
    #glm = h2o.glm(y=cars[response_col], x=cars[predictors], nfolds=cars.nrow, family=family,
    #              fold_assignment="Modulo")

    # 2. nfolds = 0
    glm1 = h2o.glm(y=cars[response_col],
                   x=cars[predictors],
                   nfolds=0,
                   family=family)
    # check that this is equivalent to no nfolds
    glm2 = h2o.glm(y=cars[response_col], x=cars[predictors], family=family)
    tests.check_models(glm1, glm2)

    # 3. cross-validation and regular validation attempted
    glm = h2o.glm(y=cars[response_col],
                  x=cars[predictors],
                  nfolds=random.randint(3, 10),
                  validation_y=cars[response_col],
                  validation_x=cars[predictors],
                  family=family)

    ## error cases
    # 1. nfolds == 1 or < 0
    try:
        glm = h2o.glm(y=cars[response_col],
                      x=cars[predictors],
                      nfolds=random.sample([-1, 1], 1)[0],
                      family=family)
        assert False, "Expected model-build to fail when nfolds is 1 or < 0"
    except EnvironmentError:
        assert True

    # 2. more folds than observations
    try:
        glm = h2o.glm(y=cars[response_col],
                      x=cars[predictors],
                      nfolds=cars.nrow + 1,
                      family=family,
                      fold_assignment="Modulo")
        assert False, "Expected model-build to fail when nfolds > nobs"
    except EnvironmentError:
        assert True

    # 3. fold_column and nfolds both specified
    try:
        glm = h2o.glm(y=cars[response_col],
                      x=cars[predictors],
                      nfolds=3,
                      fold_column="fold_assignments",
                      family=family,
                      training_frame=cars)
        assert False, "Expected model-build to fail when fold_column and nfolds both specified"
    except EnvironmentError:
        assert True