def binomial_pd_multi_plot():
    """Check saving of ``pd_multi_plot`` results for an AutoML run.

    Uploads ``smalldata/logreg/prostate.csv``, converts ``CAPSULE`` to a
    factor, trains an ``H2OAutoML`` (``seed=1234``, ``max_models=5``) and,
    for one column of each distinct type, passes the result of
    ``pd_multi_plot`` obtained with and without ``save_plot_path`` to
    ``test_plot_result_saving`` (a helper defined outside this block).
    """
    train = h2o.upload_file(
        pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "CAPSULE"
    train[y] = train[y].asfactor()

    # Keep the first column of each type and never the response.  The
    # response check is done up front: it must also apply while nothing has
    # been selected yet, otherwise a response in first position slips in.
    cols_to_test = []
    seen_types = []
    for col, typ in train.types.items():
        if col == y or typ in seen_types:
            continue
        seen_types.append(typ)
        cols_to_test.append(col)

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    with TemporaryDirectory() as tmpdir:
        path1 = "{}/plot1.png".format(tmpdir)
        path2 = "{}/plot2.png".format(tmpdir)
        for col in cols_to_test:
            test_plot_result_saving(
                aml.pd_multi_plot(train, col), path2,
                aml.pd_multi_plot(train, col, save_plot_path=path1), path1)
            # Two plots are requested per column; close every open figure
            # here instead of only the current one after the loop.
            matplotlib.pyplot.close("all")
def std_coef__varimp():
    """Check saving of ``std_coef_plot`` and ``varimp_plot`` results for a GLM.

    Imports ``smalldata/junit/cars_20mpg.csv``, converts the response to a
    factor, splits the rows roughly 80/20 with ``runif``, trains an
    ``H2OGeneralizedLinearEstimator`` and passes the plot results obtained
    with and without ``save_plot_path`` to ``test_plot_result_saving`` (a
    helper defined outside this block).
    """
    # import data set
    cars = h2o.import_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # set list of features and target
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy_20mpg"

    # Convert the target to a factor BEFORE slicing, so that the train and
    # validation frames are taken from the already-converted frame.
    cars[response_col] = cars[response_col].asfactor()

    # Constructing validation and train sets by sampling (20/80)
    s = cars[0].runif()
    cars_train = cars[s <= 0.8]
    cars_valid = cars[s > 0.8]

    # Build and train a GLM model
    cars_glm = H2OGeneralizedLinearEstimator()
    cars_glm.train(x=predictors,
                   y=response_col,
                   training_frame=cars_train,
                   validation_frame=cars_valid)

    # test saving:
    with TemporaryDirectory() as tmpdir:
        path = "{}/plot1.png".format(tmpdir)
        path2 = "{}/plot2.png".format(tmpdir)

        test_plot_result_saving(
            cars_glm.std_coef_plot(server=True), path2,
            cars_glm.std_coef_plot(server=True, save_plot_path=path), path)

        test_plot_result_saving(
            cars_glm.varimp_plot(server=True), path2,
            cars_glm.varimp_plot(server=True, save_plot_path=path), path)
def partial_plots():
    """Check saving of a GBM ``partial_plot`` result on the prostate data.

    Trains an ``H2OGradientBoostingEstimator`` on ``AGE`` and ``RACE`` to
    predict ``CAPSULE`` and passes the ``partial_plot`` result for ``AGE``
    (``row_index=1``), obtained with and without ``save_plot_path``, to
    ``test_plot_result_saving`` (a helper defined outside this block).
    """
    y = 'CAPSULE'
    x = ['AGE', 'RACE']

    data = h2o.import_file(
        pyunit_utils.locate('smalldata/prostate/prostate.csv'))
    # Both the response and RACE are treated as categorical.
    for name in (y, 'RACE'):
        data[name] = data[name].asfactor()

    gbm_model = H2OGradientBoostingEstimator(ntrees=50, learn_rate=0.05)
    gbm_model.train(x=x, y=y, training_frame=data)

    def _age_partial_plot(**extra):
        # Shared arguments of the two calls; only save_plot_path differs.
        return gbm_model.partial_plot(data=data,
                                      cols=['AGE'],
                                      server=True,
                                      plot=True,
                                      row_index=1,
                                      **extra)

    # test saving:
    with TemporaryDirectory() as tmpdir:
        saved_path = "{}/plot1.png".format(tmpdir)
        other_path = "{}/plot2.png".format(tmpdir)
        unsaved_result = _age_partial_plot()
        saved_result = _age_partial_plot(save_plot_path=saved_path)
        test_plot_result_saving(unsaved_result, other_path,
                                saved_result, saved_path)
# Example #4
# 0
def varimp_plot_test():
    """Plot variable importances for DRF, GBM, Deep Learning and GLM models.

    Imports ``smalldata/junit/cars_20mpg.csv``, converts the response to a
    factor, splits the rows roughly 80/20 with ``runif`` and trains one model
    of each kind.  For every model ``varimp_plot`` is called with default
    arguments and with ``num_of_features=2``.  For the DRF model the plot
    results obtained with and without ``save_plot_path`` are additionally
    passed to ``test_plot_result_saving`` (a helper defined outside this
    block).
    """
    # import data set
    cars = h2o.import_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))

    # set list of features and target
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    response_col = "economy_20mpg"

    # Convert the target to a factor BEFORE slicing, so that the train and
    # validation frames are taken from the already-converted frame.
    cars[response_col] = cars[response_col].asfactor()

    # Constructing validation and train sets by sampling (20/80)
    s = cars[0].runif()
    cars_train = cars[s <= 0.8]
    cars_valid = cars[s > 0.8]

    # Build and train a DRF model
    # NOTE(review): the original carried a "to do: comment this out" remark
    # here; its intent is unclear.
    cars_rf = H2ORandomForestEstimator()
    cars_rf.train(x=predictors, y=response_col,
                  training_frame=cars_train, validation_frame=cars_valid)

    # Plot DRF Variable Importances, check that num_of_features accepts input
    cars_rf.varimp_plot(server=True)
    cars_rf.varimp_plot(num_of_features=2, server=True)

    # test saving: the context manager removes the scratch directory again,
    # which a bare mkdtemp() would have left behind.
    with tempfile.TemporaryDirectory(prefix="h2o-func") as tmpdir:
        path = "{}/plot1.png".format(tmpdir)
        test_plot_result_saving(
            cars_rf.varimp_plot(server=True), "{}/plot2.png".format(tmpdir),
            cars_rf.varimp_plot(server=True, save_plot_path=path), path)

    # Build and train a GBM model
    cars_gbm = H2OGradientBoostingEstimator()
    cars_gbm.train(x=predictors, y=response_col,
                   training_frame=cars_train, validation_frame=cars_valid)

    # Plot GBM Variable Importances
    cars_gbm.varimp_plot(server=True)
    cars_gbm.varimp_plot(num_of_features=2, server=True)

    # Build and train a Deep Learning model
    cars_dl = H2ODeepLearningEstimator(variable_importances=True)
    cars_dl.train(x=predictors, y=response_col,
                  training_frame=cars_train, validation_frame=cars_valid)

    # Plot Deep Learning Variable Importances
    cars_dl.varimp_plot(server=True)
    cars_dl.varimp_plot(num_of_features=2, server=True)

    # check that varimp_plot() uses std_coef_plot() for a glm
    cars_glm = H2OGeneralizedLinearEstimator()
    cars_glm.train(x=predictors, y=response_col,
                   training_frame=cars_train, validation_frame=cars_valid)
    cars_glm.varimp_plot(server=True)
    cars_glm.varimp_plot(num_of_features=2, server=True)
def test_hist():
    """Check ``hist`` on the first iris column: result saving and row counts.

    The histogram result obtained with and without ``save_plot_path`` is
    passed to ``test_plot_result_saving`` (a helper defined outside this
    block).  Afterwards the number of rows of the returned frame is asserted
    for an integer ``breaks`` and for an explicit list of break points.
    """
    df = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris.csv"))

    with TemporaryDirectory() as tmpdir:
        saved_path = "{}/plot1.png".format(tmpdir)
        other_path = "{}/plot2.png".format(tmpdir)
        unsaved_result = df[0].hist(breaks=5, plot=True)
        saved_result = df[0].hist(breaks=5, plot=True,
                                  save_plot_path=saved_path)
        test_plot_result_saving(unsaved_result, other_path,
                                saved_result, saved_path)

    # Five requested breaks -> five rows.
    by_count = df[0].hist(breaks=5, plot=True)
    assert by_count.nrow == 5

    # Four explicit break points -> four rows.
    by_points = df[0].hist(breaks=[0, 0.5, 2, 3], plot=True)
    assert by_points.nrow == 4
def screeplot():
    """Check ``screeplot`` of a PCA model, including saving the plot.

    Trains an ``H2OPCA`` (``k=4``, ``transform="STANDARDIZE"``) on the first
    eight columns of ``smalldata/pca_test/AustraliaCoast.csv``, draws a
    barplot and a lines screeplot, then passes the lines result and a barplot
    result created with ``save_plot_path`` to ``test_plot_result_saving`` (a
    helper defined outside this block).
    """
    plot_options = {'server': True}

    australia = h2o.upload_file(
        pyunit_utils.locate("smalldata/pca_test/AustraliaCoast.csv"))
    australia_pca = H2OPCA(k=4, transform="STANDARDIZE")
    australia_pca.train(x=list(range(8)), training_frame=australia)

    australia_pca.screeplot(type="barplot", **plot_options)
    lines_result = australia_pca.screeplot(type="lines", **plot_options)

    with TemporaryDirectory() as tmpdir:
        saved_path = "{}/plot1.png".format(tmpdir)
        other_path = "{}/plot2.png".format(tmpdir)
        saved_barplot = australia_pca.screeplot(type="barplot",
                                                save_plot_path=saved_path,
                                                **plot_options)
        test_plot_result_saving(lines_result, other_path,
                                saved_barplot, saved_path)
def roc_pr_curve():
    """Check saving of ROC plots from binomial GBM performance objects.

    Trains a bernoulli ``H2OGradientBoostingEstimator`` on a roughly 80/20
    ``runif`` split of ``smalldata/airlines/AirlinesTrain.csv.zip`` and plots
    the ROC curve for the validation metrics and for metrics computed on
    ``AirlinesTest.csv.zip``.  Each time the plot result obtained with and
    without a save path is passed to ``test_plot_result_saving`` (a helper
    defined outside this block).
    """
    air = h2o.import_file(
        pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))

    # Constructing test and train sets by sampling (20/80)
    draw = air[0].runif()
    air_train = air[draw <= 0.8]
    air_valid = air[draw > 0.8]

    features = [
        "Origin", "Dest", "Distance", "UniqueCarrier", "fMonth", "fDayofMonth",
        "fDayOfWeek"
    ]
    target = "IsDepDelayed"

    air_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                           ntrees=100,
                                           max_depth=3,
                                           learn_rate=0.01)
    air_gbm.train(x=features,
                  y=target,
                  training_frame=air_train,
                  validation_frame=air_valid)

    with TemporaryDirectory() as tmpdir:
        path1 = "{}/plot1.png".format(tmpdir)
        path2 = "{}/plot2.png".format(tmpdir)

        # Plot ROC for valid set
        perf_valid = air_gbm.model_performance(valid=True)
        valid_unsaved = perf_valid.plot(type="roc", server=False)
        # NOTE(review): this call passes save_to_file while the test-set call
        # below passes save_plot_path -- confirm both keywords are meant to
        # be exercised.
        valid_saved = perf_valid.plot(type="roc", server=False,
                                      save_to_file=path1)
        test_plot_result_saving(valid_unsaved, path2, valid_saved, path1)

        # Plot ROC for test set
        air_test = h2o.import_file(
            pyunit_utils.locate("smalldata/airlines/AirlinesTest.csv.zip"))
        perf_test = air_gbm.model_performance(air_test)
        test_unsaved = perf_test.plot(type="roc", server=False)
        test_saved = perf_test.plot(type="roc", server=False,
                                    save_plot_path=path1)
        test_plot_result_saving(test_unsaved, path2, test_saved, path1)
def partial_plots_multinomial():
    """Check saving of ``plot`` and ``partial_plot`` for a multinomial GLM.

    Builds a multinomial ``H2OGeneralizedLinearEstimator`` on an 80/20 split
    (``seed=1234``) of ``smalldata/iris/iris_wheader.csv`` and passes the
    results of ``model.plot`` and of ``model.partial_plot`` (for the
    ``random_cat`` column and two target classes), obtained with and without
    a save path, to ``test_plot_result_saving`` (a helper defined outside
    this block).
    """
    iris = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    response = 'class'
    iris['class'] = iris['class'].asfactor()
    # Extra categorical column that is a copy of the response.
    iris['random_cat'] = iris['class']

    # Every column name except the last one.
    predictors = iris.col_names[:-1]

    train, valid = iris.split_frame(ratios=[.8], seed=1234)

    model = H2OGeneralizedLinearEstimator(family='multinomial')
    model.train(x=predictors,
                y=response,
                training_frame=train,
                validation_frame=valid)

    targets = ["Iris-setosa", "Iris-versicolor"]
    cols = ["random_cat"]

    def _pdp(**extra):
        # Shared arguments of the two partial_plot calls below.
        return model.partial_plot(data=iris,
                                  cols=cols,
                                  targets=targets,
                                  plot_stddev=True,
                                  plot=True,
                                  server=True,
                                  **extra)

    with TemporaryDirectory() as tmpdir:
        path1 = "{}/plot1.png".format(tmpdir)
        path2 = "{}/plot2.png".format(tmpdir)

        plain_plot = model.plot()
        saved_plot = model.plot(save_plot_path=path1)
        test_plot_result_saving(plain_plot, path2, saved_plot, path1)

        plain_pdp = _pdp()
        # NOTE(review): save_to_file is used here while model.plot above
        # uses save_plot_path -- confirm the older keyword is intended.
        saved_pdp = _pdp(save_to_file=path1)
        test_plot_result_saving(plain_pdp, path2, saved_pdp, path1)
def test_varimp_heatmap_model_correlation_heatmap():
    """Check saving of ``varimp_heatmap`` and ``model_correlation_heatmap``.

    Trains an ``H2OAutoML`` (``seed=1234``, ``max_models=5``) on
    ``smalldata/logreg/prostate.csv`` with ``CAPSULE`` as a factor response,
    collects every leaderboard model plus one explicitly named GBM, and passes
    the heatmap results obtained with and without ``save_plot_path`` to
    ``test_plot_result_saving`` (a helper defined outside this block).
    ``h2o.varimp_heatmap`` is called once more after the temporary directory
    has been left.
    """
    train = h2o.upload_file(
        pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "CAPSULE"
    train[y] = train[y].asfactor()

    aml = H2OAutoML(seed=1234, max_models=5)
    aml.train(y=y, training_frame=train)

    # One model object per row of the leaderboard's model_id column.
    models = [
        h2o.get_model(m[0])
        for m in aml.leaderboard["model_id"].as_data_frame(use_pandas=False,
                                                           header=False)
    ]

    # Test named models as well
    gbm = H2OGradientBoostingEstimator(model_id="my_awesome_model")
    gbm.train(y=y, training_frame=train)
    models += [gbm]

    with TemporaryDirectory() as tmpdir:
        path1 = "{}/plot1.png".format(tmpdir)
        path2 = "{}/plot2.png".format(tmpdir)
        test_plot_result_saving(
            h2o.varimp_heatmap(models), path2,
            h2o.varimp_heatmap(models, save_plot_path=path1), path1)
        test_plot_result_saving(
            h2o.model_correlation_heatmap(models, train), path2,
            h2o.model_correlation_heatmap(models, train, save_plot_path=path1),
            path1)
    h2o.varimp_heatmap(models)
def binomial_plot_test():
    """Check saving of ``plot`` and ``permutation_importance_plot`` for a GLM.

    Trains a binomial ``glm`` on ``smalldata/logreg/benign.csv`` with column
    index 3 as the response and indices 0-10 (except 3) as predictors, then
    passes the plot results obtained with and without ``save_plot_path`` to
    ``test_plot_result_saving`` (a helper defined outside this block).
    """
    benign = h2o.import_file(
        pyunit_utils.locate("smalldata/logreg/benign.csv"))

    response = 3
    predictors = [idx for idx in range(11) if idx != response]

    model = glm(family="binomial")
    model.train(x=predictors, y=response, training_frame=benign)

    # test saving:
    with TemporaryDirectory() as tmpdir:
        saved_path = "{}/plot1.png".format(tmpdir)
        other_path = "{}/plot2.png".format(tmpdir)

        objective_plain = model.plot(timestep="AUTO",
                                     metric="objective",
                                     server=True)
        objective_saved = model.plot(timestep="AUTO",
                                     metric="objective",
                                     server=True,
                                     save_plot_path=saved_path)
        test_plot_result_saving(objective_plain, other_path,
                                objective_saved, saved_path)

        importance_plain = model.permutation_importance_plot(benign)
        importance_saved = model.permutation_importance_plot(
            benign, save_plot_path=saved_path)
        test_plot_result_saving(importance_plain, other_path,
                                importance_saved, saved_path)
def regression__plot_learning_curve_plot():
    """Check saving of ``pd_plot`` and ``learning_curve_plot`` for a GBM.

    Trains an ``H2OGradientBoostingEstimator`` (``seed=1234``) on
    ``smalldata/titanic/titanic_expanded.csv`` with ``fare`` as the response.
    The plot results obtained with and without ``save_plot_path`` are passed
    to ``test_plot_result_saving`` (a helper defined outside this block) for:
    ``pd_plot`` on the last column of the frame, ``pd_plot`` on one column of
    each distinct type, and ``learning_curve_plot`` for three metrics.
    """
    train = h2o.upload_file(
        pyunit_utils.locate("smalldata/titanic/titanic_expanded.csv"))
    y = "fare"

    # Keep the first column of each type and never the response.  The
    # response check is done up front: it must also apply while nothing has
    # been selected yet, otherwise a response in first position slips in.
    cols_to_test = []
    seen_types = []
    for col, typ in train.types.items():
        if col == y or typ in seen_types:
            continue
        seen_types.append(typ)
        cols_to_test.append(col)
    # The first saving check below has always used whatever the loop variable
    # was left bound to, i.e. the last column iterated.  Make that explicit.
    # NOTE(review): confirm the last column is the intended one; unlike the
    # per-column loop further down, this call has no ValueError guard.
    last_col = col

    gbm = H2OGradientBoostingEstimator(seed=1234, model_id="my_awesome_model")
    gbm.train(y=y, training_frame=train)

    with TemporaryDirectory() as tmpdir:
        path1 = "{}/plot1.png".format(tmpdir)
        path2 = "{}/plot2.png".format(tmpdir)

        # test saving:
        test_plot_result_saving(
            gbm.pd_plot(train, last_col), path2,
            gbm.pd_plot(train, last_col, save_plot_path=path1), path1)
        # Two plots were requested; close all of them, not just the current.
        matplotlib.pyplot.close("all")

        # test pd_plot
        for col in cols_to_test:
            try:
                test_plot_result_saving(
                    gbm.pd_plot(train, col), path2,
                    gbm.pd_plot(train, col, save_plot_path=path1), path1)
            except ValueError:
                # A ValueError is only accepted for the "name" column.
                assert col == "name", "'name' is a string column which is not supported."
            matplotlib.pyplot.close("all")

        # The unsaved call gets the lower-case metric, the saved call the
        # upper-case spelling of the same metric.
        for metric in ["auto", "deviance", "rmse"]:
            test_plot_result_saving(
                gbm.learning_curve_plot(metric), path2,
                gbm.learning_curve_plot(metric=metric.upper(),
                                        save_plot_path=path1), path1)
        matplotlib.pyplot.close("all")