Example #1
import os
import sys

sys.path.insert(0, sys.argv[1])
os.environ['PYSPARK_PYTHON'] = sys.executable
import unittest
from pysparkling.context import H2OContext
from pysparkling.conf import H2OConf
from pyspark.sql import SparkSession

import unit_test_utils
import generic_test_utils


class H2OConfTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls._cloud_name = generic_test_utils.unique_cloud_name("h2o_conf_test")
        cls._conf = unit_test_utils.get_default_spark_conf(cls._spark_options_from_params). \
            set("spark.ext.h2o.cloud.name", cls._cloud_name)
        cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()
        cls._hc = H2OContext.getOrCreate(cls._spark, H2OConf(cls._spark).set_num_of_external_h2o_nodes(1))

    # test passing h2o_conf to H2OContext
    def test_h2o_conf(self):
        self.assertEqual(self._hc.get_conf().cloud_name(), self._cloud_name,
                         "Configuration property cloud_name should match")


if __name__ == '__main__':
    generic_test_utils.run_tests([H2OConfTest], file_name="py_unit_tests_conf_report")
        df = self._spark.createDataFrame(data)
        hf = hc.as_h2o_frame(df)
        # Modify H2O frame - this should invalidate internal cache
        hf['c3'] = 3
        # Now try to convert modified H2O frame back to Spark data frame
        dfe = hc.as_spark_frame(hf)
        self.assertEqual(dfe.count(), len(data),
                         "Number of rows should match")
        self.assertEqual(len(dfe.columns), 3,
                         "Number of columns should match")
        self.assertEqual(
            dfe.collect(),
            [Row(c1=1, c2='first', c3=3),
             Row(c1=2, c2='second', c3=3)])

    def test_sparse_data_conversion(self):
        data = [(float(x), SparseVector(50000, {x: float(x)}))
                for x in range(1, 90)]
        df = self._spark.sparkContext.parallelize(data).toDF()

        t0 = time.time()
        self._hc.as_h2o_frame(df)
        t1 = time.time()
        total = t1 - t0

        assert total < 10  # The conversion should not take longer than 10 seconds


if __name__ == '__main__':
    generic_test_utils.run_tests([FrameTransformationsTest],
                                 file_name="py_unit_tests_conversions_report")
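
The frame-conversion fragment above is cut off before its imports and the test-class setup that creates self._spark and self._hc; a minimal sketch of the imports it appears to rely on (the pyspark.ml.linalg path for SparseVector is an assumption and may be pyspark.mllib.linalg on older Spark versions):

# Imports assumed by the conversion tests above (not shown in the original fragment).
import time

from pyspark.sql import Row
from pyspark.ml.linalg import SparseVector
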
import os
import sys
import unittest

sys.path.insert(0, sys.argv[1])
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
from integ_test_utils import *
from generic_test_utils import run_tests


class YarnIntegTestSuite(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        conf = get_default_spark_conf(cls._spark_options_from_params)
        conf["spark.master"] = "local[*]"
        conf["spark.submit.pyFiles"] = sys.argv[1]
        # Configure YARN environment
        conf["spark.yarn.max.executor.failures"] = "1"  # If an executor fails, fail the test
        conf["spark.executor.instances"] = "1"
        cls._conf = conf

    def test_xgboost_medium(self):
        return_code = launch(self._conf, "examples/scripts/tests/xgboost_test_medium.py")
        self.assertTrue(return_code == 0, "Process ended in a wrong way. It ended with return code " + str(return_code))

    def test_chicago_crime(self):
        return_code = launch(self._conf, "examples/scripts/ChicagoCrimeDemo.py")
        self.assertTrue(return_code == 0, "Process ended in a wrong way. It ended with return code " + str(return_code))


if __name__ == '__main__':
    run_tests([YarnIntegTestSuite], file_name="py_integ_yarn_tests_report")
class H2OMojoPredictionsTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls._cloud_name = generic_test_utils.unique_cloud_name("h2o_mojo_predictions_test")
        cls._spark = SparkSession.builder.config(conf=unit_test_utils.get_default_spark_conf()).getOrCreate()

    # test predictions on H2O Mojo
    def test_h2o_mojo_predictions(self):
        # Try loading the MOJO and predicting with it without starting an H2OContext
        mojo = H2OMOJOModel.create_from_mojo("../ml/src/test/resources/binom_model_prostate.mojo")
        prostate_frame = self._spark.read.csv("file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"), header=True)
        mojo.predict(prostate_frame).repartition(1).collect()

    def test_h2o_mojo_predictions_unseen_categoricals(self):
        mojo = H2OMOJOModel.create_from_mojo("../ml/src/test/resources/deep_learning_airlines_categoricals.zip")
        mojo.setConvertUnknownCategoricalLevelsToNa(True)
        d = [{'sepal_len': 5.1, 'sepal_wid': 3.5, 'petal_len': 1.4, 'petal_wid': 0.2, 'class': 'Missing_categorical'}]
        df = self._spark.createDataFrame(d)
        data = mojo.transform(df).collect()[0]
        assert data["class"] == "Missing_categorical"
        assert data["petal_len"] == 1.4
        assert data["petal_wid"] == 0.2
        assert data["sepal_len"] == 5.1
        assert data["sepal_wid"] == 3.5
        assert data["prediction_output"][0] == 5.240174068202646


if __name__ == '__main__':
    generic_test_utils.run_tests([H2OMojoPredictionsTest], file_name="py_unit_tests_mojo_predictions_report")
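
H2OMojoPredictionsTest is listed without its imports; a minimal sketch of what it appears to assume (the pysparkling.ml path for H2OMOJOModel and the project-local unit_test_utils / generic_test_utils helpers are assumptions based on the other examples on this page):

# Assumed imports for H2OMojoPredictionsTest (module paths not confirmed by the fragment).
import unittest

from pyspark.sql import SparkSession
from pysparkling.ml import H2OMOJOModel

import generic_test_utils
import unit_test_utils
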
Example #5
    @classmethod
    def setUpClass(cls):
        conf = get_default_spark_conf(cls._spark_options_from_params)
        conf["spark.master"] = "local[*]"
        conf["spark.submit.pyFiles"] = sys.argv[1]
        # Configure YARN environment
        conf["spark.yarn.max.executor.failures"] = "1"  # If an executor fails, fail the test
        conf["spark.executor.instances"] = "1"
        cls._conf = conf

    def test_xgboost_medium(self):
        return_code = launch(self._conf,
                             "examples/scripts/tests/xgboost_test_medium.py")
        self.assertTrue(
            return_code == 0,
            "Process ended in a wrong way. It ended with return code " +
            str(return_code))

    def test_chicago_crime(self):
        return_code = launch(self._conf,
                             "examples/scripts/ChicagoCrimeDemo.py")
        self.assertTrue(
            return_code == 0,
            "Process ended in a wrong way. It ended with return code " +
            str(return_code))


if __name__ == '__main__':
    run_tests([YarnIntegTestSuite], file_name="py_integ_yarn_tests_report")
    def test_s3n_import(self):
        fr = h2o.import_file("s3n://data.h2o.ai/h2o-open-tour/2016-nyc/weather.csv")
        assert fr.ncol == 27
        assert fr.nrow == 9768

    def test_s3a_import(self):
        fr = h2o.import_file("s3a://data.h2o.ai/h2o-open-tour/2016-nyc/weather.csv")
        assert fr.ncol == 27
        assert fr.nrow == 9768

    def s3_import_export(self, scheme):
        local_frame = h2o.import_file("/home/0xdiag/smalldata/logreg/prostate.csv")
        timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
        unique_suffix = str(uuid.uuid4())
        s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                  timestamp + "." + unique_suffix + ".csv.zip"
        h2o.export_file(local_frame, s3_path)
        s3_frame = h2o.import_file(s3_path)
        assert_frame_equal(local_frame.as_data_frame(), s3_frame.as_data_frame())

    def test_s3a_import_export(self):
        self.s3_import_export("s3a")

    @unittest.skip("skip")
    def test_s3n_import_export(self):
        self.s3_import_export("s3n")


if __name__ == '__main__':
    generic_test_utils.run_tests([HadoopSmokeTestSuite], file_name="py_hadoop_smoke_tests_report")
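
The S3 import/export tests above are shown without the HadoopSmokeTestSuite class header or imports; a minimal sketch of what they appear to assume (the pandas.testing location of assert_frame_equal is an assumption; older pandas exposes it under pandas.util.testing):

# Assumed imports for the S3 smoke tests above.
import unittest
import uuid
from datetime import datetime

import h2o
from pandas.testing import assert_frame_equal
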
        pipeline.write().overwrite().save(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline"))
        loaded_pipeline = Pipeline.load(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline"))

        # Train the pipeline model
        model = loaded_pipeline.fit(prostate_frame)

        model.write().overwrite().save(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline_model"))
        loaded_model = PipelineModel.load(
            "file://" +
            os.path.abspath("build/test_dai_pipeline_as_spark_pipeline_model"))

        preds = loaded_model.transform(prostate_frame).repartition(1).select(
            mojo.select_prediction_udf("AGE")).take(5)

        assert preds[0][0] == 65.36320409515132
        assert preds[1][0] == 64.96902128114817
        assert preds[2][0] == 64.96721023747583
        assert preds[3][0] == 65.78772654671035
        assert preds[4][0] == 66.11327967814829


if __name__ == '__main__':
    generic_test_utils.run_tests(
        [H2OMojoPipelineTest], file_name="py_unit_tests_mojo_pipeline_report")
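
This pipeline fragment begins mid-method: pipeline, prostate_frame, and mojo are created in the omitted part, and mojo is presumably a Driverless AI MOJO pipeline wrapper given the select_prediction_udf call. A minimal sketch of the imports the visible code needs (the pysparkling.ml path for H2OMOJOPipelineModel is an assumption):

# Assumed imports for the pipeline save/load test above.
import os

from pyspark.ml import Pipeline, PipelineModel
from pysparkling.ml import H2OMOJOPipelineModel
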
Example #8
    def test_s3a_import(self):
        fr = h2o.import_file(
            "s3a://data.h2o.ai/h2o-open-tour/2016-nyc/weather.csv")
        assert fr.ncol == 27
        assert fr.nrow == 9768

    def s3_import_export(self, scheme):
        local_frame = h2o.import_file(
            "/home/0xdiag/smalldata/logreg/prostate.csv")
        timestamp = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
        unique_suffix = str(uuid.uuid4())
        s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                  timestamp + "." + unique_suffix + ".csv.zip"
        h2o.export_file(local_frame, s3_path)
        s3_frame = h2o.import_file(s3_path)
        assert_frame_equal(local_frame.as_data_frame(),
                           s3_frame.as_data_frame())

    def test_s3a_import_export(self):
        self.s3_import_export("s3a")

    @unittest.skip("skip")
    def test_s3n_import_export(self):
        self.s3_import_export("s3n")


if __name__ == '__main__':
    generic_test_utils.run_tests([HadoopSmokeTestSuite],
                                 file_name="py_hadoop_smoke_tests_report")
Example #9
        env = IntegTestEnv()
        env.set_spark_master("local[*]")
        env.conf("spark.ext.h2o.port.base", 63331)

        return_code = launch(env,
                             "examples/pipelines/ham_or_spam_multi_algo.py",
                             "automl")
        self.assertTrue(
            return_code == 0,
            "Process ended in a wrong way. It ended with return code " +
            str(return_code))

    def test_import_pysparkling_standalone_app(self):
        env = IntegTestEnv()
        env.set_spark_master("local[*]")
        env.conf("spark.ext.h2o.port.base", 63331)

        return_code = launch(
            env,
            "examples/scripts/tests/pysparkling_ml_import_overrides_spark_test.py"
        )
        self.assertTrue(
            return_code == 0,
            "Process ended in a wrong way. It ended with return code " +
            str(return_code))


if __name__ == '__main__':
    generic_test_utils.run_tests([LocalIntegTestSuite],
                                 file_name="py_integ_local_tests_report")
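
The local integration tests above omit their imports; a minimal sketch of what they appear to assume, based on the "from integ_test_utils import *" pattern used in the other examples on this page (IntegTestEnv and launch are taken to come from that project-local module):

# Assumed imports for LocalIntegTestSuite above.
import unittest

import generic_test_utils
from integ_test_utils import IntegTestEnv, launch
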
        env.set_spark_master("local[*]")
        # Configure YARN environment
        env.conf("spark.yarn.max.executor.failures", 1)  # If an executor fails, fail the test
        env.conf("spark.executor.instances", 1)
        env.conf("spark.executor.memory", "2g")
        env.conf("spark.ext.h2o.port.base", 63331)
        env.conf("spark.driver.memory", "2g")

        return_code = launch(env, "examples/scripts/tests/xgboost_test_medium.py")
        self.assertTrue(return_code == 0, "Process ended in a wrong way. It ended with return code " + str(return_code))

    def test_chicago_crime(self):
        env = IntegTestEnv()

        env.set_spark_master("local[*]")
        # Configure YARN environment
        env.conf("spark.yarn.max.executor.failures", 1)  # If an executor fails, fail the test
        env.conf("spark.executor.instances", 1)
        env.conf("spark.executor.memory", "2g")
        env.conf("spark.ext.h2o.port.base", 63331)
        env.conf("spark.driver.memory", "2g")

        return_code = launch(env, "examples/scripts/ChicagoCrimeDemo.py")
        self.assertTrue(return_code == 0, "Process ended in a wrong way. It ended with return code " + str(return_code))



if __name__ == '__main__':
    generic_test_utils.run_tests([YarnIntegTestSuite], file_name="py_integ_yarn_tests_report")
            featuresCols=["sepal_len", "sepal_wid", "petal_len", "petal_wid"],
            withDetailedPredictionCol=True)

        model = algo.fit(self.dataset)
        transformed = model.transform(self.dataset)
        self.assertEqual(
            transformed.select("detailed_prediction.cluster").head()[0], 0,
            "Prediction should match")
        self.assertEqual(
            len(transformed.select("detailed_prediction.distances").head()[0]),
            3, "Size of distances array should match")

    def testUserPoints(self):
        algo = H2OKMeans(
            splitRatio=0.8,
            seed=1,
            k=3,
            featuresCols=["sepal_len", "sepal_wid", "petal_len", "petal_wid"],
            userPoints=[[4.9, 3.0, 1.4, 0.2], [5.6, 2.5, 3.9, 1.1],
                        [6.5, 3.0, 5.2, 2.0]])

        model = algo.fit(self.dataset)
        self.assertEqual(
            model.transform(self.dataset).select("prediction").head()[0], 0,
            "Prediction should match")


if __name__ == '__main__':
    generic_test_utils.run_tests([H2OKMeansTestSuite],
                                 file_name="py_unit_tests_kmeans_report")
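
The K-means tests above are listed without imports or the setUpClass that builds self.dataset; a minimal sketch of what they appear to assume (the pysparkling.ml path for H2OKMeans is an assumption):

# Assumed imports for H2OKMeansTestSuite above.
import unittest

import generic_test_utils
from pysparkling.ml import H2OKMeans
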
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
 Integration tests for PySparkling with Spark running in standalone mode
"""
import generic_test_utils
from integ_test_utils import *
import unittest


class StandaloneIntegTestSuite(unittest.TestCase):
    pass


if __name__ == '__main__':
    generic_test_utils.run_tests([StandaloneIntegTestSuite], file_name="py_integ_standalone_tests_report")
Example #13
        return_code = launch(self._conf,
                             "examples/pipelines/ham_or_spam_multi_algo.py",
                             param="xgboost")
        self.assertTrue(
            return_code == 0,
            "Process ended in a wrong way. It ended with return code " +
            str(return_code))

    def test_pipeline_automl(self):
        return_code = launch(self._conf,
                             "examples/pipelines/ham_or_spam_multi_algo.py",
                             param="automl")
        self.assertTrue(
            return_code == 0,
            "Process ended in a wrong way. It ended with return code " +
            str(return_code))

    def test_import_pysparkling_standalone_app(self):
        return_code = launch(
            self._conf,
            "examples/scripts/tests/pysparkling_ml_import_overrides_spark_test.py"
        )
        self.assertTrue(
            return_code == 0,
            "Process ended in a wrong way. It ended with return code " +
            str(return_code))


if __name__ == '__main__':
    run_tests([LocalIntegTestSuite], file_name="py_integ_local_tests_report")
    def testProducedMOJOModelAndLoadedMOJOModelReturnsSameResult(self):
        targetEncoder = H2OTargetEncoder(labelCol="CAPSULE", inputCols=["RACE", "DPROS", "DCAPS"])
        pipeline = Pipeline(stages=[targetEncoder])
        producedModel = pipeline.fit(self._trainingDataset)
        path = "file://" + os.path.abspath("build/testProducedMOJOModelAndLoadedMOJOModelReturnsSameResult")
        producedModel.write().overwrite().save(path)
        loadedModel = PipelineModel.load(path)

        transformedByProducedModel = producedModel.transform(self._testingDataset)
        transformedByLoadedModel = loadedModel.transform(self._testingDataset)

        unit_test_utils.assert_data_frames_are_identical(transformedByProducedModel, transformedByLoadedModel)

    def testTargetEncoderModelWithDisabledNoiseAndTargetEncoderMOJOModelTransformTheTrainingDatasetSameWay(self):
        targetEncoder = H2OTargetEncoder()\
            .setInputCols(["RACE", "DPROS", "DCAPS"])\
            .setLabelCol("CAPSULE")\
            .setHoldoutStrategy("None")\
            .setNoise(0.0)
        targetEncoderModel = targetEncoder.fit(self._trainingDataset)

        transformedByModel = targetEncoderModel.transformTrainingDataset(self._trainingDataset)
        transformedByMOJOModel = targetEncoderModel.transform(self._trainingDataset)

        unit_test_utils.assert_data_frames_are_identical(transformedByModel, transformedByMOJOModel)


if __name__ == '__main__':
    generic_test_utils.run_tests([H2OTargetEncoderTestSuite], file_name="py_unit_tests_target_encoder_report")
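
The target-encoder tests above omit their imports and the setUpClass that prepares self._trainingDataset and self._testingDataset; a minimal sketch of what they appear to assume (the pysparkling.ml path for H2OTargetEncoder and the project-local helpers are assumptions):

# Assumed imports for H2OTargetEncoderTestSuite above.
import os
import unittest

from pyspark.ml import Pipeline, PipelineModel
from pysparkling.ml import H2OTargetEncoder

import generic_test_utils
import unit_test_utils
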
    @classmethod
    def setUpClass(cls):
        conf = get_default_spark_conf(cls._spark_options_from_params)
        conf["spark.master"] = "local[*]"
        conf["spark.submit.pyFiles"] = sys.argv[1]
        cls._conf = conf

    def test_pipeline_gbm_mojo(self):
        return_code = launch(self._conf, "examples/pipelines/ham_or_spam_multi_algo.py", param="gbm")
        self.assertTrue(return_code == 0, "Process ended in a wrong way. It ended with return code " + str(return_code))

    def test_pipeline_deep_learning(self):
        return_code = launch(self._conf, "examples/pipelines/ham_or_spam_multi_algo.py", param="dl")
        self.assertTrue(return_code == 0, "Process ended in a wrong way. It ended with return code " + str(return_code))

    def test_pipeline_xgboost(self):
        return_code = launch(self._conf, "examples/pipelines/ham_or_spam_multi_algo.py", param="xgboost")
        self.assertTrue(return_code == 0, "Process ended in a wrong way. It ended with return code " + str(return_code))

    def test_pipeline_automl(self):
        return_code = launch(self._conf, "examples/pipelines/ham_or_spam_multi_algo.py", param="automl")
        self.assertTrue(return_code == 0, "Process ended in a wrong way. It ended with return code " + str(return_code))

    def test_import_pysparkling_standalone_app(self):
        return_code = launch(self._conf, "examples/scripts/tests/pysparkling_ml_import_overrides_spark_test.py")
        self.assertTrue(return_code == 0, "Process ended in a wrong way. It ended with return code " + str(return_code))


if __name__ == '__main__':
    run_tests([LocalIntegTestSuite], file_name="py_integ_local_tests_report")