Example 1
def testStopAndStartAgain(spark):
    import subprocess

    def listYarnApps():
        return str(
            subprocess.check_output("yarn application -list", shell=True))

    def yarnLogs(appId):
        return str(
            subprocess.check_output("yarn logs -applicationId " + appId,
                                    shell=True))

    context1 = H2OContext.getOrCreate(createH2OConf())
    yarnAppId1 = str(
        context1.getConf().get("spark.ext.h2o.external.yarn.app.id"))
    assert yarnAppId1 in listYarnApps()
    context1.stop()
    assert str(context1).startswith(
        "H2OContext has been stopped or hasn't been created.")
    context2 = H2OContext.getOrCreate(createH2OConf())
    yarnAppId2 = str(
        context2.getConf().get("spark.ext.h2o.external.yarn.app.id"))
    assert yarnAppId1 not in listYarnApps()
    assert "Orderly shutdown:  Shutting down now." in yarnLogs(yarnAppId1)
    assert yarnAppId2 in listYarnApps()
    context2.stop()
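The tests in this example call a createH2OConf helper that is not shown on this page (some later examples pass the SparkSession to an older variant of the same helper). A minimal sketch of what it might look like, assuming it merely builds a fresh H2OConf for an automatically started external cluster; the specific options are illustrative assumptions, not the original helper:

def createH2OConf():
    from pysparkling import H2OConf
    # Assumed settings: a one-node, auto-started external H2O cluster.
    conf = H2OConf()
    conf.setClusterSize(1)
    conf.useAutoClusterStart()
    conf.setExternalClusterMode()
    return conf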
Example 2
def testConversionWorksAfterNewlyStartedContext(spark):
    context1 = H2OContext.getOrCreate(spark, createH2OConf(spark))
    context1.stop()
    context2 = H2OContext.getOrCreate(spark, createH2OConf(spark))
    rdd = spark.sparkContext.parallelize([0.5, 1.3333333333, 178])
    h2o_frame = context2.asH2OFrame(rdd)
    assert h2o_frame[0, 0] == 0.5
    assert h2o_frame[1, 0] == 1.3333333333
    assert_h2o_frame(h2o_frame, rdd)
    context2.stop()
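The conversion also works in the opposite direction. A short sketch, reusing context2 and h2o_frame from the example above and assuming asSparkFrame as the counterpart of asH2OFrame in the same API generation:

# Convert back to Spark and check that no rows were lost on the way.
spark_frame = context2.asSparkFrame(h2o_frame)
assert spark_frame.count() == h2o_frame.nrows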
Example 3
def testH2OContextGetOrCreateReturnsReferenceToTheSameClusterIfStartedAutomatically(spark):
    context1 = H2OContext.getOrCreate(spark, createH2OConf(spark))
    context2 = H2OContext.getOrCreate(spark, createH2OConf(spark))

    getNodes = lambda context: context._jhc.h2oContext().getH2ONodes()
    toIpPort = lambda node: node.ipPort()
    nodesToString = lambda nodes: ', '.join(nodes)

    nodes1 = map(toIpPort, getNodes(context1))
    nodes2 = map(toIpPort, getNodes(context2))

    assert nodesToString(nodes1) == nodesToString(nodes2)
    context1.stop()
Example 4
 def setUpClass(cls):
     cls._spark = SparkSession.builder.config(
         conf=test_utils.get_default_spark_conf().set(
             "spark.ext.h2o.cloud.name", "test-cloud")).getOrCreate()
     test_utils.set_up_class(cls)
     h2o_conf = H2OConf(cls._spark).set_num_of_external_h2o_nodes(2)
     cls._hc = H2OContext.getOrCreate(cls._spark, h2o_conf)
Example 5
 def setUpClass(cls):
     cls._spark = SparkSession.builder.config(
         conf=unit_test_utils.get_default_spark_conf()).getOrCreate()
     unit_test_utils.set_up_class(cls)
     cls._hc = H2OContext.getOrCreate(
         cls._spark,
         H2OConf(cls._spark).set_num_of_external_h2o_nodes(2))
Example 6
    def __init__(self,
                 foldCol=None,
                 labelCol="label",
                 inputCols=[],
                 holdoutStrategy="None",
                 blendedAvgEnabled=False,
                 blendedAvgInflectionPoint=10.0,
                 blendedAvgSmoothing=20.0,
                 noise=0.01,
                 noiseSeed=-1):
        super(H2OTargetEncoder, self).__init__()
        self._hc = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(),
                                          verbose=False)
        self._java_obj = self._new_java_obj(
            "ai.h2o.sparkling.ml.features.H2OTargetEncoder", self.uid)

        self._setDefault(foldCol=None,
                         labelCol="label",
                         inputCols=[],
                         holdoutStrategy="None",
                         blendedAvgEnabled=False,
                         blendedAvgInflectionPoint=10.0,
                         blendedAvgSmoothing=20.0,
                         noise=0.01,
                         noiseSeed=-1)
        kwargs = get_input_kwargs(self)
        self.setParams(**kwargs)
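For context, a brief usage sketch of the estimator whose constructor is shown above; the DataFrame `train` and the column names are made-up placeholders:

# Hypothetical usage: encode the categorical column "category" against
# the label column "label" of a Spark DataFrame `train`.
encoder = H2OTargetEncoder(inputCols=["category"], labelCol="label")
model = encoder.fit(train)
encoded = model.transform(train)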
Example 7
 def test_convert_empty_rdd(self):
     schema = StructType([])
     empty = self._spark.createDataFrame(self._spark.sparkContext.emptyRDD(), schema)
     hc = H2OContext.getOrCreate(self._spark)
     fr = hc.as_h2o_frame(empty)
     assert fr.nrows == 0
     assert fr.ncols == 0
Example 8
 def test_convert_empty_dataframe_non_empty_schema(self):
     schema = StructType([StructField("name", StringType()), StructField("age", IntegerType())])
     empty = self._spark.createDataFrame(self._spark.sparkContext.emptyRDD(), schema)
     hc = H2OContext.getOrCreate(self._spark)
     fr = hc.as_h2o_frame(empty)
     assert fr.nrows == 0
     assert fr.ncols == 2
Example 9
 def setUpClass(cls):
     cls._conf = unit_test_utils.get_default_spark_conf(
         cls._spark_options_from_params)
     cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()
     cls._hc = H2OContext.getOrCreate(
         cls._spark,
         H2OConf(cls._spark).set_num_of_external_h2o_nodes(1))
Example 10
 def setUpClass(cls):
     cls._conf = unit_test_utils.get_default_spark_conf(
         cls._spark_options_from_params)
     cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()
     cls._hc = H2OContext.getOrCreate(
         cls._spark,
         H2OConf(cls._spark).set_cluster_size(1))
Example 11
 def setUpClass(cls):
     cls._cloud_name = generic_test_utils.unique_cloud_name("h2o_conf_test")
     cls._conf = unit_test_utils.get_default_spark_conf(cls._spark_options_from_params). \
         set("spark.ext.h2o.cloud.name", cls._cloud_name)
     cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()
     cls._hc = H2OContext.getOrCreate(
         cls._spark,
         H2OConf(cls._spark).set_cluster_size(1))
Example 12
 def setUpClass(cls):
     cls._spark = SparkSession.builder.config(
         conf=unit_test_utils.get_default_spark_conf().setMaster(
             "yarn-client")).getOrCreate()
     unit_test_utils.set_up_class(cls)
     cls._hc = H2OContext.getOrCreate(
         cls._spark,
         H2OConf(cls._spark).set_cluster_size(1))
Example 13
 def setUpClass(cls):
     cls._cloud_name = generic_test_utils.unique_cloud_name("h2o_conf_test")
     cls._spark = SparkSession.builder.config(
         conf=unit_test_utils.get_default_spark_conf().set(
             "spark.ext.h2o.cloud.name", cls._cloud_name)).getOrCreate()
     unit_test_utils.set_up_class(cls)
     h2o_conf = H2OConf(cls._spark).set_num_of_external_h2o_nodes(2)
     cls._hc = H2OContext.getOrCreate(cls._spark, h2o_conf)
Example 14
def hc(spark):
    conf = H2OConf(spark)
    conf.set_cluster_size(1)
    conf.set("spark.ext.h2o.rest.api.based.client", "true")
    conf.use_auto_cluster_start()
    conf.set_external_cluster_mode()
    conf.set_h2o_node_web_enabled()
    return H2OContext.getOrCreate(spark, conf)
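This hc function is presumably registered as a pytest fixture so that tests share a single H2OContext; a sketch under that assumption (the decorator and scope do not appear in the original snippet):

import pytest

@pytest.fixture(scope="module")  # assumed scope
def hc(spark):
    conf = H2OConf(spark)
    conf.set_cluster_size(1)
    conf.use_auto_cluster_start()
    conf.set_external_cluster_mode()
    return H2OContext.getOrCreate(spark, conf)

def test_context_is_alive(hc):
    # pytest injects the H2OContext returned by the fixture above.
    assert hc is not None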
Example 15
 def setMissingValuesHandling(self, value):
     if value is None:
         return self._set(missingValuesHandling=None)
     assert_is_type(value, None, Enum("MeanImputation", "Skip"))
     jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
     # Resolve the Java enum once instead of repeating the long attribute chain.
     java_enum = jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.MissingValuesHandling
     correct_case_value = get_correct_case_enum(java_enum.values(), value)
     return self._set(missingValuesHandling=java_enum.valueOf(correct_case_value))
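get_correct_case_enum, used throughout the setter examples on this page, is not shown here. A plausible minimal sketch, assuming it simply performs a case-insensitive lookup over a Java enum's values() so the exact-case name can be handed to valueOf(); the real implementation may differ:

def get_correct_case_enum(enum_values, value):
    # Return the canonical (exact-case) name of the matching Java enum
    # constant; py4j exposes each constant's name() method.
    for enum_value in enum_values:
        if enum_value.name().lower() == value.lower():
            return enum_value.name()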
Example 16
def testDownloadLogsAsZIP(spark):
    hc = H2OContext.getOrCreate(spark, createH2OConf(spark))
    path = hc.download_h2o_logs("build", "ZIP")
    import zipfile
    with zipfile.ZipFile(path, 'r') as archive:
        # The archive holds one nested zip per H2O node plus one entry for the
        # parent directory; the single-node cluster used here yields 2 entries.
        assert len(archive.namelist()) == 2
    hc.stop()
Example 17
 def setExcludeAlgos(self, value):
     # H2O's typecheck is not case-sensitive, so normalize the case here.
     java_enums = []
     if value is not None:
         # Resolve the JVM gateway once instead of once per algorithm.
         jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
         for algo in value:
             assert_is_type(algo, Enum("GLM", "DRF", "GBM", "DeepLearning", "StackedEnsemble"))
             java_enums.append(get_correct_case_enum(jvm.ai.h2o.automl.AutoML.algo.values(), algo))
     return self._set(excludeAlgos=java_enums)
Example 18
 def setUpClass(cls):
     conf = SparkConf() \
         .setAppName("pyunit-test") \
         .setMaster("local-cluster[3,1,2048]") \
         .set("spark.ext.h2o.disable.ga", "true") \
         .set("spark.driver.memory", "2g") \
         .set("spark.executor.memory", "2g") \
         .set("spark.ext.h2o.client.log.level", "DEBUG")
     cls._sc = SparkContext(conf=conf)
     cls._hc = H2OContext(cls._sc).start()
Example 19
 def setBooster(self, value):
     assert_is_type(value, Enum("gbtree", "gblinear", "dart"))
     jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
     java_enum = jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.Booster
     correct_case_value = get_correct_case_enum(java_enum.values(), value)
     return self._set(booster=java_enum.valueOf(correct_case_value))
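Thanks to the case-insensitive lookup, callers may pass the enum value in any case. A hypothetical usage sketch, assuming the setter is defined on pysparkling.ml's H2OXGBoost estimator:

from pysparkling.ml import H2OXGBoost

# "DART" resolves to the Booster constant "dart" regardless of case.
estimator = H2OXGBoost().setBooster("DART")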
Example 20
def testDownloadLogsAsLOG(spark):
    hc = H2OContext.getOrCreate(spark, createH2OConf(spark))
    path = hc.download_h2o_logs("build", "LOG")
    clusterName = hc._conf.cloudName()

    with open(path, 'r') as f:
        marker = "INFO: H2O cloud name: '" + clusterName + "'"
        lines = [line for line in f.readlines() if marker in line]
        assert len(lines) >= 1
    hc.stop()
Example 21
 def setBackend(self, value):
     assert_is_type(value, Enum("auto", "gpu", "cpu"))
     jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
     java_enum = jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.Backend
     correct_case_value = get_correct_case_enum(java_enum.values(), value)
     return self._set(backend=java_enum.valueOf(correct_case_value))
Example 22
def testAuthFailsWhenUsernamePasswordNotSpecified(spark):
    with open('build/login.conf', 'w') as f:
        f.write('user:pass')

    conf = createH2OConf(spark)
    conf.setHashLoginEnabled()
    conf.setCloudName("test-cluster")
    conf.setClusterConfigFile("notify_file.txt")
    conf.setLoginConf("build/login.conf")

    with pytest.raises(Exception):
        H2OContext.getOrCreate(spark, conf)
    # No app should be running
    assert "Total number of applications (application-types: [] and states: [SUBMITTED, ACCEPTED, RUNNING]):0" in listYarnApps()
    conf.setUserName("user")
    conf.setPassword("pass")
    context = H2OContext.getOrCreate(spark, conf)
    context.stop()
Example 23
 def setTreeMethod(self, value):
     assert_is_type(value, Enum("auto", "exact", "approx", "hist"))
     jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
     java_enum = jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.TreeMethod
     correct_case_value = get_correct_case_enum(java_enum.values(), value)
     return self._set(treeMethod=java_enum.valueOf(correct_case_value))
Example 24
 def setNormalizeType(self, value):
     assert_is_type(value, Enum("tree", "forest"))
     jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
     java_enum = jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.DartNormalizeType
     correct_case_value = get_correct_case_enum(java_enum.values(), value)
     return self._set(normalizeType=java_enum.valueOf(correct_case_value))
Example 25
 def setDmatrixType(self, value):
     assert_is_type(value, Enum("auto", "dense", "sparse"))
     jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
     java_enum = jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.DMatrixType
     correct_case_value = get_correct_case_enum(java_enum.values(), value)
     return self._set(dmatrixType=java_enum.valueOf(correct_case_value))
Example 26
 def setGrowPolicy(self, value):
     assert_is_type(value, Enum("depthwise", "lossguide"))
     jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
     java_enum = jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.GrowPolicy
     correct_case_value = get_correct_case_enum(java_enum.values(), value)
     return self._set(growPolicy=java_enum.valueOf(correct_case_value))
Example 27
 def setDistribution(self, value):
     assert_is_type(
         value, None,
         Enum("AUTO", "bernoulli", "quasibinomial", "modified_huber",
              "multinomial", "ordinal", "gaussian", "poisson", "gamma",
              "tweedie", "huber", "laplace", "quantile"))
     jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
     java_enum = jvm.hex.genmodel.utils.DistributionFamily
     correct_case_value = get_correct_case_enum(java_enum.values(), value)
     return self._set(distribution=java_enum.valueOf(correct_case_value))
Example 28
 def setUpClass(cls):
     cls._conf = unit_test_utils.get_default_spark_conf(
         cls._spark_options_from_params)
     cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()
     cls._hc = H2OContext.getOrCreate(
         cls._spark,
         H2OConf(cls._spark).set_cluster_size(1))
     cls.dataset = cls._spark.read.csv(
         "file://" +
         unit_test_utils.locate("smalldata/iris/iris_wheader.csv"),
         header=True,
         inferSchema=True)
Example 29
def testSSL(spark):
    conf = createH2OConf(spark)
    conf.setInternalSecureConnectionsEnabled()

    context = H2OContext.getOrCreate(spark, conf)
    path = context.download_h2o_logs("build", "LOG")

    with open(path, 'r') as f:
        originalLines = f.readlines()
        lines = [line for line in originalLines if "H2O node running in encrypted mode using" in line]
        assert len(lines) >= 1
    context.stop()
Example 30
 def setStoppingMetric(self, value):
     # H2O's typecheck is not case-sensitive, so normalize the case here.
     assert_is_type(
         value,
         Enum("AUTO", "deviance", "logloss", "MSE", "RMSE", "MAE", "RMSLE",
              "AUC", "lift_top_group", "misclassification",
              "mean_per_class_error", "custom"))
     jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
     java_enum = jvm.hex.ScoreKeeper.StoppingMetric
     correct_case_value = get_correct_case_enum(java_enum.values(), value)
     return self._set(stoppingMetric=java_enum.valueOf(correct_case_value))
Example 31
 def setUpClass(cls):
     cls._spark = SparkSession.builder.config(
         conf=test_utils.get_default_spark_conf()).getOrCreate()
     test_utils.set_up_class(cls)
     cls._hc = H2OContext.getOrCreate(cls._spark, H2OConf(cls._spark).set_num_of_external_h2o_nodes(2))
Example 32
 def setUpClass(cls):
     cls._cloud_name = generic_test_utils.unique_cloud_name("h2o_conf_test")
     cls._conf = unit_test_utils.get_default_spark_conf(cls._spark_options_from_params). \
         set("spark.ext.h2o.cloud.name", cls._cloud_name)
     cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()
     cls._hc = H2OContext.getOrCreate(cls._spark, H2OConf(cls._spark).set_num_of_external_h2o_nodes(1))
Example 33
 def setUpClass(cls):
     cls._spark = SparkSession.builder.config(
         conf=test_utils.get_default_spark_conf().set(
             "spark.ext.h2o.cloud.name", "test-cloud")).getOrCreate()
     test_utils.set_up_class(cls)
     h2o_conf = H2OConf(cls._spark).set_num_of_external_h2o_nodes(2)
     cls._hc = H2OContext.getOrCreate(cls._spark, h2o_conf)
Example 34
 def setUpClass(cls):
     cls._spark = SparkSession.builder.config(
         conf=unit_test_utils.get_default_spark_conf().setMaster("yarn-client")).getOrCreate()
     unit_test_utils.set_up_class(cls)
     cls._hc = H2OContext.getOrCreate(cls._spark, H2OConf(cls._spark).set_num_of_external_h2o_nodes(1))