def testStopAndStartAgain(spark):
    import subprocess

    def listYarnApps():
        return str(subprocess.check_output("yarn application -list", shell=True))

    def yarnLogs(appId):
        return str(subprocess.check_output("yarn logs -applicationId " + appId, shell=True))

    context1 = H2OContext.getOrCreate(createH2OConf())
    yarnAppId1 = str(context1.getConf().get("spark.ext.h2o.external.yarn.app.id"))
    assert yarnAppId1 in listYarnApps()
    context1.stop()
    assert context1.__str__().startswith("H2OContext has been stopped or hasn't been created.")

    context2 = H2OContext.getOrCreate(createH2OConf())
    yarnAppId2 = str(context2.getConf().get("spark.ext.h2o.external.yarn.app.id"))
    assert yarnAppId1 not in listYarnApps()
    assert "Orderly shutdown: Shutting down now." in yarnLogs(yarnAppId1)
    assert yarnAppId2 in listYarnApps()
    context2.stop()

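# Note: createH2OConf is a shared helper from these tests' common setup code and is not
# defined in this section. A minimal sketch of what it presumably returns is below; the
# real helper's settings (and whether it takes a SparkSession argument, as some tests
# further down assume) may differ.
def createH2OConf(spark=None):
    conf = H2OConf(spark) if spark is not None else H2OConf()
    conf.set_cluster_size(1)  # assumed default; the real helper may configure more
    return conf
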
def testConversionWorksAfterNewlyStartedContext(spark):
    context1 = H2OContext.getOrCreate(spark, createH2OConf(spark))
    context1.stop()
    context2 = H2OContext.getOrCreate(spark, createH2OConf(spark))
    rdd = spark.sparkContext.parallelize([0.5, 1.3333333333, 178])
    h2o_frame = context2.asH2OFrame(rdd)
    assert h2o_frame[0, 0] == 0.5
    assert h2o_frame[1, 0] == 1.3333333333
    asert_h2o_frame(h2o_frame, rdd)
    context2.stop()

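# Note: asert_h2o_frame (spelled this way in the shared test utilities) is not defined in
# this section. A plausible sketch, assuming it checks that the converted frame mirrors
# the shape of the source RDD; the real helper may verify more:
def asert_h2o_frame(h2o_frame, rdd):
    assert h2o_frame.nrow == rdd.count()  # one row per RDD element
    assert h2o_frame.ncol == 1            # a single numeric column
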
def testH2OContextGetOrCreateReturnsReferenceToTheSameClusterIfStartedAutomatically(spark):
    context1 = H2OContext.getOrCreate(spark, createH2OConf(spark))
    context2 = H2OContext.getOrCreate(spark, createH2OConf(spark))

    getNodes = lambda context: context._jhc.h2oContext().getH2ONodes()
    toIpPort = lambda node: node.ipPort()
    nodesToString = lambda nodes: ', '.join(nodes)

    nodes1 = map(toIpPort, getNodes(context1))
    nodes2 = map(toIpPort, getNodes(context2))
    assert nodesToString(nodes1) == nodesToString(nodes2)
    context1.stop()

def setUpClass(cls):
    cls._spark = SparkSession.builder.config(
        conf=test_utils.get_default_spark_conf().set(
            "spark.ext.h2o.cloud.name", "test-cloud")).getOrCreate()
    test_utils.set_up_class(cls)
    h2o_conf = H2OConf(cls._spark).set_num_of_external_h2o_nodes(2)
    cls._hc = H2OContext.getOrCreate(cls._spark, h2o_conf)

def setUpClass(cls):
    cls._spark = SparkSession.builder.config(
        conf=unit_test_utils.get_default_spark_conf()).getOrCreate()
    unit_test_utils.set_up_class(cls)
    cls._hc = H2OContext.getOrCreate(
        cls._spark, H2OConf(cls._spark).set_num_of_external_h2o_nodes(2))

def __init__(self, foldCol=None, labelCol="label", inputCols=[], holdoutStrategy="None",
             blendedAvgEnabled=False, blendedAvgInflectionPoint=10.0, blendedAvgSmoothing=20.0,
             noise=0.01, noiseSeed=-1):
    super(H2OTargetEncoder, self).__init__()
    self._hc = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)
    self._java_obj = self._new_java_obj("ai.h2o.sparkling.ml.features.H2OTargetEncoder", self.uid)
    self._setDefault(foldCol=None, labelCol="label", inputCols=[], holdoutStrategy="None",
                     blendedAvgEnabled=False, blendedAvgInflectionPoint=10.0, blendedAvgSmoothing=20.0,
                     noise=0.01, noiseSeed=-1)
    kwargs = get_input_kwargs(self)
    self.setParams(**kwargs)

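# Illustrative usage of the H2OTargetEncoder constructor above; the column names
# ("category", "label", "fold") and the training DataFrame are assumptions for the
# example, not part of the API:
# encoder = H2OTargetEncoder(inputCols=["category"], labelCol="label",
#                            holdoutStrategy="KFold", foldCol="fold")
# model = encoder.fit(trainingDF)
# encoded = model.transform(trainingDF)
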
def test_convert_empty_rdd(self):
    schema = StructType([])
    empty = self._spark.createDataFrame(self._spark.sparkContext.emptyRDD(), schema)
    hc = H2OContext.getOrCreate(self._spark)
    fr = hc.as_h2o_frame(empty)
    assert fr.nrows == 0
    assert fr.ncols == 0

def test_convert_empty_dataframe_non_empty_schema(self):
    schema = StructType([StructField("name", StringType()), StructField("age", IntegerType())])
    empty = self._spark.createDataFrame(self._spark.sparkContext.emptyRDD(), schema)
    hc = H2OContext.getOrCreate(self._spark)
    fr = hc.as_h2o_frame(empty)
    assert fr.nrows == 0
    assert fr.ncols == 2

def setUpClass(cls):
    cls._conf = unit_test_utils.get_default_spark_conf(cls._spark_options_from_params)
    cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()
    cls._hc = H2OContext.getOrCreate(
        cls._spark, H2OConf(cls._spark).set_num_of_external_h2o_nodes(1))

def setUpClass(cls):
    cls._conf = unit_test_utils.get_default_spark_conf(cls._spark_options_from_params)
    cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()
    cls._hc = H2OContext.getOrCreate(
        cls._spark, H2OConf(cls._spark).set_cluster_size(1))

def setUpClass(cls):
    cls._cloud_name = generic_test_utils.unique_cloud_name("h2o_conf_test")
    cls._conf = unit_test_utils.get_default_spark_conf(cls._spark_options_from_params) \
        .set("spark.ext.h2o.cloud.name", cls._cloud_name)
    cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()
    cls._hc = H2OContext.getOrCreate(
        cls._spark, H2OConf(cls._spark).set_cluster_size(1))

def setUpClass(cls):
    cls._spark = SparkSession.builder.config(
        conf=unit_test_utils.get_default_spark_conf().setMaster("yarn-client")).getOrCreate()
    unit_test_utils.set_up_class(cls)
    cls._hc = H2OContext.getOrCreate(
        cls._spark, H2OConf(cls._spark).set_cluster_size(1))

def setUpClass(cls):
    cls._cloud_name = generic_test_utils.unique_cloud_name("h2o_conf_test")
    cls._spark = SparkSession.builder.config(
        conf=unit_test_utils.get_default_spark_conf().set(
            "spark.ext.h2o.cloud.name", cls._cloud_name)).getOrCreate()
    unit_test_utils.set_up_class(cls)
    h2o_conf = H2OConf(cls._spark).set_num_of_external_h2o_nodes(2)
    cls._hc = H2OContext.getOrCreate(cls._spark, h2o_conf)

def hc(spark):
    conf = H2OConf(spark)
    conf.set_cluster_size(1)
    conf.set("spark.ext.h2o.rest.api.based.client", "true")
    conf.use_auto_cluster_start()
    conf.set_external_cluster_mode()
    conf.set_h2o_node_web_enabled()
    return H2OContext.getOrCreate(spark, conf)

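# The hc function above reads like a pytest fixture: it consumes the spark fixture and
# returns a started H2OContext backed by an auto-started external cluster. Assumed usage,
# if it is indeed registered with @pytest.fixture (not shown in this section):
# def testSomethingWithContext(hc):
#     frame = hc.asH2OFrame(...)  # hc is the H2OContext returned by the fixture
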
def setMissingValuesHandling(self, value):
    if value is not None:
        assert_is_type(value, None, Enum("MeanImputation", "Skip"))
        jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
        correct_case_value = get_correct_case_enum(
            jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.MissingValuesHandling.values(), value)
        return self._set(
            missingValuesHandling=jvm.hex.tree.xgboost.XGBoostModel
            .XGBoostParameters.MissingValuesHandling.valueOf(correct_case_value))
    else:
        return self._set(missingValuesHandling=None)

def testDownloadLogsAsZIP(spark):
    import zipfile
    hc = H2OContext.getOrCreate(spark, createH2OConf(spark))
    path = hc.download_h2o_logs("build", "ZIP")
    archive = zipfile.ZipFile(path, 'r')
    # The zip should have nested zip files for each node in the cluster + 1 for the parent directory
    assert len(archive.namelist()) == 2
    hc.stop()

def setExcludeAlgos(self, value):
    # H2O typechecks do not check for case sensitivity
    java_enums = []
    if value is not None:
        for algo in value:
            assert_is_type(algo, Enum("GLM", "DRF", "GBM", "DeepLearning", "StackedEnsemble"))
            jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
            java_enums.append(get_correct_case_enum(jvm.ai.h2o.automl.AutoML.algo.values(), algo))
    return self._set(excludeAlgos=java_enums)

def setUpClass(cls):
    conf = SparkConf() \
        .setAppName("pyunit-test") \
        .setMaster("local-cluster[3,1,2048]") \
        .set("spark.ext.h2o.disable.ga", "true") \
        .set("spark.driver.memory", "2g") \
        .set("spark.executor.memory", "2g") \
        .set("spark.ext.h2o.client.log.level", "DEBUG")
    cls._sc = SparkContext(conf=conf)
    cls._hc = H2OContext(cls._sc).start()

def setBooster(self, value):
    assert_is_type(value, Enum("gbtree", "gblinear", "dart"))
    jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
    correct_case_value = get_correct_case_enum(
        jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.Booster.values(), value)
    return self._set(
        booster=jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.Booster.valueOf(correct_case_value))

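# The enum setters in this section all follow the same pattern: validate the Python string
# with assert_is_type, then resolve it to the matching Java enum constant through the
# running H2OContext's JVM gateway via get_correct_case_enum, which makes the setters
# case-insensitive. Assumed usage on an estimator exposing setBooster (H2OXGBoost is this
# project's XGBoost wrapper; the equivalence below is inferred from the case-correcting
# lookup, not stated in this section):
# H2OXGBoost().setBooster("dart")
# H2OXGBoost().setBooster("DART")  # resolves to the same Java enum constant
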
def testDownloadLogsAsLOG(spark):
    hc = H2OContext.getOrCreate(spark, createH2OConf(spark))
    path = hc.download_h2o_logs("build", "LOG")
    clusterName = hc._conf.cloudName()
    with open(path, 'r') as f:
        lines = list(filter(
            lambda line: "INFO: H2O cloud name: '" + clusterName + "'" in line, f.readlines()))
        assert len(lines) >= 1
    hc.stop()

def setBackend(self, value):
    assert_is_type(value, Enum("auto", "gpu", "cpu"))
    jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
    correct_case_value = get_correct_case_enum(
        jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.Backend.values(), value)
    return self._set(
        backend=jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.Backend.valueOf(correct_case_value))

def testAuthFailsWhenUsernamePasswordNotSpecified(spark):
    with open('build/login.conf', 'w') as f:
        f.write('user:pass')
    conf = createH2OConf(spark)
    conf.setHashLoginEnabled()
    conf.setCloudName("test-cluster")
    conf.setClusterConfigFile("notify_file.txt")
    conf.setLoginConf("build/login.conf")
    with pytest.raises(Exception):
        H2OContext.getOrCreate(spark, conf)
    # No app should be running
    assert "Total number of applications (application-types: [] and states: [SUBMITTED, ACCEPTED, RUNNING]):0" in listYarnApps()
    conf.setUserName("user")
    conf.setPassword("pass")
    context = H2OContext.getOrCreate(spark, conf)
    context.stop()

def setTreeMethod(self, value):
    assert_is_type(value, Enum("auto", "exact", "approx", "hist"))
    jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
    correct_case_value = get_correct_case_enum(
        jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.TreeMethod.values(), value)
    return self._set(
        treeMethod=jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.TreeMethod.valueOf(correct_case_value))

def setNormalizeType(self, value):
    assert_is_type(value, Enum("tree", "forest"))
    jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
    correct_case_value = get_correct_case_enum(
        jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.DartNormalizeType.values(), value)
    return self._set(
        normalizeType=jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.DartNormalizeType.valueOf(correct_case_value))

def setDmatrixType(self, value):
    assert_is_type(value, Enum("auto", "dense", "sparse"))
    jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
    correct_case_value = get_correct_case_enum(
        jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.DMatrixType.values(), value)
    return self._set(
        dmatrixType=jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.DMatrixType.valueOf(correct_case_value))

def setGrowPolicy(self, value):
    assert_is_type(value, Enum("depthwise", "lossguide"))
    jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
    correct_case_value = get_correct_case_enum(
        jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.GrowPolicy.values(), value)
    return self._set(
        growPolicy=jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters.GrowPolicy.valueOf(correct_case_value))

def setDistribution(self, value):
    assert_is_type(
        value, None,
        Enum("AUTO", "bernoulli", "quasibinomial", "modified_huber", "multinomial", "ordinal",
             "gaussian", "poisson", "gamma", "tweedie", "huber", "laplace", "quantile"))
    jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
    correct_case_value = get_correct_case_enum(
        jvm.hex.genmodel.utils.DistributionFamily.values(), value)
    return self._set(
        distribution=jvm.hex.genmodel.utils.DistributionFamily.valueOf(correct_case_value))

def setUpClass(cls):
    cls._conf = unit_test_utils.get_default_spark_conf(cls._spark_options_from_params)
    cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()
    cls._hc = H2OContext.getOrCreate(
        cls._spark, H2OConf(cls._spark).set_cluster_size(1))
    cls.dataset = cls._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/iris/iris_wheader.csv"),
        header=True, inferSchema=True)

def testSSL(spark):
    conf = createH2OConf(spark)
    conf.setInternalSecureConnectionsEnabled()
    context = H2OContext.getOrCreate(spark, conf)
    path = context.download_h2o_logs("build", "LOG")
    with open(path, 'r') as f:
        originalLines = f.readlines()
        lines = list(filter(
            lambda line: "H2O node running in encrypted mode using" in line, originalLines))
        assert len(lines) >= 1
    context.stop()

def setStoppingMetric(self, value):
    # H2O typechecks do not check for case sensitivity
    assert_is_type(
        value,
        Enum("AUTO", "deviance", "logloss", "MSE", "RMSE", "MAE", "RMSLE", "AUC",
             "lift_top_group", "misclassification", "mean_per_class_error", "custom"))
    jvm = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(), verbose=False)._jvm
    correct_case_value = get_correct_case_enum(jvm.hex.ScoreKeeper.StoppingMetric.values(), value)
    return self._set(stoppingMetric=jvm.hex.ScoreKeeper.StoppingMetric.valueOf(correct_case_value))

def setUpClass(cls):
    cls._spark = SparkSession.builder.config(
        conf=test_utils.get_default_spark_conf()).getOrCreate()
    test_utils.set_up_class(cls)
    cls._hc = H2OContext.getOrCreate(
        cls._spark, H2OConf(cls._spark).set_num_of_external_h2o_nodes(2))

def setUpClass(cls):
    cls._cloud_name = generic_test_utils.unique_cloud_name("h2o_conf_test")
    cls._conf = unit_test_utils.get_default_spark_conf(cls._spark_options_from_params) \
        .set("spark.ext.h2o.cloud.name", cls._cloud_name)
    cls._spark = SparkSession.builder.config(conf=cls._conf).getOrCreate()
    cls._hc = H2OContext.getOrCreate(
        cls._spark, H2OConf(cls._spark).set_num_of_external_h2o_nodes(1))

def setUpClass(cls):
    cls._spark = SparkSession.builder.config(
        conf=test_utils.get_default_spark_conf().set(
            "spark.ext.h2o.cloud.name", "test-cloud")).getOrCreate()
    test_utils.set_up_class(cls)
    h2o_conf = H2OConf(cls._spark).set_num_of_external_h2o_nodes(2)
    cls._hc = H2OContext.getOrCreate(cls._spark, h2o_conf)

def setUpClass(cls):
    cls._spark = SparkSession.builder.config(
        conf=unit_test_utils.get_default_spark_conf().setMaster("yarn-client")).getOrCreate()
    unit_test_utils.set_up_class(cls)
    cls._hc = H2OContext.getOrCreate(
        cls._spark, H2OConf(cls._spark).set_num_of_external_h2o_nodes(1))