Example #1
def testPy4jGatewayConnection(integ_spark_conf):
    # Secure the Py4j gateway with a shared secret and generated SSL files.
    token = "my_super_secret_token"
    generateSSLFiles(token)
    startJavaGateway(integ_spark_conf, token)
    spark = obtainSparkSession(token)
    # Sanity check: run a trivial job through the secured gateway.
    spark.sparkContext.parallelize([1, 2, 3, 4, 5]).collect()
    # Start an H2OContext on the same session, then shut it down.
    from pysparkling import H2OContext
    hc = H2OContext.getOrCreate()
    print(hc)
    hc.stop()
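For reference, a minimal standalone sketch (an assumption, not part of the test suite) of the same round-trip without the gateway helpers:

from pyspark.sql import SparkSession
from pysparkling import H2OContext

# Plain local SparkSession in place of the secured gateway above.
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.sparkContext.parallelize([1, 2, 3, 4, 5]).collect()
hc = H2OContext.getOrCreate()  # attaches to the active SparkSession
print(hc)
hc.stop()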
Example #2
    def __init__(self, sparkSession, useH2O=False, _unit_testing=False):
        """
        "Automagically" find the JDBC URL and establish a connection
        to the current Splice Machine database
        :param sparkSession: the sparksession object
        :param useH2O: whether or not to start an H2OContext
        :param _unit_testing: whether or not we are unit testing
        """
        PySpliceContext.__init__(self, self.get_jdbc_url(), sparkSession, _unit_testing)

        if useH2O:
            from pysparkling import H2OConf, H2OContext
            h2oConf = H2OConf(sparkSession)
            h2oConf.set_fail_on_unsupported_spark_param_disabled()
            self.hc = H2OContext.getOrCreate(sparkSession, h2oConf)
Example #3
    def __init__(self, sparkSession, useH2O=False, _unit_testing=False):
        """
        Automatically find the JDBC URL and establish a connection
        to the current Splice Machine database
        :param sparkSession: the sparksession object
        :param useH2O: whether or not to start an H2OContext
        :param _unit_testing: whether or not we are unit testing
        """
        try:
            url = os.environ['JDBC_URL']  # requires `import os` at module level
            PySpliceContext.__init__(self, url, sparkSession, _unit_testing)
        except Exception as e:
            print(e)
            print(
                'The SpliceMLContext is only for use on the cloud service. '
                'Please import and use the PySpliceContext instead.\n'
                'Usage:\n'
                '\tfrom splicemachine.spark.context import PySpliceContext\n'
                '\tsplice = PySpliceContext(jdbc_url, sparkSession)'
            )
            return  # __init__ must return None; the original `return -1` would raise a TypeError
        if useH2O:
            from pysparkling import H2OConf, H2OContext
            h2oConf = H2OConf(sparkSession)
            h2oConf.set_fail_on_unsupported_spark_param_disabled()
            self.hc = H2OContext.getOrCreate(sparkSession, h2oConf)
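A hedged usage sketch for the constructor above; the class name SpliceMLContext comes from the error message, and the JDBC URL is a placeholder:

import os
from pyspark.sql import SparkSession

os.environ['JDBC_URL'] = 'jdbc:splice://host:1527/splicedb'  # placeholder; normally set by the cloud service
spark = SparkSession.builder.getOrCreate()
splice = SpliceMLContext(spark, useH2O=True)  # also starts an H2OContext via pysparkling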
Example #4
def get_or_create_h2o_sparkling(h2o_context_params=None,
                                h2o_log_level="ERROR",
                                spark_session_params=None):
    """
    Gets or initializes an H2OSparkling session.

    :param dict h2o_context_params: The parameters based on which the H2OSparkling session is to be initialized
    :param string h2o_log_level: The log level of the H2OSparkling Session
    :param dict spark_session_params: The parameters based on which the Spark session is to be initialized
    :return: The H2OContext instance
    """

    from pysparkling import H2OConf, H2OContext

    # Start SparkSession
    #TODO possibly change this to create spark session outside and pass "spark" as variable
    from mercury_ml.spark.session import get_or_create_spark_session

    if not spark_session_params:
        spark_session_params = {}

    spark = get_or_create_spark_session(**spark_session_params)

    # Start H2OContext
    h2o_conf = H2OConf(spark)
    h2o_conf.set_h2o_node_log_level(h2o_log_level)

    if not h2o_context_params:
        h2o_context_params = {}

    if h2o_context_params.get("auth"):  # requires h2o-pysparkling>=2.2.28
        h2o_context_params["auth"] = tuple(h2o_context_params["auth"])

    h2o_context = H2OContext.getOrCreate(spark,
                                         conf=h2o_conf,
                                         **h2o_context_params)

    return h2o_context
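An illustrative call (the parameter values are assumptions, not defaults):

hc = get_or_create_h2o_sparkling(
    h2o_context_params={"auth": ["user", "secret"]},  # coerced to a tuple above; needs h2o-pysparkling>=2.2.28
    h2o_log_level="WARN",
)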
Example #5
from h2o.estimators.xgboost import H2OXGBoostEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from pyspark.sql import SparkSession
from pysparkling import H2OContext

from user_definition import *

ss = SparkSession.builder.config('spark.ext.h2o.log.level',
                                 'FATAL').getOrCreate()
ss.sparkContext.setLogLevel('OFF')
hc = H2OContext.getOrCreate()

# step 1
# create spark dataframe
train_df = ss.read.parquet(train_folder).repartition(8).cache()
valid_df = ss.read.parquet(valid_folder).repartition(8).cache()

# convert spark dataframe to h2oFrame
train_h2o = hc.asH2OFrame(train_df, "train")
valid_h2o = hc.asH2OFrame(valid_df, "valid")

# convert label column to categorical datatype
train_h2o['label'] = train_h2o['label'].asfactor()
valid_h2o['label'] = valid_h2o['label'].asfactor()

for col, dtype in train_h2o.types.items():  # column name -> H2O dtype
    print(f"{col} - {dtype}")
print('')

# step 2
predictors = train_h2o.names[:]
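A plausible continuation of step 2 (an assumption; the original snippet ends here) using the H2OXGBoostEstimator imported at the top:

predictors.remove('label')  # predict the label from all remaining columns
xgb = H2OXGBoostEstimator(ntrees=50, seed=1)
xgb.train(x=predictors, y='label',
          training_frame=train_h2o, validation_frame=valid_h2o)
print(xgb.model_performance(valid_h2o))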
Example #6
sc.version


# In[1]:


sc.addPyFile("/Users/dt216661/sparkling-water-2.4.5/py/build/dist/h2o_pysparkling_2.4-2.4.5.zip")


# In[2]:


import h2o
from pysparkling import H2OContext
h2o.__version__
hc = H2OContext.getOrCreate(spark)
print(hc)


# # 1. Start H2O cluster inside the Spark environment

# In[4]:


from pysparkling import *
hc = H2OContext.getOrCreate(spark)


# # 2. Parse the data using H2O and convert it to a Spark frame

# In[6]:
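# The original notebook cell is cut off here; what follows is a hedged
# sketch matching the heading above (the file path is a placeholder).
import h2o
h2o_frame = h2o.import_file("data.csv")    # parse with H2O
spark_frame = hc.asSparkFrame(h2o_frame)   # convert to a Spark DataFrame
spark_frame.printSchema()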