Example #1
def __default_h2o_connect(h2o_context, **kwargs):
    if "https" in kwargs:
        warnings.warn("https argument is automatically set up and the specified value will be ignored.")
    schema = h2o_context._jhc.h2oContext().getScheme(h2o_context._jhc.h2oContext()._conf())
    kwargs["https"] = False
    if schema == "https":
        kwargs["https"] = True
    if h2o_context._conf.context_path() is not None:
        url = "{}://{}:{}/{}".format(schema, h2o_context._client_ip, h2o_context._client_port, h2o_context._conf.context_path())
        return h2o.connect(url=url, **kwargs)
    else:
        return h2o.connect(ip=h2o_context._client_ip, port=h2o_context._client_port, **kwargs)
Example #2
def start():
    """Starts h2o session if it is not running in the server.
	   If it already is running, then connect to it.
	"""
    try:
        print("[INFO] Checking if H2O is already running in the server.")
        h2o.connect()
    except Exception:
        print("[INFO] H2O is not running. Starting H2O now.")
        os.system('../bin/deploy_h2o.sh')
        time.sleep(3)
        h2o.connect()
    return h2o
Example #3
    def getOrCreate(spark, conf=None, **kwargs):
        """
         Get existing or create new H2OContext based on provided H2O configuration. If the conf parameter is set then
         configuration from it is used. Otherwise the configuration properties passed to Sparkling Water are used.
         If the values are not found the default values are used in most of the cases. The default cluster mode
         is internal, ie. spark.ext.h2o.external.cluster.mode=false

         param - Spark Context or Spark Session
         returns H2O Context
        """

        spark_session = spark
        if isinstance(spark, SparkContext):
            warnings.warn(
                "Method H2OContext.getOrCreate with argument of type SparkContext is deprecated and "
                + "parameter of type SparkSession is preferred.")
            spark_session = SparkSession.builder.getOrCreate()

        h2o_context = H2OContext(spark_session)

        jvm = h2o_context._jvm  # JVM
        jspark_session = h2o_context._jspark_session  # Java Spark Session

        if conf is not None:
            selected_conf = conf
        else:
            selected_conf = H2OConf(spark_session)
        # Create backing Java H2OContext
        jhc = jvm.org.apache.spark.h2o.JavaH2OContext.getOrCreate(
            jspark_session, selected_conf._jconf)
        h2o_context._jhc = jhc
        h2o_context._conf = selected_conf
        h2o_context._client_ip = jhc.h2oLocalClientIp()
        h2o_context._client_port = jhc.h2oLocalClientPort()
        # Create H2O REST API client
        h2o.connect(ip=h2o_context._client_ip,
                    port=h2o_context._client_port,
                    **kwargs)
        h2o_context.is_initialized = True

        print(h2o_context)

        # Stop H2O when running standalone pysparkling scripts, but only in client deploy mode,
        # so the user does not have to close H2O explicitly.
        # In cluster deploy mode the application would call exit, which the Spark AM handles as a failure.
        deploy_mode = spark_session.sparkContext._conf.get(
            "spark.submit.deployMode")
        if deploy_mode != "cluster":
            atexit.register(lambda: h2o_context.stop_with_jvm())
        return h2o_context
Example #4
    def __init__(self,
                 ip: str = '',
                 port: str = '',
                 settings_file_name: str = 'settings.ini'):
        """Init."""
        self.config = configparser.ConfigParser()
        self.config.read(settings_file_name, encoding='utf-8')
        for key, value in self.config['MAIN'].items():
            setattr(self, key, value)
        h2o.connect(ip=ip,
                    port=port,
                    auth=(self.login, self.password),
                    verbose=False)
        h2o.no_progress()
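The constructor above copies every key of the [MAIN] section in settings.ini onto the instance and expects at least login and password for the auth tuple; a minimal hedged sketch of such a file (all values hypothetical):

# settings.ini (hypothetical contents assumed by the __init__ above)
[MAIN]
login = h2o_user
password = secret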
Example #5
def h2o_test_setup(sys_args):
    h2o_py_dir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),".."))
    h2o_docs_dir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..","h2o-docs"))

    parse_args(sys_args)

    sys.path.insert(1, h2o_py_dir)
    import h2o
    from tests import pyunit_utils, pydemo_utils, pybooklet_utils

    for pkg in (pyunit_utils, pybooklet_utils):
        setattr(pkg, '__on_hadoop__', _ON_HADOOP_)
        setattr(pkg, '__hadoop_namenode__', _HADOOP_NAMENODE_)
        setattr(pkg, '__test_name__', _TEST_NAME_)
        setattr(pkg, '__results_dir__', _RESULTS_DIR_)

    if not (_IS_PYUNIT_ or _IS_IPYNB_ or _IS_PYBOOKLET_ or _IS_PYDEMO_):
        raise EnvironmentError("Unrecognized test type. Must be of type ipynb, pydemo, pyunit, or pybooklet, but got: "
                               "{0}".format(_TEST_NAME_))

    print("[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Connect to h2o on IP: {0} PORT: {1}".format(_H2O_IP_, _H2O_PORT_)))
    auth = None
    if _LDAP_USER_NAME_ is not None and _LDAP_PASSWORD_ is not None:
        auth = (_LDAP_USER_NAME_, _LDAP_PASSWORD_)
    elif _KERB_PRINCIPAL_ is not None:
        from h2o.auth import SpnegoAuth
        auth = SpnegoAuth(service_principal=_KERB_PRINCIPAL_)
    h2o.connect(ip=_H2O_IP_, port=_H2O_PORT_, verbose=False, auth=auth, **_H2O_EXTRA_CONNECT_ARGS_)
    h2o.utils.config.H2OConfigReader.get_config()["general.allow_breaking_changes"] = True

    #rest_log = os.path.join(_RESULTS_DIR_, "rest.log")
    #h2o.start_logging(rest_log)
    #print "[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Started rest logging in: {0}".format(rest_log))

    h2o.log_and_echo("------------------------------------------------------------")
    h2o.log_and_echo("")
    h2o.log_and_echo("STARTING TEST: " + _TEST_NAME_)
    h2o.log_and_echo("")
    h2o.log_and_echo("------------------------------------------------------------")

    h2o.remove_all()

    if _IS_IPYNB_:       pydemo_utils.ipy_notebook_exec(_TEST_NAME_)
    elif _IS_PYUNIT_:    pyunit_utils.pyunit_exec(_TEST_NAME_)
    elif _IS_PYBOOKLET_: pybooklet_utils.pybooklet_exec(_TEST_NAME_)
    elif _IS_PYDEMO_:    pydemo_utils.pydemo_exec(_TEST_NAME_)
Example #6
def __default_h2o_connect(h2o_context, **kwargs):
    schema = h2o_context._jhc.h2oContext().getScheme(
        h2o_context._jhc.h2oContext()._conf())
    https = False
    if schema == "https":
        https = True
    if h2o_context._conf.context_path() is not None:
        url = "{}://{}:{}/{}".format(schema, h2o_context._client_ip,
                                     h2o_context._client_port,
                                     h2o_context._conf.context_path())
        return h2o.connect(url=url, https=https, **kwargs)
    else:
        return h2o.connect(ip=h2o_context._client_ip,
                           port=h2o_context._client_port,
                           https=https,
                           **kwargs)
Example #7
def predict_churn(State, AccountLength, AreaCode, Phone, IntlPlan, VMailPlan,
                  VMailMessage, DayMins, DayCalls, DayCharge, EveMins,
                  EveCalls, EveCharge, NightMins, NightCalls, NightCharge,
                  IntlMins, IntlCalls, IntlCharge, CustServCalls):
    # connect to the model scoring service
    h2o.connect()

    # open the downloaded model
    ChurnPredictor = h2o.load_model(path='AutoML-leader')

    # define a feature vector to evaluate with the model
    newData = pd.DataFrame(
        {
            'State': State,
            'Account Length': AccountLength,
            'Area Code': AreaCode,
            'Phone': Phone,
            'Int\'l Plan': IntlPlan,
            'VMail Plan': VMailPlan,
            'VMail Message': VMailMessage,
            'Day Mins': DayMins,
            'Day Calls': DayCalls,
            'Day Charge': DayCharge,
            'Eve Mins': EveMins,
            'Eve Calls': EveCalls,
            'Eve Charge': EveCharge,
            'Night Mins': NightMins,
            'Night Calls': NightCalls,
            'Night Charge': NightCharge,
            'Intl Mins': IntlMins,
            'Intl Calls': IntlCalls,
            'Intl Charge': IntlCharge,
            'CustServ Calls': CustServCalls
        },
        index=[0])

    # evaluate the feature vector using the model
    predictions = ChurnPredictor.predict(h2o.H2OFrame(newData))
    predictionsOut = h2o.as_list(predictions, use_pandas=False)
    prediction = predictionsOut[1][0]
    probabilityChurn = predictionsOut[1][1]
    probabilityRetain = predictionsOut[1][2]
    return "Prediction: " + str(prediction) + " |Probability to Churn: " + str(
        probabilityChurn) + " |Probability to Retain: " + str(
            probabilityRetain)
Example #8
def grid_search(training_df, attribute_property_length):
    h2o.init()
    h2o.connect()
    training_array = training_df.values
    x = training_array[:, 0:attribute_property_length]
    y = training_array[:, attribute_property_length - 1]
    tr_df = h2o.H2OFrame(x)
    training_columns = [
        'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10'
    ]
    response_column = 'C11'
    hyper_parameters = {'ntrees': [15, 20, 25], 'max_depth': [15, 20]}
    random_plus_manual = H2OGridSearch(
        H2ORandomForestEstimator(nfolds=n_splits), hyper_parameters)
    random_plus_manual.train(x=training_columns,
                             y=response_column,
                             training_frame=tr_df)
    random_plus_manual.show()
Example #9
def __default_h2o_connect(h2o_context, **kwargs):
    if "https" in kwargs:
        warnings.warn(
            "https argument is automatically set up and the specified value will be ignored."
        )
    schema = h2o_context._jhc.h2oContext().getConf().getScheme()
    kwargs["https"] = False
    if schema == "https":
        kwargs["https"] = True
    if h2o_context._conf.context_path() is not None:
        url = "{}://{}:{}/{}".format(schema, h2o_context._client_ip,
                                     h2o_context._client_port,
                                     h2o_context._conf.context_path())
        return h2o.connect(url=url, **kwargs)
    else:
        return h2o.connect(ip=h2o_context._client_ip,
                           port=h2o_context._client_port,
                           **kwargs)
Example #10
def churn_predict_batch(batchFile):
    # connect to the model scoring service
    h2o.connect(verbose=False)

    # load the user-specified file
    newData = h2o.import_file(batchFile)

    # open the downloaded model
    ChurnPredictor = h2o.load_model(path=downloaded_model)

    # evaluate the feature vector using the model
    predictions = ChurnPredictor.predict(newData)
    predictions = newData.cbind(predictions)
    h2o.download_csv(predictions, 'predictions.csv')

    upload_file_to_s3('predictions.csv')
    successMessage2 = "Predictions saved https://s3-us-west-1.amazonaws.com/dsclouddata/home/jupyter/predictions.csv"
    return successMessage2
Example #11
    def get_h2o_predictions(training_dataset, testing_dataset,
                            attribute_property_length):
        training_df = RandomForest.get_property_data_frame(
            training_dataset, one_hot_encoding=False)
        testing_df = RandomForest.get_property_data_frame(
            testing_dataset, one_hot_encoding=False)
        h2o.init()
        h2o.connect()
        training_array = training_df.values
        testing_array = testing_df.values
        x = training_array[:, 0:attribute_property_length]
        y = training_array[:, attribute_property_length - 1]

        x_test = testing_array[:, 0:attribute_property_length]
        ts_df = h2o.H2OFrame(x_test)
        tr_df = h2o.H2OFrame(x)

        training_columns = [
            'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11',
            'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20',
            'C21', 'C22'
        ]
        response_column = 'C23'
        model = H2ORandomForestEstimator(ntrees=num_trees,
                                         max_depth=max_depth,
                                         nfolds=n_splits)
        model.train(x=training_columns,
                    y=response_column,
                    training_frame=tr_df,
                    validation_frame=ts_df)
        predictions = model.predict(ts_df)
        model.show()
        print(model.varimp(True))
        predictions_array = []
        probabilities_array = []
        predictions = predictions.as_data_frame().values.tolist()
        for i in range(len(predictions)):
            if predictions[i][0] >= 0.5:
                predictions_array.append(1.0)
            else:
                predictions_array.append(0.0)
            probabilities_array.append(predictions[i][0])
        return predictions_array, probabilities_array
Example #12
def h2oconnect():
    """
    Python API test: h2o.connect(server=None, url=None, ip=None, port=None, https=None, verify_ssl_certificates=None,
     auth=None, proxy=None,cookies=None, verbose=True)
    """
    ipA = "127.0.0.1"
    portN = "54321"
    urlS = "http://127.0.0.1:54321"

    try:
        connect_type = h2o.connect(ip=ipA, port=portN, verbose=True)
        assert_is_type(connect_type, H2OConnection)
    except Exception as e:  # port number may not match.  Make sure the right error message is returned
        assert 'Could not establish link' in e.args[0], "h2o.connect command is not working."

    try:
        connect_type2 = h2o.connect(url=urlS, https=True, verbose=True)  # pass if no connection issue
        assert_is_type(connect_type2, H2OConnection)
    except Exception as e:  # port number may not match.  Make sure the right error message is returned
        assert 'Could not establish link' in e.args[0], "h2o.connect command is not working."
Example #13
def test_cacert_in_config():
    cfg = {
        "ip": "self-signed.badssl.com",
        "port": 443,
        "verify_ssl_certificates": True,
        "https": True
    }
    try:
        h2o.connect(config=cfg)
        assert False
    except H2OConnectionError as e:
        assert "CERTIFICATE_VERIFY_FAILED" in str(e)

    cfg["cacert"] = pyunit_utils.locate(
        "smalldata/certs/badssl-cacert-2020.pem")
    try:
        h2o.connect(config=cfg)
        assert False
    except H2OConnectionError as e:
        # any response is a good response - TLS handshake was successful which proves the certificate was used
        assert "HTTP 404 Not Found" in str(e)
Example #14
def approve_loan(Loan_Amount, Term, Interest_Rate, Employment_Years,
                 Home_Ownership, Annual_Income, Verification_Status,
                 Loan_Purpose, State, Debt_to_Income, Delinquent_2yr,
                 Revolving_Cr_Util, Total_Accounts, Longest_Credit_Length):
    # connect to the model scoring service
    h2o.connect()

    # open the downloaded model
    ChurnPredictor = h2o.load_model(path='DRF_model_1496459915419_4')

    # define a feature vector to evaluate with the model
    newData = pd.DataFrame(
        {
            'Loan_Amount': Loan_Amount,
            'Term': Term,
            'Interest_Rate': Interest_Rate,
            'Employment_Years': Employment_Years,
            'Home_Ownership': Home_Ownership,
            'Annual_Income': Annual_Income,
            'Verification_Status': Verification_Status,
            'Loan_Purpose': Loan_Purpose,
            'State': State,
            'Debt_to_Income': Debt_to_Income,
            'Delinquent_2yr': Delinquent_2yr,
            'Revolving_Cr_Util': Revolving_Cr_Util,
            'Total_Accounts': Total_Accounts,
            'Longest_Credit_Length': Longest_Credit_Length
        },
        index=[0])

    # evaluate the feature vector using the model
    predictions = ChurnPredictor.predict(h2o.H2OFrame(newData))
    predictionsOut = h2o.as_list(predictions, use_pandas=False)
    prediction = predictionsOut[1][0]
    probabilityBad = predictionsOut[1][1]
    probabilityGood = predictionsOut[1][2]
    return "Prediction: " + str(
        prediction) + " |Probability of Bad Loan: " + str(
            probabilityBad) + " |Probability of Good Loan: " + str(
                probabilityGood)
Example #15
def h2o_test_setup(sys_args):
    h2o_py_dir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),".."))
    h2o_docs_dir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..","h2o-docs"))

    parse_args(sys_args)

    sys.path.insert(1, h2o_py_dir)
    import h2o
    from tests import pyunit_utils, pydemo_utils, pybooklet_utils

    set_pyunit_pkg_attrs(pyunit_utils)
    set_pybooklet_pkg_attrs(pybooklet_utils)

    if not (_IS_PYUNIT_ or _IS_IPYNB_ or _IS_PYBOOKLET_ or _IS_PYDEMO_):
        raise EnvironmentError("Unrecognized test type. Must be of type ipynb, pydemo, pyunit, or pybooklet, but got: "
                               "{0}".format(_TEST_NAME_))

    print("[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Connect to h2o on IP: {0} PORT: {1}".format(_H2O_IP_, _H2O_PORT_)))
    h2o.connect(ip=_H2O_IP_, port=_H2O_PORT_, verbose=False)
    h2o.utils.config.H2OConfigReader.get_config()["general.allow_breaking_changes"] = True

    #rest_log = os.path.join(_RESULTS_DIR_, "rest.log")
    #h2o.start_logging(rest_log)
    #print "[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Started rest logging in: {0}".format(rest_log))

    h2o.log_and_echo("------------------------------------------------------------")
    h2o.log_and_echo("")
    h2o.log_and_echo("STARTING TEST: " + _TEST_NAME_)
    h2o.log_and_echo("")
    h2o.log_and_echo("------------------------------------------------------------")

    h2o.remove_all()

    if _IS_IPYNB_:       pydemo_utils.ipy_notebook_exec(_TEST_NAME_)
    elif _IS_PYUNIT_:    pyunit_utils.pyunit_exec(_TEST_NAME_)
    elif _IS_PYBOOKLET_: pybooklet_utils.pybooklet_exec(_TEST_NAME_)
    elif _IS_PYDEMO_:    pydemo_utils.pydemo_exec(_TEST_NAME_)
Example #16
    def getOrCreate(spark, conf=None, **kwargs):
        """
         Get existing or create new H2OContext based on provided H2O configuration. If the conf parameter is set then
         configuration from it is used. Otherwise the configuration properties passed to Sparkling Water are used.
         If the values are not found the default values are used in most of the cases. The default cluster mode
         is internal, ie. spark.ext.h2o.external.cluster.mode=false

         param - Spark Context or Spark Session
         returns H2O Context
        """

        spark_session = spark
        if isinstance(spark, SparkContext):
            warnings.warn("Method H2OContext.getOrCreate with argument of type SparkContext is deprecated and " +
                          "parameter of type SparkSession is preferred.")
            spark_session = SparkSession.builder.getOrCreate()

        h2o_context = H2OContext(spark_session)

        jvm = h2o_context._jvm  # JVM
        jsc = h2o_context._jsc  # JavaSparkContext

        if conf is not None:
            selected_conf = conf
        else:
            selected_conf = H2OConf(spark_session)
        # Create backing Java H2OContext
        jhc = jvm.org.apache.spark.h2o.JavaH2OContext.getOrCreate(jsc, selected_conf._jconf)
        h2o_context._jhc = jhc
        h2o_context._conf = selected_conf
        h2o_context._client_ip = jhc.h2oLocalClientIp()
        h2o_context._client_port = jhc.h2oLocalClientPort()
        # Create H2O REST API client
        h2o.connect(ip=h2o_context._client_ip, port=h2o_context._client_port, **kwargs)
        h2o_context.is_initialized = True
        # Stop H2O when running standalone pysparkling scripts, so the user does not have to close H2O explicitly
        atexit.register(lambda: h2o_context.stop_with_jvm())
        return h2o_context
Example #17
    def __h2o_connect(h2o_context):
        schema = h2o_context._jhc.getConf().getScheme()
        conf = h2o_context._conf

        kwargs = {}
        kwargs["https"] = schema == "https"
        kwargs["verify_ssl_certificates"] = conf.verifySslCertificates()
        if conf.userName() and conf.password():
            kwargs["auth"] = (conf.userName(), conf.password())
        url = "{}://{}:{}".format(schema, h2o_context._client_ip,
                                  h2o_context._client_port)
        if conf.contextPath() is not None:
            url = "{}/{}".format(url, conf.contextPath())
        return h2o.connect(url=url, **kwargs)
Example #18
def h2oconnect():
    """
    Python API test: h2o.connect(server=None, url=None, ip=None, port=None, https=None, verify_ssl_certificates=None,
     auth=None, proxy=None, cluster_id=None, cookies=None, verbose=True)
    """
    ipA = "127.0.0.1"
    portN = "54321"
    urlS = "http://127.0.0.1:54321"

    try:
        connect_type = h2o.connect(ip=ipA, port=portN, verbose=True)
        assert_is_type(connect_type, H2OConnection)
    except Exception as e:  # port number may not match.  Make sure the right error message is returned
        assert 'Could not establish link' in e.args[
            0], "h2o.connect command is not working."

    try:
        connect_type2 = h2o.connect(
            url=urlS, https=True, verbose=True)  # pass if no connection issue
        assert_is_type(connect_type2, H2OConnection)
    except Exception as e:  # port number may not match.  Make sure the right error message is returned
        assert 'Could not establish link' in e.args[
            0], "h2o.connect command is not working."
Example #19
    def do_h2o_kmeans(self, dataset, server_url):
        """Use the h2o module to perform k-means clustering.

        This method delegates clustering to an H2O server instance (local or remote). A connection attempt is
        made to the provided server_url before clustering is initiated.
        input:
            :param dataset: input data - term document matrix
            :param server_url: URL of the H2O server instance on which clustering will run
        output:
            labels_: a list of cluster identifiers - 1 per input document
        :raises ConnectionError"""

        # establish connection to H2O server
        try:
            h2o.connect(url=server_url, verbose=False)
            logging.info("connected to H2O server")
            h2o_dataframe = h2o.H2OFrame(python_obj=dataset)
            self.model = H2OKMeansEstimator(max_iterations=self.config.NITER, k=self.config.NCLUSTERS, init="PlusPlus",
                                            standardize=False)
            self.model.train(training_frame=h2o_dataframe)
            logging.info("modelling complete. predicting cluster membership")
            return self.model.predict(h2o_dataframe)["predict"].as_data_frame(use_pandas=False, header=False)
        except H2OConnectionError:
            logging.error("unable to connect to H2O server @ {0}".format(server_url))
            raise ConnectionError("unable to connect to H2O server. check if server is running at specified URL")
Example #20
  recognize bugs in h2o.init() for this mode of operation.
  For this ticket, I think we should create a set of tests that check that h2o.init() is successful for each
  OS/client interface combination.

  Below is the test that will be implemented:
"""
from __future__ import print_function
import sys
sys.path.insert(0, "../..")
import h2o
from h2o.backend import H2OLocalServer
from h2o.exceptions import H2OConnectionError

PORT = 55330

# Check whether there is already an instance running at the specified port, and if so shut it down.
try:
    conn = h2o.connect(port=PORT)
    conn.shutdown_server(prompt=False)
except H2OConnectionError:
    pass


# Now start a new H2O server and connect to it.
server = H2OLocalServer.start(port=str(PORT) + "+")
conn = h2o.connect(server=server)

# Check whether the cluster is up (True) or not (False)
cluster_up = conn.cluster_is_up()

# Check if cluster is healthy
cluster_healthy = all(node["healthy"] for node in conn.info().nodes)
Example #21
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator as gbm
from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm
from h2o.estimators.deeplearning import H2ODeepLearningEstimator as dlm

h2o.connect(ip="localhost", port="54535")

bank = h2o.import_file(
    path=
    "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/bank-full.csv.zip"
)
arr = h2o.import_file(
    path=
    "http://s3.amazonaws.com/h2o-public-test-data/smalldata/flow_examples/arrhythmia.csv.gz"
)

model = glm(family="binomial", model_id="first")
multi = glm(family="multinomial", model_id="multinom")
valid = glm(family="multinomial", model_id="valimon")
regr = glm(model_id="regress")
grad = gbm(model_id="gradi")

#bank data columns
#1 for multinomial
#16 for binomial
#5 for regression

model.train(y=16, x=range(16), training_frame=bank)
multi.train(y=1, x=[i for i in range(17) if i != 1], training_frame=bank)
model = glm(family="binomial", model_id="second")
model.train(y=16,
Example #22
def __default_h2o_connect(h2o_context, **kwargs):
    return h2o.connect(ip=h2o_context._client_ip,
                       port=h2o_context._client_port,
                       **kwargs)
Example #23
import os
import h2o
import pandas as pd
import uuid
from .meta import questions
import copy

h2o.connect(ip="localhost")


class Coach:
    def __init__(self):

        self.model = h2o.load_model(os.environ.get(
            "COACH_MODEL"))  # import_mojo(os.environ.get("COACH_MODEL"))

        self._get_actionable_q()

    def get_model_predictors(self):
        var_imp = self.model._model_json['output'][
            'variable_importances'].as_data_frame()
        return var_imp["variable"]

    def _prepare_evaluation(self, df_input):
        all_predictors = self.get_model_predictors()

        for p in all_predictors:
            if p not in df_input:
                print("Warning: %s missing in inputVector" % p)
            else:
                try:
Example #24
For this ticket, I think we should create a set of tests that check that h2o.init() is successful for each
OS/client interface combination.
"""
from __future__ import print_function
import sys
sys.path.insert(0, "../..")
import h2o
from h2o.backend import H2OLocalServer
from h2o.exceptions import H2OConnectionError

PORT = 55330

# Check whether there is already an instance running at the specified port, and if so shut it down.
try:
    conn = h2o.connect(ip="localhost", port=PORT)
    conn.cluster.shutdown(prompt=False)
except H2OConnectionError:
    pass

# The server takes some time to shut down, so try different ports
print("Start a server with max_mem_size = 1Gb")
h2o.init(max_mem_size="1g", port=10101, strict_version_check=False)
h2o.cluster().shutdown()

print("Starting a server with min_mem_size = 314Mb")
h2o.init(min_mem_size="314M", port=20202, strict_version_check=False)
h2o.cluster().shutdown()

print("Starting a server explicitly")
# Now start a new H2O server and connect to it.
Example #25
    def test_grid_reload(self):
        name_node = utils.hadoop_namenode()
        work_dir = utils.get_workdir()
        dataset = "/datasets/iris_wheader.csv"

        ntrees_opts = [100, 120, 130, 140]
        learn_rate_opts = [0.01, 0.02, 0.03, 0.04]
        grid_size = len(ntrees_opts) * len(learn_rate_opts)
        print("max models %s" % grid_size)
        grid_id = "grid_ft_resume"
        hyper_parameters = {
            "learn_rate": learn_rate_opts,
            "ntrees": ntrees_opts
        }

        cluster_1_name = "grid1-py"
        recovery_dir_1 = work_dir + "/recovery1"
        try:
            cluster_1 = utils.start_cluster(cluster_1_name)
            h2o.connect(url=cluster_1)
            train = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
            grid = H2OGridSearch(
                H2OGradientBoostingEstimator,
                grid_id=grid_id,
                hyper_params=hyper_parameters,
                recovery_dir=recovery_dir_1
            )
            print("starting initial grid and sleeping...")
            grid.start(x=list(range(4)), y=4, training_frame=train)
            grid_in_progress = None
            times_waited = 0
            while (times_waited < 20) and (grid_in_progress is None or len(grid_in_progress.model_ids) == 0):
                time.sleep(5)  # give it time to train some models
                times_waited += 1
                try:
                    grid_in_progress = h2o.get_grid(grid_id)
                except IndexError:
                    print("no models trained yet")
            print("done sleeping")
            h2o.connection().close()
        finally:
            utils.stop_cluster(cluster_1_name)

        cluster_2_name = "grid2-py"
        recovery_dir_2 = work_dir + "/recovery2"
        try:
            cluster_2 = utils.start_cluster(cluster_2_name)
            h2o.connect(url=cluster_2)
            loaded = h2o.load_grid("%s/%s" % (recovery_dir_1, grid_id), load_params_references=True)
            print("models after first run:")
            for x in sorted(loaded.model_ids):
                print(x)
            loaded.resume(recovery_dir=recovery_dir_2)
            print("models after second run:")
            for x in sorted(loaded.model_ids):
                print(x)
            print("Newly grained grid has %d models" % len(loaded.model_ids))
            self.assertEqual(len(loaded.model_ids), grid_size, "The full grid was not trained.")
            h2o.connection().close()
        finally:
            utils.stop_cluster(cluster_2_name)
Example #27
    try:
        s = socket.socket()
        s.connect(("127.0.0.1", 54321))
        h2o_launched = True
    except Exception as e:
        time.sleep(6)
        if i % 5 == 0:
            print("Attempt {}: H2O-3 not running yet...".format(i))
        if i > 30:
            raise Exception("""Could not connect to H2O Cluster in {} attempts
                               Last Error: {}""".format(i, e))
        i += 1
    finally:
        s.close()

    h2o.connect(url="http://127.0.0.1:54321")


class ScoringService(object):
    model = None                # Where we keep the model when it's loaded

    @classmethod
    def get_model(cls):
        """Get the model object for this instance,
        loading it if it's not already loaded."""
        if cls.model is None:
            for file in os.listdir(model_path):
                # Assumes that 'AutoML' is somewhere in the filename of a
                # model that's been generated. We just load the first model
                # that satisfies this constraint, so caveat emptor if you've
                # run the 'train' script multiple times - this may still load
Example #28
#! /usr/bin/python
import h2o
import sys
import os
import pandas as pd

cwd = os.getcwd()  # os.getcwdu() is Python 2 only
dataset = cwd + "/dataset/unsw-nb15_mereged.zip"
h2o.connect(ip="localhost", port="54321")
columns_types = [
    "enum", "numeric", "enum", "numeric", "enum", "enum", "numeric", "numeric",
    "numeric", "numeric", "numeric", "numeric", "numeric", "enum", "numeric",
    "numeric", "numeric", "numeric", "numeric", "numeric", "numeric",
    "numeric", "numeric", "numeric", "numeric", "numeric", "numeric",
    "numeric", "time", "time", "numeric", "numeric", "numeric", "numeric",
    "numeric", "numeric", "numeric", "numeric", "numeric", "numeric",
    "numeric", "numeric", "numeric", "numeric", "numeric", "numeric",
    "numeric", "enum", "enum"
]
data = h2o.import_file(
    path=dataset,
    destination_frame="unsw-nb15_mereged",
    col_types=columns_types,
)
response_column = "label"
predictor_columns = data.names
predictor_columns.remove(response_column)
predictor_columns.remove("srcip")
predictor_columns.remove("dstip")
predictor_columns.remove("attack_cat")
# training, test = data.split_frame(
Example #29
import pandas as pd
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

con = h2o.connect(url='http://192.168.5.208:54321/')

csv_data = pd.read_csv('股票数据/处理后数据/processed_601857.csv', encoding='utf8')
csv_data['earn'] = csv_data['20_closing_price'] > csv_data['closing_price']*1.2
csv_data_ = h2o.H2OFrame(csv_data)
model = H2OGradientBoostingEstimator(model_id='stock_601857', nfolds=10,
                                     distribution="bernoulli", ntrees=2000, max_depth=10,
                                     learn_rate=0.4, histogram_type="UniformAdaptive",
                                     min_split_improvement=0.000001,
                                     balance_classes=False, seed=52345,
                                     stopping_rounds=5, stopping_metric='AUC', stopping_tolerance=0.001,
                                     col_sample_rate=0.6, col_sample_rate_per_tree=0.6,
                                     col_sample_rate_change_per_level=0.6, sample_rate=0.85, min_rows=100)

training_data, test_data = csv_data_.split_frame(ratios=[0.8], destination_frames=["train_frame", "test_data"])
model.train(x=['closing_price', 'upping_ratio',
       'changing_ratio', 'volume', 'upping_ratio1',
       'upping_ratio2', 'upping_ratio3', 'upping_ratio4', 'upping_ratio5',
       'A_index_closing_price', 'A_index_upping_money', 'A_index_upping_ratio',
       'A_index_volume', 'A_index_volume_money', 'B_index_closing_price',
       'B_index_upping_money', 'B_index_upping_ratio', 'B_index_volume',
       'B_index_volume_money', 'top50_index_closing_price',
       'top50_index_upping_money', 'top50_index_upping_ratio',
       'top50_index_volume', 'top50_index_volume_money',
       'sh_index_closing_price', 'sh_index_upping_money',
Example #30
  OS/client interface combination.

  Below is the test that will be implemented:
"""
from __future__ import print_function
import sys
sys.path.insert(0, "../..")
import h2o
from h2o.backend import H2OLocalServer
from h2o.exceptions import H2OConnectionError

PORT = 55330

# Check whether there is already an instance running at the specified port, and if so shut it down.
try:
    conn = h2o.connect(ip="localhost", port=PORT)
    conn.shutdown_server(prompt=False)
except H2OConnectionError:
    pass

# Now start a new H2O server and connect to it.
server = H2OLocalServer.start(port=str(PORT) + "+")
conn = h2o.connect(server=server)

# Check whether the cluster is up (True) or not (False)
cluster_up = conn.cluster_is_up()

# Check if cluster is healthy
cluster_healthy = all(node["healthy"] for node in conn.info().nodes)

# Logical test to see if status is healthy or not
Example #31
def processAudio():
    import librosa
    import librosa.display
    import pandas as pd
    import numpy as np
    import parselmouth
    from parselmouth.praat import call
    from parselmouth import MFCC
    import matplotlib.pyplot as plt

    import h2o
    from h2o.grid.grid_search import H2OGridSearch
    from h2o.estimators.deeplearning import H2OAutoEncoderEstimator, H2ODeepLearningEstimator
    from sklearn import preprocessing
    from flask import jsonify  # assumed source of the jsonify call at the end of this function
    import os

    h2o.init(ip="127.0.0.1", max_mem_size_GB=2)  # h2o.init() already opens a connection
    h2o.connect()
    f0min, f0max = 70, 600
    unit = "Hertz"
    wave_file = 'Audio5780917.wav'
    y, sr = librosa.load(wave_file)
    time = librosa.get_duration(y=y, sr=sr)
    sound = parselmouth.Sound(wave_file)
    print("Processing {}...".format(wave_file))
    duration = call(sound, "Get total duration") # duration
    #ff0min, f0max=75,600 default
    pitch = call(sound, "To Pitch", 0.0, f0min, f0max) #create a praat pitch object
    pitchMean = call(pitch, "Get mean", 0, 0, unit) # get mean pitch
    PitchStdev = call(pitch, "Get standard deviation", 0 ,0, unit) # get standard deviation
  #  mfcc = call(sound , 'To MelSpectrogram...', 0, 0, 0.0001, 0.02, 1.3, 1.6)
    harmonicity = call(sound, "To Harmonicity (cc)", 0.01, f0min, 0.1, 1.0)
    hnr = call(harmonicity, "Get mean", 0, 0)
    pointProcess = call(sound, "To PointProcess (periodic, cc)", f0min, f0max) 
    localJitter = call(pointProcess, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
    localabsoluteJitter = call(pointProcess, "Get jitter (local, absolute)", 0, 0, 0.0001, 0.02, 1.3)
    rapJitter = call(pointProcess, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3)
    ppq5Jitter = call(pointProcess, "Get jitter (ppq5)", 0, 0, 0.0001, 0.02, 1.3)
    ddpJitter = call(pointProcess, "Get jitter (ddp)", 0, 0, 0.0001, 0.02, 1.3)
    localShimmer =  call([sound, pointProcess], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    localdbShimmer = call([sound, pointProcess], "Get shimmer (local_dB)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    apq3Shimmer = call([sound, pointProcess], "Get shimmer (apq3)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    aqpq5Shimmer = call([sound, pointProcess], "Get shimmer (apq5)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    apq11Shimmer =  call([sound, pointProcess], "Get shimmer (apq11)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    ddaShimmer = call([sound, pointProcess], "Get shimmer (dda)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

    formants = call(sound, "To Formant (burg)", 0.0025, 5, 5000, 0.025, 50)
    numPoints = call(pointProcess, "Get number of points")
    f1_list = []
    f2_list = []
    f3_list = []
    f4_list = []
    
    # Measure formants only at glottal pulses
    for point in range(0, numPoints):
        point += 1
        t = call(pointProcess, "Get time from index", point)
        f1 = call(formants, "Get value at time", 1, t, 'Hertz', 'Linear')
        f2 = call(formants, "Get value at time", 2, t, 'Hertz', 'Linear')
        f3 = call(formants, "Get value at time", 3, t, 'Hertz', 'Linear')
        f4 = call(formants, "Get value at time", 4, t, 'Hertz', 'Linear')
        f1_list.append(f1)
        f2_list.append(f2)
        f3_list.append(f3)
        f4_list.append(f4)
    
    f1_list = [f1 for f1 in f1_list if str(f1) != 'nan']
    f2_list = [f2 for f2 in f2_list if str(f2) != 'nan']
    f3_list = [f3 for f3 in f3_list if str(f3) != 'nan']
    f4_list = [f4 for f4 in f4_list if str(f4) != 'nan']
    
    # calculate mean formants across pulses
    f1_mean = np.mean(f1_list)
    f2_mean = np.mean(f2_list)
    f3_mean = np.mean(f3_list)
    f4_mean = np.mean(f4_list)


    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rmse = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)

    df = pd.DataFrame([[pitchMean ,PitchStdev , hnr ,  np.mean(chroma_stft) ,  np.mean(rmse)  , np.mean(spec_cent)  ,
                  np.mean(spec_bw)  , np.mean(rolloff) ,  np.round(localJitter,6)  , np.round(localabsoluteJitter,6)  ,
                  np.round(rapJitter,6)  , np.round(ppq5Jitter,6)  ,  np.round(ddpJitter,6)  , np.round(localShimmer,6)  ,
                  np.round(localdbShimmer,6)  , np.round(aqpq5Shimmer,6)  , np.round(apq11Shimmer,6)  , np.round(ddaShimmer,6) ,
                  f1_mean  , f2_mean ,  f3_mean  , f4_mean, mfcc[0].mean(),mfcc[1].mean(),mfcc[2].mean(), mfcc[3].mean(),
                  mfcc[4].mean(),mfcc[5].mean(),mfcc[6].mean(),mfcc[7].mean(),mfcc[8].mean(),mfcc[9].mean(),mfcc[10].mean(),
                  mfcc[11].mean(),mfcc[12].mean(),mfcc[13].mean(),mfcc[14].mean(),mfcc[15].mean(),mfcc[16].mean(),
                  mfcc[17].mean(),mfcc[18].mean(),mfcc[19].mean()]] ,
                  columns=['pitchMean' ,'pitchStdev', 'hnr', 'chroma_stft' ,'rmse' ,'spectral_centroid' ,
                 'spectral_bandwidth', 'rolloff', 'localJitter', 'localabsoluteJitter' ,'rapJitter', 'ppq5Jitter' ,'ddpJitter' ,
                 'localShimmer' ,'localdbShimmer' ,'aqpq5Shimmer' ,'apq11Shimmer', 'ddaShimmer' ,'formant1Mean' ,'formant2Mean' ,
                 'formant3Mean' ,'formant4Mean', 'mfcc1','mfcc2','mfcc3','mfcc4','mfcc5','mfcc6','mfcc7','mfcc8','mfcc9','mfcc10',
                 'mfcc11','mfcc12','mfcc13','mfcc14','mfcc15','mfcc16','mfcc17','mfcc18','mfcc19','mfcc20']) 
    df.fillna(0)
    hf = h2o.H2OFrame(df)
    #min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
    #X_scale = min_max_scaler.fit_transform(df)
    #saved_model = load_model('ANNModel.h5')
    saved_model = h2o.load_model('DNNH2OModel')
    testPerformance=saved_model.predict(hf)
    #testPerformance=saved_model.predict_classes(X_scale)
    prediction=testPerformance.as_data_frame()
    predict=prediction['predict'][0]
    if predict == 'NoParkinson':
        data = {'df': 'Congrats test is Negative'}
    elif predict == 'Parkinson':
        data = {'df': 'Need to go for deep testing'}
    #predict=testPerformance[0]
    #tt=predict[0]
    #if(tt==1):
     #   data={'df':'Test Positve Go For Deep Testing'}
    #if(tt==0):
     #   data={'df':'Congrats Test is Negative'}
    data = jsonify(data)
    return data
Example #32
def test():
    host_port = os.environ["cloud_ip_port_main"]
    main_uri = "%s/main" % host_port
    username = "******"
    password = "******"

    xgb_host_port = "%s/xgb" % os.environ["cloud_ip_port_xgb"]
    xgb_username = "******"
    xgb_password = "******"

    h2o.connect(url="http://%s" % main_uri, auth=(username, password))

    # hello test
    steam = MockSteam(main_uri, username, password)
    steam.send({"_type": "hello", "_id": "hi_1"})
    hello_response = steam.wait_for_message()
    assert hello_response is not None, "No hello response sent."
    assert "hello_response" == hello_response["_type"]
    assert "hi_1_response" == hello_response["_id"]

    # load data
    name_node = pyunit_utils.hadoop_namenode()
    train = h2o.import_file("hdfs://" + name_node +
                            "/datasets/chicagoCensus.csv")
    x = list(range(0, train.ncol - 1))
    y = train.ncol - 1
    train = train[~train[y].isna(), :]
    model1 = H2OXGBoostEstimator(ntrees=5)
    model2 = H2OXGBoostEstimator(ntrees=5)

    # make sure H2O thinks there is no cluster running
    steam.send(make_stop_req("stop_check"))
    stop_resp = steam.wait_for_message()
    assert stop_resp is not None, "No stop response"
    assert "stopXGBoostClusterConfirmation" == stop_resp["_type"]
    assert stop_resp["allowed"] is not None  # response could be anything here

    # steam does not respond
    model1.start(x=x, y=y, training_frame=train)
    start_req = steam.wait_for_message()
    assert start_req is not None, "No start request sent"
    assert "startXGBoostCluster" == start_req["_type"]
    time.sleep(20)
    try:
        model1.join()
        assert False, "Model train did not fail when steam did not respond"
    except Exception as e:
        print(e)
        assert True, "Jon failed as expected"

    # xgboost happy path
    model1.start(x=x, y=y, training_frame=train)
    start_req_1 = steam.wait_for_message()
    assert start_req_1 is not None, "No start request sent"
    assert "startXGBoostCluster" == start_req_1["_type"]
    steam.send(make_starting_response(start_req_1))
    time.sleep(10)
    steam.send(
        make_started_response(start_req_1, "started", xgb_host_port,
                              xgb_username, xgb_password))
    model1.join()
    steam.send(make_stop_req("stop_01"))
    stop_resp_1 = steam.wait_for_message()
    assert stop_resp_1 is not None, "No stop response"
    assert "stopXGBoostClusterConfirmation" == stop_resp_1["_type"]
    assert "true" == stop_resp_1["allowed"]

    # another train should trigger another cluster start
    model1.start(x=x, y=y, training_frame=train)
    start_req_2 = steam.wait_for_message()
    assert start_req_2 is not None, "No start request sent"
    assert "startXGBoostCluster" == start_req_2["_type"]
    steam.send(
        make_started_response(start_req_2, "started", xgb_host_port,
                              xgb_username, xgb_password))
    model1.join()
    model2.start(x=x, y=y, training_frame=train)
    assert steam.wait_for_message() is None, \
        "Should not send a start request for another job"
    model2.join()
    steam.send(make_stop_req("stop_02"))
    stop_resp_2 = steam.wait_for_message()
    assert stop_resp_2 is not None, "No stop response"
    assert "stopXGBoostClusterConfirmation" == stop_resp_2["_type"]
    assert "true" == stop_resp_2["allowed"]

    # starting of cluster fails
    model1.start(x=x, y=y, training_frame=train)
    start_req_3 = steam.wait_for_message()
    assert start_req_3 is not None, "No start request sent"
    assert "startXGBoostCluster" == start_req_3["_type"]
    steam.send(
        make_started_response(start_req_3,
                              "failed",
                              xgb_host_port,
                              xgb_username,
                              xgb_password,
                              reason="Testing, testing"))
    try:
        model1.join()
        assert False, "Model train did not fail when steam responded with failure"
    except Exception as e:
        print(e)
        assert True, "Jon failed as expected"

    # cleanup
    steam.close()
Example #33
    thresholdAllowed = 0.05
    while fn/(tn+fn) > thresholdAllowed:
        confMtx = m.confusion_matrix(xval=True, thresholds=[f1Threshold]).to_list()
        fn = confMtx[1][0]
        tn = confMtx[1][1]
        #f1Threshold -= (f1Threshold * 0.1 + 0.0001)
        f1Threshold *= 0.9
        if f1Threshold <= 0.00002:
            break
#    print(f1Threshold, round(fn/(fn+tn)*100, 2), '% false negatives')
#    print(m.confusion_matrix(xval=True, thresholds=[f1Threshold]))
    bRate = str(round(confMtx[0][1] / (confMtx[0][0] + confMtx[0][1]) * 100, 1))
    mRate = str(round(confMtx[1][0] / (confMtx[1][0] + confMtx[1][1]) * 100, 1))
    return "At threshold of " + str(round(f1Threshold, 5))+", mRate of " + mRate + "%, bRate of " + bRate+"%"

h2o.connect(ip='xcnd14.comp.nus.edu.sg')

accuracy = []
for i in range(56):
    m = h2o.get_model('grid-dl_model_'+str(i))
    accuracy.append(getAccuracy(m))
#    print(round(m.auc(xval=True), 3), round(m.logloss(xval = True),3), round(m.accuracy(xval=True)[0][1],4))
#    print(i, m.logloss(xval = True), m.auc(xval = True))

for i in range(len(accuracy)):
    print(i, accuracy[i])
    
accuracy = []
for i in [52, 40, 36,24,48,54,44,28,20,53]:
    accuracy.append(getAccuracy(h2o.get_model('grid-dl_model_'+str(i))))
Example #34
    def test_auto_recovery(self):
        name_node = pyunit_utils.hadoop_namenode()
        dataset = "/datasets/iris_wheader.csv"

        ntrees_opts = [100, 120, 130, 140]
        learn_rate_opts = [0.01, 0.02, 0.03, 0.04]
        grid_size = len(ntrees_opts) * len(learn_rate_opts)
        print("max models %s" % grid_size)
        grid_id = "grid_ft_auto_recover"
        hyper_parameters = {
            "learn_rate": learn_rate_opts,
            "ntrees": ntrees_opts
        }

        cluster_1_name = "grid-auto-1-py"
        try:
            cluster_1 = utils.start_cluster(cluster_1_name,
                                            enable_auto_recovery=True,
                                            clean_auto_recovery=True)
            print("initial cluster started at %s" % cluster_1)
            h2o.connect(url=cluster_1)
            train = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
            grid = H2OGridSearch(H2OGradientBoostingEstimator,
                                 grid_id=grid_id,
                                 hyper_params=hyper_parameters)
            bg_train_thread = threading.Thread(target=self._training_thread,
                                               kwargs={
                                                   "grid": grid,
                                                   "train": train
                                               })
            bg_train_thread.start()
            phase_1_models = self._wait_for_model_to_build(grid_id)
            self._print_models("Initial models", phase_1_models)
            assert len(phase_1_models) > 0
            self._check_training_error()
        finally:
            utils.stop_cluster(cluster_1_name)

        cluster_2_name = "grid-auto-2-py"
        try:
            cluster_2 = utils.start_cluster(cluster_2_name,
                                            enable_auto_recovery=True)
            print("cluster resumed at %s, should unblock background thread" %
                  cluster_2)
            phase_2_models = self._wait_for_model_to_build(
                grid_id,
                len(phase_1_models) + 1)
            self._print_models("Recovery #1 models", phase_2_models)
            assert len(phase_2_models) > len(phase_1_models)
            self._check_training_error()
        finally:
            utils.stop_cluster(cluster_2_name)

        cluster_3_name = "grid-auto-3-py"
        try:
            cluster_3 = utils.start_cluster(cluster_3_name,
                                            enable_auto_recovery=True)
            print("cluster resumed at %s, waiting for training to finish" %
                  cluster_3)
            bg_train_thread.join()
            print("models after final run:")
            for x in sorted(grid.model_ids):
                print(x)
            print("Finished grained grid has %d models" % len(grid.model_ids))
            self.assertEqual(len(grid.model_ids), grid_size,
                             "The full grid was not trained.")
            self._check_training_error()
            h2o.connection().close()
        finally:
            utils.stop_cluster(cluster_3_name)
Example #35
# ----common init----
import h2o
import sys
import time
import socket
ip = socket.gethostbyname(socket.gethostname())
# ----1M CSV ----
h2o.connect(ip=sys.argv[1], port=sys.argv[2])
start1Mcsv = time.time()
ds1Mcsv = h2o.import_file("hdfs://" + ip + "/user/hadoop/1M/data1M.csv")
end1Mcsv = time.time()
print('================results for 1M rows===============')
print('HDFS: the import of 1M rows from CSV/HDFS took', end1Mcsv - start1Mcsv, 's')
print('================results for 1M rows===============')
Example #36
datasets = "Audit,Auto,Housing,Iris,Sentiment,Versicolor,Wheat"

with_h2o = False

if __name__ == "__main__":
	if len(sys.argv) > 1:
		datasets = sys.argv[1]
	if len(sys.argv) > 2:
		with_h2o = "H2O" in sys.argv[2]

datasets = datasets.split(",")

if with_h2o:
	h2o.init()
	h2o.connect()

#
# Clustering
#

wheat_X, wheat_y = load_wheat("Wheat")

def kmeans_distance(kmeans, center, X):
	return numpy.sum(numpy.power(kmeans.cluster_centers_[center] - X, 2), axis = 1)

def build_wheat(kmeans, name, with_affinity = True, **pmml_options):
	mapper = DataFrameMapper([
		(wheat_X.columns.values, [ContinuousDomain(dtype = float), IdentityTransformer()])
	])
	scaler = ColumnTransformer([
                                             distribution="multinomial",
                                             seed=1234)
    air_model.train(x=myX, y=targetColumnName,
                    training_frame=encodedTrain, validation_frame=encodedValid)
    variable_importance = air_model._model_json['output']['variable_importances'].as_data_frame()
    # print(variable_importance)

    my_gbm_metrics = air_model.model_performance(encodedTest)
    auc = my_gbm_metrics.auc()
    sum_of_aucs += auc
    print("AUC with none(holdout) for seed: " + str(current_seed) + " = " + str(auc))
  return sum_of_aucs / len(seeds)


if __name__ == "__main__":
  h2o.connect()

  titanic = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

  runs = 1 # Set to a bigger value to get more objective results.

  seeds = random.sample(range(1, 10000), runs)

  without_te = titanic_without_te(titanic, seeds)
  kfold_strategy = titanic_with_te_kfoldstrategy(titanic, seeds)
  loo_strategy = titanic_with_te_loostrategy(titanic, seeds)
  none_strategy = titanic_with_te_nonestrategy(titanic, seeds)

  print("\n\nReport was generated based on average values from " + str(runs) + " runs that depends on the same set of seeds")
  print("AUC without te: " + str(without_te))
  print("AUC with kfold: " + str(kfold_strategy))