def __default_h2o_connect(h2o_context, **kwargs):
    """Connect the H2O Python client to the client endpoint of ``h2o_context``.

    The ``https`` keyword is always derived from the scheme reported by the
    backing Java context; a value supplied by the caller is overridden after
    a warning has been issued.  When the configuration declares a context
    path the connection is made through a full URL, otherwise through the
    client IP and port.

    :param h2o_context: object exposing ``_jhc``, ``_conf``, ``_client_ip``
        and ``_client_port``
    :param kwargs: extra keyword arguments forwarded to ``h2o.connect``
    :return: the value returned by ``h2o.connect``
    """
    if "https" in kwargs:
        warnings.warn("https argument is automatically set up and the specified value will be ignored.")

    java_context = h2o_context._jhc
    schema = java_context.h2oContext().getScheme(java_context.h2oContext()._conf())
    kwargs["https"] = schema == "https"

    # Guard clause: without a context path, plain ip/port addressing is enough.
    if h2o_context._conf.context_path() is None:
        return h2o.connect(ip=h2o_context._client_ip,
                           port=h2o_context._client_port,
                           **kwargs)

    url = "{}://{}:{}/{}".format(schema,
                                 h2o_context._client_ip,
                                 h2o_context._client_port,
                                 h2o_context._conf.context_path())
    return h2o.connect(url=url, **kwargs)
def start():
    """Return the ``h2o`` module connected to a running H2O server.

    A connection with default arguments is attempted first.  If that fails,
    the deployment script ``../bin/deploy_h2o.sh`` is run, the function waits
    three seconds and then connects again; a failure of this second attempt
    propagates to the caller.

    :return: the ``h2o`` module
    """
    print("[INFO] Checking if H2O is already running in the server.")
    try:
        h2o.connect()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit are not mistaken for
        # "H2O is not running" and do not trigger a deployment.
        print("[INFO] H2O is not running. Starting H2O now.")
        # NOTE(review): the script path is relative to the current working
        # directory of the process, not to this file.
        os.system('../bin/deploy_h2o.sh')
        time.sleep(3)  # give the freshly started server a moment to come up
        h2o.connect()
    return h2o
def getOrCreate(spark, conf=None, **kwargs):
    """
    Return an initialized H2OContext, creating the backing Java context if needed.

    When ``conf`` is given it is used as the H2O configuration; otherwise a
    fresh ``H2OConf`` is built from the Spark session.  Passing a
    ``SparkContext`` is still accepted but deprecated: a warning is issued and
    the active ``SparkSession`` is used instead.

    Unless Spark runs in ``cluster`` deploy mode, an ``atexit`` hook is
    registered that calls ``stop_with_jvm()`` on the returned context.

    :param spark: Spark Session (preferred) or Spark Context
    :param conf: optional H2O configuration object
    :param kwargs: extra keyword arguments forwarded to ``h2o.connect``
    :return: H2O Context
    """
    if isinstance(spark, SparkContext):
        warnings.warn(
            "Method H2OContext.getOrCreate with argument of type SparkContext is deprecated and "
            "parameter of type SparkSession is preferred.")
        spark_session = SparkSession.builder.getOrCreate()
    else:
        spark_session = spark

    h2o_context = H2OContext(spark_session)
    jvm = h2o_context._jvm  # JVM gateway
    jspark_session = h2o_context._jspark_session  # Java-side Spark session

    selected_conf = H2OConf(spark_session) if conf is None else conf

    # Create (or fetch) the backing Java H2OContext.
    jhc = jvm.org.apache.spark.h2o.JavaH2OContext.getOrCreate(jspark_session,
                                                              selected_conf._jconf)
    h2o_context._jhc = jhc
    h2o_context._conf = selected_conf
    h2o_context._client_ip = jhc.h2oLocalClientIp()
    h2o_context._client_port = jhc.h2oLocalClientPort()

    # Create the H2O REST API client.
    h2o.connect(ip=h2o_context._client_ip, port=h2o_context._client_port, **kwargs)
    h2o_context.is_initialized = True
    print(h2o_context)

    # Stop H2O automatically for standalone pysparkling scripts so the user
    # does not have to close it explicitly.  This is skipped in cluster deploy
    # mode, where the exit call would be treated by the Spark AM as a failure.
    deploy_mode = spark_session.sparkContext._conf.get("spark.submit.deployMode")
    if deploy_mode != "cluster":
        def _stop_on_exit():
            return h2o_context.stop_with_jvm()

        atexit.register(_stop_on_exit)
    return h2o_context
def __init__(self, ip: str = '', port: str = '', settings_file_name: str = 'settings.ini'):
    """Load settings from an INI file and connect to H2O.

    Every option of the ``MAIN`` section is copied onto the instance as an
    attribute of the same name.  The connection is authenticated with
    ``self.login`` and ``self.password`` (presumably provided by that
    section - verify against the settings file), and H2O progress bars are
    switched off afterwards.

    :param ip: address passed to ``h2o.connect``
    :param port: port passed to ``h2o.connect``
    :param settings_file_name: path of the INI file to read (UTF-8)
    """
    self.config = configparser.ConfigParser()
    self.config.read(settings_file_name, encoding='utf-8')

    main_section = self.config['MAIN']
    for option_name, option_value in main_section.items():
        setattr(self, option_name, option_value)

    credentials = (self.login, self.password)
    h2o.connect(ip=ip, port=port, auth=credentials, verbose=False)
    h2o.no_progress()
def h2o_test_setup(sys_args):
    """Prepare the H2O test environment and execute the selected test.

    Parses ``sys_args`` via ``parse_args``, makes the h2o-py directory
    importable, publishes the run settings on the test utility packages,
    connects to the H2O instance at ``_H2O_IP_``:``_H2O_PORT_`` (with LDAP or
    SPNEGO authentication when configured), clears the cluster and finally
    dispatches to the runner matching the test type.

    :param sys_args: command-line arguments handed to ``parse_args``
    :raises EnvironmentError: if the test is not of type ipynb, pydemo,
        pyunit or pybooklet
    """
    h2o_py_dir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), ".."))

    parse_args(sys_args)

    # Imports are deferred until the h2o-py directory is on sys.path.
    sys.path.insert(1, h2o_py_dir)
    import h2o
    from tests import pyunit_utils, pydemo_utils, pybooklet_utils

    for pkg in (pyunit_utils, pybooklet_utils):
        setattr(pkg, '__on_hadoop__', _ON_HADOOP_)
        setattr(pkg, '__hadoop_namenode__', _HADOOP_NAMENODE_)
        setattr(pkg, '__test_name__', _TEST_NAME_)
        setattr(pkg, '__results_dir__', _RESULTS_DIR_)

    if not (_IS_PYUNIT_ or _IS_IPYNB_ or _IS_PYBOOKLET_ or _IS_PYDEMO_):
        # The exception must be instantiated: ``raise(Cls, msg)`` raises a
        # tuple, which is a TypeError on Python 3 and hides the real message.
        raise EnvironmentError("Unrecognized test type. Must be of type ipynb, pydemo, pyunit, or pybooklet, but got: "
                               "{0}".format(_TEST_NAME_))

    print("[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()),
                               "Connect to h2o on IP: {0} PORT: {1}".format(_H2O_IP_, _H2O_PORT_)))

    # LDAP credentials take precedence over a Kerberos principal.
    auth = None
    if _LDAP_USER_NAME_ is not None and _LDAP_PASSWORD_ is not None:
        auth = (_LDAP_USER_NAME_, _LDAP_PASSWORD_)
    elif _KERB_PRINCIPAL_ is not None:
        from h2o.auth import SpnegoAuth
        auth = SpnegoAuth(service_principal=_KERB_PRINCIPAL_)

    h2o.connect(ip=_H2O_IP_, port=_H2O_PORT_, verbose=False, auth=auth, **_H2O_EXTRA_CONNECT_ARGS_)
    h2o.utils.config.H2OConfigReader.get_config()["general.allow_breaking_changes"] = True

    # rest_log = os.path.join(_RESULTS_DIR_, "rest.log")
    # h2o.start_logging(rest_log)
    # print "[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Started rest logging in: {0}".format(rest_log))

    h2o.log_and_echo("------------------------------------------------------------")
    h2o.log_and_echo("")
    h2o.log_and_echo("STARTING TEST: " + _TEST_NAME_)
    h2o.log_and_echo("")
    h2o.log_and_echo("------------------------------------------------------------")

    h2o.remove_all()

    if _IS_IPYNB_:
        pydemo_utils.ipy_notebook_exec(_TEST_NAME_)
    elif _IS_PYUNIT_:
        pyunit_utils.pyunit_exec(_TEST_NAME_)
    elif _IS_PYBOOKLET_:
        pybooklet_utils.pybooklet_exec(_TEST_NAME_)
    elif _IS_PYDEMO_:
        pydemo_utils.pydemo_exec(_TEST_NAME_)
def __default_h2o_connect(h2o_context, **kwargs):
    """Connect the H2O Python client to the client endpoint of ``h2o_context``.

    The ``https`` flag is always derived from the scheme reported by the
    backing Java context.  A caller-supplied ``https`` keyword is ignored
    after a warning; previously it collided with the explicit ``https=``
    argument and raised ``TypeError`` (duplicate keyword argument).

    :param h2o_context: object exposing ``_jhc``, ``_conf``, ``_client_ip``
        and ``_client_port``
    :param kwargs: extra keyword arguments forwarded to ``h2o.connect``
    :return: the value returned by ``h2o.connect``
    """
    import warnings  # local import keeps this block self-contained

    if "https" in kwargs:
        warnings.warn("https argument is automatically set up and the specified value will be ignored.")

    schema = h2o_context._jhc.h2oContext().getScheme(
        h2o_context._jhc.h2oContext()._conf())
    # Storing the flag in kwargs (instead of passing ``https=`` separately)
    # guarantees the keyword is supplied exactly once.
    kwargs["https"] = schema == "https"

    if h2o_context._conf.context_path() is not None:
        url = "{}://{}:{}/{}".format(schema,
                                     h2o_context._client_ip,
                                     h2o_context._client_port,
                                     h2o_context._conf.context_path())
        return h2o.connect(url=url, **kwargs)
    return h2o.connect(ip=h2o_context._client_ip,
                       port=h2o_context._client_port,
                       **kwargs)
def predict_churn(State, AccountLength, AreaCode, Phone, IntlPlan, VMailPlan,
                  VMailMessage, DayMins, DayCalls, DayCharge, EveMins, EveCalls,
                  EveCharge, NightMins, NightCalls, NightCharge, IntlMins,
                  IntlCalls, IntlCharge, CustServCalls):
    """Score a single customer record with the saved ``AutoML-leader`` model.

    Connects to H2O with default arguments, loads the model from the path
    ``'AutoML-leader'``, wraps the arguments in a one-row frame and formats
    the first three fields of the prediction into a summary string.

    :return: ``"Prediction: ... |Probability to Churn: ... |Probability to Retain: ..."``
    """
    # connect to the model scoring service and open the downloaded model
    h2o.connect()
    churn_model = h2o.load_model(path='AutoML-leader')

    # one-row feature vector; keys are the column names handed to the model
    record = {
        'State': State,
        'Account Length': AccountLength,
        'Area Code': AreaCode,
        'Phone': Phone,
        "Int'l Plan": IntlPlan,
        'VMail Plan': VMailPlan,
        'VMail Message': VMailMessage,
        'Day Mins': DayMins,
        'Day Calls': DayCalls,
        'Day Charge': DayCharge,
        'Eve Mins': EveMins,
        'Eve Calls': EveCalls,
        'Eve Charge': EveCharge,
        'Night Mins': NightMins,
        'Night Calls': NightCalls,
        'Night Charge': NightCharge,
        'Intl Mins': IntlMins,
        'Intl Calls': IntlCalls,
        'Intl Charge': IntlCharge,
        'CustServ Calls': CustServCalls,
    }
    feature_frame = pd.DataFrame(record, index=[0])

    # evaluate the feature vector using the model
    scored = churn_model.predict(h2o.H2OFrame(feature_frame))
    rows = h2o.as_list(scored, use_pandas=False)

    # Row 1 is read here; presumably row 0 holds the header - TODO confirm.
    first_row = rows[1]
    prediction = first_row[0]
    probability_churn = first_row[1]
    probability_retain = first_row[2]

    return "Prediction: {!s} |Probability to Churn: {!s} |Probability to Retain: {!s}".format(
        prediction, probability_churn, probability_retain)
def grid_search(training_df, attribute_property_length):
    """Run a random-forest hyper-parameter grid search on H2O and show the result.

    The first ``attribute_property_length`` columns of ``training_df`` are
    uploaded as an H2OFrame.  Columns ``C1``..``C10`` are used as predictors
    and ``C11`` as the response, so the uploaded frame must contain at least
    eleven columns.  The number of cross-validation folds comes from the
    module-level ``n_splits``.

    :param training_df: object whose ``.values`` is a 2-D array (e.g. a pandas DataFrame)
    :param attribute_property_length: number of leading columns to upload
    """
    h2o.init()
    h2o.connect()

    # The original also sliced out a response vector ``y`` that was never
    # used; the response is selected by column name below instead.
    x = training_df.values[:, 0:attribute_property_length]
    tr_df = h2o.H2OFrame(x)

    training_columns = ['C%d' % idx for idx in range(1, 11)]  # 'C1' .. 'C10'
    response_column = 'C11'
    hyper_parameters = {'ntrees': [15, 20, 25], 'max_depth': [15, 20]}

    random_plus_manual = H2OGridSearch(
        H2ORandomForestEstimator(nfolds=n_splits), hyper_parameters)
    random_plus_manual.train(x=training_columns,
                             y=response_column,
                             training_frame=tr_df)
    random_plus_manual.show()
def __default_h2o_connect(h2o_context, **kwargs):
    """Connect the H2O Python client to the client endpoint of ``h2o_context``.

    The ``https`` keyword is always computed from the scheme stored in the
    Java context's configuration; a caller-supplied value triggers a warning
    and is then overridden.  A configured context path switches the call to
    URL-based addressing.

    :param h2o_context: object exposing ``_jhc``, ``_conf``, ``_client_ip``
        and ``_client_port``
    :param kwargs: extra keyword arguments forwarded to ``h2o.connect``
    :return: the value returned by ``h2o.connect``
    """
    if "https" in kwargs:
        warnings.warn(
            "https argument is automatically set up and the specified value will be ignored."
        )

    schema = h2o_context._jhc.h2oContext().getConf().getScheme()
    kwargs["https"] = schema == "https"

    # Guard clause: plain ip/port addressing when no context path is set.
    if h2o_context._conf.context_path() is None:
        return h2o.connect(ip=h2o_context._client_ip,
                           port=h2o_context._client_port,
                           **kwargs)

    url = "{}://{}:{}/{}".format(schema,
                                 h2o_context._client_ip,
                                 h2o_context._client_port,
                                 h2o_context._conf.context_path())
    return h2o.connect(url=url, **kwargs)
def churn_predict_batch(batchFile):
    """Score every row of ``batchFile`` and upload the result as ``predictions.csv``.

    The file is imported into H2O, scored with the model stored at the
    module-level ``downloaded_model`` path, and the predictions are appended
    column-wise to the input before being downloaded locally and handed to
    ``upload_file_to_s3``.

    :param batchFile: path or URL accepted by ``h2o.import_file``
    :return: a fixed success message containing the S3 location
    """
    output_name = 'predictions.csv'

    # connect to the model scoring service
    h2o.connect(verbose=False)

    # load the user-specified file, then open the downloaded model
    batch_frame = h2o.import_file(batchFile)
    churn_model = h2o.load_model(path=downloaded_model)

    # score the batch and attach the predictions to the input columns
    scored = batch_frame.cbind(churn_model.predict(batch_frame))

    h2o.download_csv(scored, output_name)
    upload_file_to_s3(output_name)

    return "Predictions saved https://s3-us-west-1.amazonaws.com/dsclouddata/home/jupyter/predictions.csv"
def get_h2o_predictions(training_dataset, testing_dataset, attribute_property_length):
    """Train an H2O random forest and return thresholded predictions.

    Both datasets are converted with ``RandomForest.get_property_data_frame``
    (no one-hot encoding) and their first ``attribute_property_length``
    columns are uploaded to H2O.  Columns ``C1``..``C22`` are the predictors
    and ``C23`` is the response, so the uploaded frames must contain at least
    23 columns.  ``num_trees``, ``max_depth`` and ``n_splits`` are read from
    module scope.

    :param training_dataset: input for ``RandomForest.get_property_data_frame``
    :param testing_dataset: input for ``RandomForest.get_property_data_frame``
    :param attribute_property_length: number of leading columns to upload
    :return: tuple ``(predictions_array, probabilities_array)`` where the
        first list holds ``1.0``/``0.0`` (prediction >= 0.5) and the second
        the raw predicted values
    """
    training_df = RandomForest.get_property_data_frame(
        training_dataset, one_hot_encoding=False)
    testing_df = RandomForest.get_property_data_frame(
        testing_dataset, one_hot_encoding=False)

    h2o.init()
    h2o.connect()

    # The original also sliced out a response vector ``y`` that was never
    # used; the response is selected by column name below instead.
    x = training_df.values[:, 0:attribute_property_length]
    x_test = testing_df.values[:, 0:attribute_property_length]
    ts_df = h2o.H2OFrame(x_test)
    tr_df = h2o.H2OFrame(x)

    training_columns = ['C%d' % idx for idx in range(1, 23)]  # 'C1' .. 'C22'
    response_column = 'C23'

    model = H2ORandomForestEstimator(ntrees=num_trees,
                                     max_depth=max_depth,
                                     nfolds=n_splits)
    model.train(x=training_columns,
                y=response_column,
                training_frame=tr_df,
                validation_frame=ts_df)

    predictions = model.predict(ts_df)
    model.show()
    print(model.varimp(True))

    predictions_array = []
    probabilities_array = []
    for row in predictions.as_data_frame().values.tolist():
        score = row[0]
        predictions_array.append(1.0 if score >= 0.5 else 0.0)
        probabilities_array.append(score)
    return predictions_array, probabilities_array
def h2oconnect():
    """
    Python API test: h2o.connect(server=None, url=None, ip=None, port=None, https=None,
    verify_ssl_certificates=None, auth=None, proxy=None,cookies=None, verbose=True)

    Two connection attempts are made, one by ip/port and one by url.  Each
    must either yield an H2OConnection or fail with an error whose first
    argument contains 'Could not establish link'.
    """
    ipA = "127.0.0.1"
    portN = "54321"
    urlS = "http://127.0.0.1:54321"

    attempts = (
        dict(ip=ipA, port=portN, verbose=True),
        dict(url=urlS, https=True, verbose=True),
    )
    for connect_args in attempts:
        try:
            connection = h2o.connect(**connect_args)  # pass if no connection issue
            assert_is_type(connection, H2OConnection)
        except Exception as e:
            # port number may not match. Make sure the right error message is returned
            assert 'Could not establish link' in e.args[0], "h2o.connect command is not working."
def test_cacert_in_config():
    """Check that a ``cacert`` entry in the connect config is honoured.

    Without ``cacert`` the connection to the self-signed host must fail
    certificate verification.  With the matching CA certificate the failure
    must instead be an HTTP 404.
    """
    cfg = {
        "ip": "self-signed.badssl.com",
        "port": 443,
        "verify_ssl_certificates": True,
        "https": True,
    }

    try:
        h2o.connect(config=cfg)
    except H2OConnectionError as err:
        assert "CERTIFICATE_VERIFY_FAILED" in str(err)
    else:
        # connecting without the CA certificate must not succeed
        assert False

    cfg["cacert"] = pyunit_utils.locate("smalldata/certs/badssl-cacert-2020.pem")

    try:
        h2o.connect(config=cfg)
    except H2OConnectionError as err:
        # any response is a good response - TLS handshake was successful
        # which proves the certificate was used
        assert "HTTP 404 Not Found" in str(err)
    else:
        assert False
def approve_loan(Loan_Amount, Term, Interest_Rate, Employment_Years,
                 Home_Ownership, Annual_Income, Verification_Status,
                 Loan_Purpose, State, Debt_to_Income, Delinquent_2yr,
                 Revolving_Cr_Util, Total_Accounts, Longest_Credit_Length):
    """Score a single loan application with the saved DRF model.

    Connects to H2O with default arguments, loads the model from the path
    ``'DRF_model_1496459915419_4'``, wraps the arguments in a one-row frame
    and formats the first three fields of the prediction into a summary
    string.

    :return: ``"Prediction: ... |Probability of Bad Loan: ... |Probability of Good Loan: ..."``
    """
    # connect to the model scoring service and open the downloaded model
    h2o.connect()
    loan_model = h2o.load_model(path='DRF_model_1496459915419_4')

    # one-row feature vector; keys are the column names handed to the model
    application = {
        'Loan_Amount': Loan_Amount,
        'Term': Term,
        'Interest_Rate': Interest_Rate,
        'Employment_Years': Employment_Years,
        'Home_Ownership': Home_Ownership,
        'Annual_Income': Annual_Income,
        'Verification_Status': Verification_Status,
        'Loan_Purpose': Loan_Purpose,
        'State': State,
        'Debt_to_Income': Debt_to_Income,
        'Delinquent_2yr': Delinquent_2yr,
        'Revolving_Cr_Util': Revolving_Cr_Util,
        'Total_Accounts': Total_Accounts,
        'Longest_Credit_Length': Longest_Credit_Length,
    }
    feature_frame = pd.DataFrame(application, index=[0])

    # evaluate the feature vector using the model
    scored = loan_model.predict(h2o.H2OFrame(feature_frame))
    rows = h2o.as_list(scored, use_pandas=False)

    # Row 1 is read here; presumably row 0 holds the header - TODO confirm.
    first_row = rows[1]
    prediction = first_row[0]
    probability_bad = first_row[1]
    probability_good = first_row[2]

    return "Prediction: {!s} |Probability of Bad Loan: {!s} |Probability of Good Loan: {!s}".format(
        prediction, probability_bad, probability_good)
def h2o_test_setup(sys_args):
    """Prepare the H2O test environment and execute the selected test.

    Parses ``sys_args`` via ``parse_args``, makes the h2o-py directory
    importable, publishes the run settings on the test utility packages,
    connects to the H2O instance at ``_H2O_IP_``:``_H2O_PORT_``, clears the
    cluster and finally dispatches to the runner matching the test type.

    :param sys_args: command-line arguments handed to ``parse_args``
    :raises EnvironmentError: if the test is not of type ipynb, pydemo,
        pyunit or pybooklet
    """
    h2o_py_dir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), ".."))

    parse_args(sys_args)

    # Imports are deferred until the h2o-py directory is on sys.path.
    sys.path.insert(1, h2o_py_dir)
    import h2o
    from tests import pyunit_utils, pydemo_utils, pybooklet_utils

    set_pyunit_pkg_attrs(pyunit_utils)
    set_pybooklet_pkg_attrs(pybooklet_utils)

    if not (_IS_PYUNIT_ or _IS_IPYNB_ or _IS_PYBOOKLET_ or _IS_PYDEMO_):
        # The exception must be instantiated: ``raise(Cls, msg)`` raises a
        # tuple, which is a TypeError on Python 3 and hides the real message.
        raise EnvironmentError("Unrecognized test type. Must be of type ipynb, pydemo, pyunit, or pybooklet, but got: "
                               "{0}".format(_TEST_NAME_))

    print("[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()),
                               "Connect to h2o on IP: {0} PORT: {1}".format(_H2O_IP_, _H2O_PORT_)))
    h2o.connect(ip=_H2O_IP_, port=_H2O_PORT_, verbose=False)
    h2o.utils.config.H2OConfigReader.get_config()["general.allow_breaking_changes"] = True

    # rest_log = os.path.join(_RESULTS_DIR_, "rest.log")
    # h2o.start_logging(rest_log)
    # print "[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Started rest logging in: {0}".format(rest_log))

    h2o.log_and_echo("------------------------------------------------------------")
    h2o.log_and_echo("")
    h2o.log_and_echo("STARTING TEST: " + _TEST_NAME_)
    h2o.log_and_echo("")
    h2o.log_and_echo("------------------------------------------------------------")

    h2o.remove_all()

    if _IS_IPYNB_:
        pydemo_utils.ipy_notebook_exec(_TEST_NAME_)
    elif _IS_PYUNIT_:
        pyunit_utils.pyunit_exec(_TEST_NAME_)
    elif _IS_PYBOOKLET_:
        pybooklet_utils.pybooklet_exec(_TEST_NAME_)
    elif _IS_PYDEMO_:
        pydemo_utils.pydemo_exec(_TEST_NAME_)
def getOrCreate(spark, conf=None, **kwargs):
    """
    Return an initialized H2OContext, creating the backing Java context if needed.

    When ``conf`` is given it is used as the H2O configuration; otherwise a
    fresh ``H2OConf`` is built from the Spark session.  Passing a
    ``SparkContext`` is still accepted but deprecated: a warning is issued and
    the active ``SparkSession`` is used instead.

    An ``atexit`` hook calling ``stop_with_jvm()`` on the returned context is
    always registered.

    :param spark: Spark Session (preferred) or Spark Context
    :param conf: optional H2O configuration object
    :param kwargs: extra keyword arguments forwarded to ``h2o.connect``
    :return: H2O Context
    """
    if isinstance(spark, SparkContext):
        warnings.warn(
            "Method H2OContext.getOrCreate with argument of type SparkContext is deprecated and "
            "parameter of type SparkSession is preferred.")
        spark_session = SparkSession.builder.getOrCreate()
    else:
        spark_session = spark

    h2o_context = H2OContext(spark_session)
    jvm = h2o_context._jvm  # JVM gateway
    jsc = h2o_context._jsc  # JavaSparkContext

    selected_conf = H2OConf(spark_session) if conf is None else conf

    # Create (or fetch) the backing Java H2OContext.
    jhc = jvm.org.apache.spark.h2o.JavaH2OContext.getOrCreate(jsc, selected_conf._jconf)
    h2o_context._jhc = jhc
    h2o_context._conf = selected_conf
    h2o_context._client_ip = jhc.h2oLocalClientIp()
    h2o_context._client_port = jhc.h2oLocalClientPort()

    # Create the H2O REST API client.
    h2o.connect(ip=h2o_context._client_ip, port=h2o_context._client_port, **kwargs)
    h2o_context.is_initialized = True

    # Stop H2O when a standalone pysparkling script ends without the user
    # closing it explicitly.
    def _stop_on_exit():
        return h2o_context.stop_with_jvm()

    atexit.register(_stop_on_exit)
    return h2o_context
def __h2o_connect(h2o_context):
    """Connect the H2O Python client using settings taken from ``h2o_context``.

    The URL is assembled from the scheme of the Java configuration plus the
    client IP and port, with the context path appended when one is
    configured.  HTTPS usage, certificate verification and (if both user
    name and password are set) basic credentials are taken from the Python
    configuration object.

    :param h2o_context: object exposing ``_jhc``, ``_conf``, ``_client_ip``
        and ``_client_port``
    :return: the value returned by ``h2o.connect``
    """
    schema = h2o_context._jhc.getConf().getScheme()
    conf = h2o_context._conf

    connect_options = {
        "https": schema == "https",
        "verify_ssl_certificates": conf.verifySslCertificates(),
    }
    # Credentials are only sent when both parts are present.
    if conf.userName() and conf.password():
        connect_options["auth"] = (conf.userName(), conf.password())

    url = "{}://{}:{}".format(schema, h2o_context._client_ip, h2o_context._client_port)
    if conf.contextPath() is not None:
        url = "{}/{}".format(url, conf.contextPath())

    return h2o.connect(url=url, **connect_options)
def h2oconnect():
    """
    Python API test: h2o.connect(server=None, url=None, ip=None, port=None, https=None,
    verify_ssl_certificates=None, auth=None, proxy=None, cluster_id=None, cookies=None, verbose=True)

    Two connection attempts are made, one by ip/port and one by url.  Each
    must either yield an H2OConnection or fail with an error whose first
    argument contains 'Could not establish link'.
    """
    ipA = "127.0.0.1"
    portN = "54321"
    urlS = "http://127.0.0.1:54321"

    def _connect_and_check(**connect_kwargs):
        # One attempt: success must produce an H2OConnection, failure must
        # carry the expected error message.
        try:
            connection = h2o.connect(**connect_kwargs)  # pass if no connection issue
            assert_is_type(connection, H2OConnection)
        except Exception as e:
            # port number may not match. Make sure the right error message is returned
            assert 'Could not establish link' in e.args[0], "h2o.connect command is not working."

    _connect_and_check(ip=ipA, port=portN, verbose=True)
    _connect_and_check(url=urlS, https=True, verbose=True)
def do_h2o_kmeans(self, dataset, server_url):
    """Cluster ``dataset`` with k-means on an H2O server.

    A connection to ``server_url`` is opened first.  The data is then
    uploaded, an ``H2OKMeansEstimator`` configured from ``self.config.NITER``
    and ``self.config.NCLUSTERS`` is trained on it (and stored in
    ``self.model``), and the ``predict`` column for the same data is
    returned.

    :param dataset: input data - term document matrix
    :param server_url: URL of the H2O server instance on which clustering runs
    :return: the ``predict`` column converted with
        ``as_data_frame(use_pandas=False, header=False)``
    :raises ConnectionError: if an ``H2OConnectionError`` occurs at any step
    """
    try:
        # establish connection to H2O server
        h2o.connect(url=server_url, verbose=False)
        logging.info("connected to H2O server")

        frame = h2o.H2OFrame(python_obj=dataset)
        self.model = H2OKMeansEstimator(
            max_iterations=self.config.NITER,
            k=self.config.NCLUSTERS,
            init="PlusPlus",
            standardize=False,
        )
        self.model.train(training_frame=frame)
        logging.info("modelling complete. predicting cluster membership")

        membership = self.model.predict(frame)["predict"]
        return membership.as_data_frame(use_pandas=False, header=False)
    except H2OConnectionError:
        logging.error("unable to connect to H2O server @ {0}".format(server_url))
        raise ConnectionError("unable to connect to H2O server. check if server is running at specified URL")
recognize bugs in h2o.init() for this mode of operation. For this ticket, I think we should create a set of tests that check that h2o.init() is successful for each OS/client interface combination. Below is the test that will be implemented: """ from __future__ import print_function import sys sys.path.insert(0, "../..") import h2o PORT = 55330 # Check whether there is already an instance running at the specified port, and if so shut it down. try: conn = h2o.connect(port=PORT) conn.shutdown(prompt=False) except h2o.H2OConnectionError: pass # Now start a new H2O server and connect to it. server = h2o.start(port=str(PORT) + "+") conn = h2o.connect(server=server) # Get if cluster is up (True) or not (False) cluster_up = conn.cluster_is_up() # Check if cluster is healthy cluster_healthy = all(node["healthy"] for node in conn.info().nodes)
import h2o from h2o.estimators.gbm import H2OGradientBoostingEstimator as gbm from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glm from h2o.estimators.deeplearning import H2ODeepLearningEstimator as dlm h2o.connect(ip="localhost", port="54535") bank = h2o.import_file( path= "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/bank-full.csv.zip" ) arr = h2o.import_file( path= "http://s3.amazonaws.com/h2o-public-test-data/smalldata/flow_examples/arrhythmia.csv.gz" ) model = glm(family="binomial", model_id="first") multi = glm(family="multinomial", model_id="multinom") valid = glm(family="multinomial", model_id="valimon") regr = glm(model_id="regress") grad = gbm(model_id="gradi") #bank data columns #1 for multinomial #16 for binomial #5 for regression model.train(y=16, x=range(16), training_frame=bank) multi.train(y=1, x=[i for i in range(17) if i != 1], training_frame=bank) model = glm(family="binomial", model_id="second") model.train(y=16,
def __default_h2o_connect(h2o_context, **kwargs):
    """Connect the H2O Python client to the client node of ``h2o_context``.

    :param h2o_context: object exposing ``_client_ip`` and ``_client_port``
    :param kwargs: extra keyword arguments forwarded to ``h2o.connect``
    :return: the value returned by ``h2o.connect``
    """
    client_ip = h2o_context._client_ip
    client_port = h2o_context._client_port
    return h2o.connect(ip=client_ip, port=client_port, **kwargs)
import os import h2o import pandas as pd import uuid from .meta import questions import copy h2o.connect(ip="localhost") class Coach: def __init__(self): self.model = h2o.load_model(os.environ.get( "COACH_MODEL")) # import_mojo(os.environ.get("COACH_MODEL")) self._get_actionable_q() def get_model_predictors(self): var_imp = self.model._model_json['output'][ 'variable_importances'].as_data_frame() return var_imp["variable"] def _prepare_evaluation(self, df_input): all_predictors = self.get_model_predictors() for p in all_predictors: if not p in df_input: print("Warning: %s missing in inputVector" % p) else: try:
For this ticket, I think we should create a set of tests that check that h2o.init() is successful for each OS/client interface combination. """ from __future__ import print_function import sys sys.path.insert(0, "../..") import h2o from h2o.backend import H2OLocalServer from h2o.exceptions import H2OConnectionError PORT = 55330 # Check whether there is already an instance running at the specified port, and if so shut it down. try: conn = h2o.connect(ip="localhost", port=PORT) conn.cluster.shutdown(prompt=False) except H2OConnectionError: pass # The server takes some time to shut down, so try different ports print("Start a server with max_mem_size = 1Gb") h2o.init(max_mem_size="1g", port=10101, strict_version_check=False) h2o.cluster().shutdown() print("Starting a server with min_mem_size = 314Mb") h2o.init(min_mem_size="314M", port=20202, strict_version_check=False) h2o.cluster().shutdown() print("Starting a server explicitly") # Now start a new H2O server and connect to it.
def test_grid_reload(self):
    """Verify that a grid search interrupted on one cluster can be resumed on another.

    A GBM grid is started on a first cluster with a recovery directory.  Once
    some models exist (or after 20 polls of 5 seconds) the connection is
    closed and the cluster stopped.  A second cluster then loads the grid
    from the recovery directory, resumes it, and the test asserts that the
    full grid (all hyper-parameter combinations) has been trained.
    """
    name_node = utils.hadoop_namenode()
    work_dir = utils.get_workdir()
    dataset = "/datasets/iris_wheader.csv"

    ntrees_opts = [100, 120, 130, 140]
    learn_rate_opts = [0.01, 0.02, 0.03, 0.04]
    grid_size = len(ntrees_opts) * len(learn_rate_opts)
    print("max models %s" % grid_size)

    grid_id = "grid_ft_resume"
    hyper_parameters = {
        "learn_rate": learn_rate_opts,
        "ntrees": ntrees_opts,
    }

    # ---- phase 1: start the grid on the first cluster and interrupt it ----
    first_cluster_name = "grid1-py"
    first_recovery_dir = work_dir + "/recovery1"
    try:
        first_cluster_url = utils.start_cluster(first_cluster_name)
        h2o.connect(url=first_cluster_url)
        train = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
        grid = H2OGridSearch(
            H2OGradientBoostingEstimator,
            grid_id=grid_id,
            hyper_params=hyper_parameters,
            recovery_dir=first_recovery_dir,
        )
        print("starting initial grid and sleeping...")
        grid.start(x=list(range(4)), y=4, training_frame=train)

        partial_grid = None
        for _ in range(20):
            # stop polling as soon as at least one model has been trained
            if partial_grid is not None and len(partial_grid.model_ids) != 0:
                break
            time.sleep(5)  # give it time to train some models
            try:
                partial_grid = h2o.get_grid(grid_id)
            except IndexError:
                print("no models trained yet")
        print("done sleeping")
        h2o.connection().close()
    finally:
        utils.stop_cluster(first_cluster_name)

    # ---- phase 2: reload the grid on a second cluster and finish it ----
    second_cluster_name = "grid2-py"
    second_recovery_dir = work_dir + "/recovery2"
    try:
        second_cluster_url = utils.start_cluster(second_cluster_name)
        h2o.connect(url=second_cluster_url)
        loaded = h2o.load_grid("%s/%s" % (first_recovery_dir, grid_id),
                               load_params_references=True)
        print("models after first run:")
        for model_id in sorted(loaded.model_ids):
            print(model_id)

        loaded.resume(recovery_dir=second_recovery_dir)
        print("models after second run:")
        for model_id in sorted(loaded.model_ids):
            print(model_id)

        print("Newly grained grid has %d models" % len(loaded.model_ids))
        self.assertEqual(len(loaded.model_ids), grid_size, "The full grid was not trained.")
        h2o.connection().close()
    finally:
        utils.stop_cluster(second_cluster_name)
recognize bugs in h2o.init() for this mode of operation. For this ticket, I think we should create a set of tests that check that h2o.init() is successful for each OS/client interface combination. Below is the test that will be implemented: """ from __future__ import print_function import sys sys.path.insert(0, "../..") import h2o PORT = 55330 # Check whether there is already an instance running at the specified port, and if so shut it down. try: conn = h2o.connect(port=PORT) conn.shutdown(prompt=False) except h2o.H2OConnectionError: pass # Now start a new H2O server and connect to it. server = h2o.start(port=str(PORT) + "+") conn = h2o.connect(server=server) # Get if cluster is up (True) or not (False) cluster_up = conn.cluster_is_up() # Check if cluster is healthy cluster_healthy = all(node["healthy"] for node in conn.info().nodes) # Logical test to see if status is healthy or not
try: s = socket.socket() s.connect(("127.0.0.1", 54321)) h2o_launched = True except Exception as e: time.sleep(6) if i % 5 == 0: print("Attempt {}: H2O-3 not running yet...".format(i)) if i > 30: raise Exception("""Could not connect to H2O Cluster in {} attempts Last Error: {}""".format(i, e)) i += 1 finally: s.close() h2o.connect(url="http://127.0.0.1:54321") class ScoringService(object): model = None # Where we keep the model when it's loaded @classmethod def get_model(cls): """Get the model object for this instance, loading it if it's not already loaded.""" if cls.model is None: for file in os.listdir(model_path): # Assumes that 'AutoML' is somewhere in the filename of a # model that's been generated. We just load the first model # that satisfies this constraint, so caveat emptor if you've # run the 'train' script multiple times - this may still load
#! /usr/bin/python import h2o import sys import os import pandas as pd cwd = os.getcwdu() dataset = cwd + "/dataset/unsw-nb15_mereged.zip" h2o.connect(ip="localhost", port="54321") columns_types = [ "enum", "numeric", "enum", "numeric", "enum", "enum", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "enum", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "time", "time", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "numeric", "enum", "enum" ] data = h2o.import_file( path=dataset, destination_frame="unsw-nb15_mereged", col_types=columns_types, ) response_column = "label" predictor_columns = data.names predictor_columns.remove(response_column) predictor_columns.remove("srcip") predictor_columns.remove("dstip") predictor_columns.remove("attack_cat") # training, test = data.split_frame(
import pandas as pd import h2o from h2o.estimators.gbm import H2OGradientBoostingEstimator con = h2o.connect(url='http://192.168.5.208:54321/') csv_data = pd.read_csv('股票数据/处理后数据/processed_601857.csv', encoding='utf8') csv_data['earn'] = csv_data['20_closing_price'] > csv_data['closing_price']*1.2 csv_data_ = h2o.H2OFrame(csv_data) model = H2OGradientBoostingEstimator(model_id='stock_601857', nfolds=10, distribution = "bernoulli", ntrees = 2000, max_depth = 10, learn_rate = 0.4, histogram_type = "UniformAdaptive", min_split_improvement = 0.000001, balance_classes = False, seed = 52345, stopping_rounds = 5, stopping_metric = 'AUC', stopping_tolerance = 0.001, col_sample_rate = 0.6, col_sample_rate_per_tree = 0.6, col_sample_rate_change_per_level = 0.6, sample_rate = 0.85, min_rows = 100, ) traning_data, test_data = csv_data_.split_frame(ratios=[0.8], destination_frames=["train_frame", "test_data"]) csv_data.keys() model.train(x=['closing_price', 'upping_ratio', 'changing_ratio', 'volume', 'upping_ratio1', 'upping_ratio2', 'upping_ratio3', 'upping_ratio4', 'upping_ratio5', 'A_index_closing_price', 'A_index_upping_money', 'A_index_upping_ratio', 'A_index_volume', 'A_index_volume_money', 'B_index_closing_price', 'B_index_upping_money', 'B_index_upping_ratio', 'B_index_volume', 'B_index_volume_money', 'top50_index_closing_price', 'top50_index_upping_money', 'top50_index_upping_ratio', 'top50_index_volume', 'top50_index_volume_money', 'sh_index_closing_price', 'sh_index_upping_money',
OS/client interface combination. Below is the test that will be implemented: """ from __future__ import print_function import sys sys.path.insert(0, "../..") import h2o from h2o.backend import H2OLocalServer from h2o.exceptions import H2OConnectionError PORT = 55330 # Check whether there is already an instance running at the specified port, and if so shut it down. try: conn = h2o.connect(ip="localhost", port=PORT) conn.shutdown_server(prompt=False) except H2OConnectionError: pass # Now start a new H2O server and connect to it. server = H2OLocalServer.start(port=str(PORT) + "+") conn = h2o.connect(server=server) # Get if cluster is up (True) or not (False) cluster_up = conn.cluster_is_up() # Check if cluster is healthy cluster_healthy = all(node["healthy"] for node in conn.info().nodes) # Logical test to see if status is healthy or not
def processAudio():
    """Extract voice features from ``Audio5780917.wav`` and classify them with H2O.

    Pitch, harmonicity, jitter, shimmer and formant measurements are taken
    with Praat (via ``parselmouth``); spectral features and 20 MFCC means are
    taken with ``librosa``.  The resulting one-row frame is scored by the
    saved model ``DNNH2OModel`` and the predicted label is translated into a
    short message.

    :return: the result of ``jsonify`` applied to ``{'df': <message>}``
    :raises ValueError: if the model predicts a label other than
        ``'NoParkinson'`` or ``'Parkinson'``
    """
    import librosa
    import numpy as np
    import pandas as pd
    import parselmouth
    from parselmouth.praat import call
    import h2o

    h2o.init(ip="127.0.0.1", max_mem_size_GB=2)
    h2o.connect()

    f0min, f0max = 70, 600
    unit = "Hertz"
    wave_file = 'Audio5780917.wav'

    y, sr = librosa.load(wave_file)
    sound = parselmouth.Sound(wave_file)
    print("Processing {}...".format(wave_file))

    # --- pitch and harmonicity ---
    pitch = call(sound, "To Pitch", 0.0, f0min, f0max)  # Praat pitch object
    pitchMean = call(pitch, "Get mean", 0, 0, unit)
    PitchStdev = call(pitch, "Get standard deviation", 0, 0, unit)
    harmonicity = call(sound, "To Harmonicity (cc)", 0.01, f0min, 0.1, 1.0)
    hnr = call(harmonicity, "Get mean", 0, 0)

    # --- jitter and shimmer, keyed by the model's column names ---
    pointProcess = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)
    jitter_commands = [
        ('localJitter', "Get jitter (local)"),
        ('localabsoluteJitter', "Get jitter (local, absolute)"),
        ('rapJitter', "Get jitter (rap)"),
        ('ppq5Jitter', "Get jitter (ppq5)"),
        ('ddpJitter', "Get jitter (ddp)"),
    ]
    # The apq3 shimmer was computed by the original code but never passed to
    # the model, so it is not measured here.
    shimmer_commands = [
        ('localShimmer', "Get shimmer (local)"),
        ('localdbShimmer', "Get shimmer (local_dB)"),
        ('aqpq5Shimmer', "Get shimmer (apq5)"),
        ('apq11Shimmer', "Get shimmer (apq11)"),
        ('ddaShimmer', "Get shimmer (dda)"),
    ]
    jitter_features = [
        (name, np.round(call(pointProcess, command, 0, 0, 0.0001, 0.02, 1.3), 6))
        for name, command in jitter_commands
    ]
    shimmer_features = [
        (name, np.round(call([sound, pointProcess], command, 0, 0, 0.0001, 0.02, 1.3, 1.6), 6))
        for name, command in shimmer_commands
    ]

    # --- formants, measured only at glottal pulses ---
    formants = call(sound, "To Formant (burg)", 0.0025, 5, 5000, 0.025, 50)
    numPoints = call(pointProcess, "Get number of points")
    formant_numbers = (1, 2, 3, 4)
    formant_values = {number: [] for number in formant_numbers}
    for point in range(1, numPoints + 1):  # Praat indices start at 1
        t = call(pointProcess, "Get time from index", point)
        for number in formant_numbers:
            value = call(formants, "Get value at time", number, t, 'Hertz', 'Linear')
            if str(value) != 'nan':  # skip pulses where the formant is undefined
                formant_values[number].append(value)
    formant_features = [
        ('formant%dMean' % number, np.mean(formant_values[number]))
        for number in formant_numbers
    ]

    # --- spectral features ---
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rmse = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    mfcc_features = [('mfcc%d' % (i + 1), mfcc[i].mean()) for i in range(20)]

    # Column order matches the original frame handed to the model.
    features = [
        ('pitchMean', pitchMean),
        ('pitchStdev', PitchStdev),
        ('hnr', hnr),
        ('chroma_stft', np.mean(chroma_stft)),
        ('rmse', np.mean(rmse)),
        ('spectral_centroid', np.mean(spec_cent)),
        ('spectral_bandwidth', np.mean(spec_bw)),
        ('rolloff', np.mean(rolloff)),
    ]
    features += jitter_features + shimmer_features + formant_features + mfcc_features

    df = pd.DataFrame([[value for _, value in features]],
                      columns=[name for name, _ in features])
    # fillna returns a new frame; the original discarded the result, so
    # missing values were never actually replaced.
    df = df.fillna(0)

    hf = h2o.H2OFrame(df)
    saved_model = h2o.load_model('DNNH2OModel')
    prediction = saved_model.predict(hf).as_data_frame()
    predict = prediction['predict'][0]

    if predict == 'NoParkinson':
        data = {'df': 'Congrats test is Negative'}
    elif predict == 'Parkinson':
        data = {'df': 'Need to go for deep testing'}
    else:
        # Previously this fell through to an UnboundLocalError on ``data``.
        raise ValueError("Unexpected prediction label: {!r}".format(predict))

    return jsonify(data)
def _expect_join_failure(model, failure_message):
    """Join ``model`` and require that the join raises.

    The check lives in the ``else`` branch on purpose: an ``assert`` placed
    inside the ``try`` body would raise ``AssertionError``, which is itself an
    ``Exception`` and would be swallowed by the handler, so the test could
    never fail.

    :param model: object exposing ``join()`` (the started estimator).
    :param failure_message: message of the ``AssertionError`` raised when
        ``join()`` returns without raising.
    :raises AssertionError: if ``join()`` completes successfully.
    """
    try:
        model.join()
    except Exception as e:
        # Join failed as expected; print the reason for the test log.
        print(e)
    else:
        raise AssertionError(failure_message)


def test():
    """Exercise the XGBoost cluster start/stop message exchange against a MockSteam.

    Scenarios, in order: hello handshake, stop request before any cluster was
    started, start request that is never answered (join must fail), happy
    path, a second start after a stop with a follow-up job that must not send
    another start request, and a start that is answered with "failed"
    (join must fail).

    Reads the ``cloud_ip_port_main`` and ``cloud_ip_port_xgb`` environment
    variables; raises ``KeyError`` if either is missing.
    """
    host_port = os.environ["cloud_ip_port_main"]
    main_uri = "%s/main" % host_port
    username = "******"
    password = "******"
    xgb_host_port = "%s/xgb" % os.environ["cloud_ip_port_xgb"]
    xgb_username = "******"
    xgb_password = "******"

    h2o.connect(url="http://%s" % main_uri, auth=(username, password))

    steam = MockSteam(main_uri, username, password)
    try:
        # hello test
        steam.send({"_type": "hello", "_id": "hi_1"})
        hello_response = steam.wait_for_message()
        assert hello_response is not None, "No hello response sent."
        assert "hello_response" == hello_response["_type"]
        assert "hi_1_response" == hello_response["_id"]

        # load data
        name_node = pyunit_utils.hadoop_namenode()
        train = h2o.import_file("hdfs://" + name_node + "/datasets/chicagoCensus.csv")
        x = list(range(0, train.ncol - 1))
        y = train.ncol - 1
        train = train[~train[y].isna(), :]
        model1 = H2OXGBoostEstimator(ntrees=5)
        model2 = H2OXGBoostEstimator(ntrees=5)

        # make sure H2O thinks there is no cluster running
        steam.send(make_stop_req("stop_check"))
        stop_resp = steam.wait_for_message()
        assert stop_resp is not None, "No stop response"
        assert "stopXGBoostClusterConfirmation" == stop_resp["_type"]
        assert stop_resp["allowed"] is not None  # response could be anything here

        # steam does not respond
        model1.start(x=x, y=y, training_frame=train)
        start_req = steam.wait_for_message()
        assert start_req is not None, "No start request sent"
        assert "startXGBoostCluster" == start_req["_type"]
        time.sleep(20)
        _expect_join_failure(
            model1, "Model train did not fail when steam did not respond")

        # xgboost happy path
        model1.start(x=x, y=y, training_frame=train)
        start_req_1 = steam.wait_for_message()
        assert start_req_1 is not None, "No start request sent"
        assert "startXGBoostCluster" == start_req_1["_type"]
        steam.send(make_starting_response(start_req_1))
        time.sleep(10)
        steam.send(
            make_started_response(start_req_1, "started", xgb_host_port,
                                  xgb_username, xgb_password))
        model1.join()

        steam.send(make_stop_req("stop_01"))
        stop_resp_1 = steam.wait_for_message()
        assert stop_resp_1 is not None, "No stop response"
        assert "stopXGBoostClusterConfirmation" == stop_resp_1["_type"]
        assert "true" == stop_resp_1["allowed"]

        # another train should trigger another cluster start
        model1.start(x=x, y=y, training_frame=train)
        start_req_2 = steam.wait_for_message()
        assert start_req_2 is not None, "No start request sent"
        assert "startXGBoostCluster" == start_req_2["_type"]
        steam.send(
            make_started_response(start_req_2, "started", xgb_host_port,
                                  xgb_username, xgb_password))
        model1.join()

        # a second job right after the first must not send another start request
        model2.start(x=x, y=y, training_frame=train)
        assert steam.wait_for_message(
        ) is None, "Should not sent start request for another job"
        model2.join()

        steam.send(make_stop_req("stop_02"))
        stop_resp_2 = steam.wait_for_message()
        assert stop_resp_2 is not None, "No stop response"
        assert "stopXGBoostClusterConfirmation" == stop_resp_2["_type"]
        assert "true" == stop_resp_2["allowed"]

        # starting of cluster fails
        model1.start(x=x, y=y, training_frame=train)
        start_req_3 = steam.wait_for_message()
        assert start_req_3 is not None, "No start request sent"
        assert "startXGBoostCluster" == start_req_3["_type"]
        steam.send(
            make_started_response(start_req_3, "failed", xgb_host_port,
                                  xgb_username, xgb_password,
                                  reason="Testing, testing"))
        _expect_join_failure(
            model1, "Model train did not fail when steam responded with failure")
    finally:
        # cleanup: close the mock even when an assertion above fails
        steam.close()
thresholdAllowed = 0.05 while fn/(tn+fn) > thresholdAllowed: confMtx = m.confusion_matrix(xval=True, thresholds=[f1Threshold]).to_list() fn = confMtx[1][0] tn = confMtx[1][1] #f1Threshold -= (f1Threshold * 0.1 + 0.0001) f1Threshold *= 0.9 if f1Threshold <= 0.00002: break # print(f1Threshold, round(fn/(fn+tn)*100, 2), '% false negatives') # print(m.confusion_matrix(xval=True, thresholds=[f1Threshold])) bRate = str(round(confMtx[0][1] / (confMtx[0][0] + confMtx[0][1]) * 100, 1)) mRate = str(round(confMtx[1][0] / (confMtx[1][0] + confMtx[1][1]) * 100, 1)) return "At threshold of " + str(round(f1Threshold, 5))+", mRate of " + mRate + "%, bRate of " + bRate+"%" h2o.connect(ip='xcnd14.comp.nus.edu.sg') accuracy = [] for i in range(56): m = h2o.get_model('grid-dl_model_'+str(i)) accuracy.append(getAccuracy(m)) # print(round(m.auc(xval=True), 3), round(m.logloss(xval = True),3), round(m.accuracy(xval=True)[0][1],4)) # print(i, m.logloss(xval = True), m.auc(xval = True)) for i in range(len(accuracy)): print(i, accuracy[i]) accuracy = [] for i in [52, 40, 36,24,48,54,44,28,20,53]: accuracy.append(getAccuracy(h2o.get_model('grid-dl_model_'+str(i))))
def test_auto_recovery(self):
    """Train a GBM grid across three successive clusters with auto recovery enabled.

    Phase 1 starts a cluster (with ``clean_auto_recovery=True``), launches the
    grid training on a background thread and waits until some models exist.
    Phase 2 starts a second cluster and requires that more models appear than
    in phase 1. Phase 3 starts a third cluster, joins the training thread and
    requires the grid to hold one model per hyper-parameter combination.
    Every cluster is stopped in a ``finally`` clause.
    """
    hdfs_host = pyunit_utils.hadoop_namenode()
    iris_path = "/datasets/iris_wheader.csv"
    tree_counts = [100, 120, 130, 140]
    learn_rates = [0.01, 0.02, 0.03, 0.04]
    # Full cartesian grid: one model per (learn_rate, ntrees) pair.
    expected_total = len(tree_counts) * len(learn_rates)
    print("max models %s" % expected_total)
    grid_id = "grid_ft_auto_recover"
    search_space = {"learn_rate": learn_rates, "ntrees": tree_counts}

    # Phase 1: fresh cluster, kick off training in the background.
    first_name = "grid-auto-1-py"
    try:
        first_url = utils.start_cluster(
            first_name, enable_auto_recovery=True, clean_auto_recovery=True)
        print("initial cluster started at %s" % first_url)
        h2o.connect(url=first_url)
        frame = h2o.import_file(path="hdfs://%s%s" % (hdfs_host, iris_path))
        grid = H2OGridSearch(
            H2OGradientBoostingEstimator,
            grid_id=grid_id,
            hyper_params=search_space,
        )
        trainer = threading.Thread(
            target=self._training_thread,
            kwargs=dict(grid=grid, train=frame),
        )
        trainer.start()
        initial_models = self._wait_for_model_to_build(grid_id)
        self._print_models("Initial models", initial_models)
        assert len(initial_models) > 0
        self._check_training_error()
    finally:
        utils.stop_cluster(first_name)

    # Phase 2: a new cluster must make progress beyond phase 1.
    second_name = "grid-auto-2-py"
    try:
        second_url = utils.start_cluster(second_name, enable_auto_recovery=True)
        print("cluster resumed at %s, should unblock background thread" % second_url)
        wanted = len(initial_models) + 1
        recovered_models = self._wait_for_model_to_build(grid_id, wanted)
        self._print_models("Recovery #1 models", recovered_models)
        assert len(recovered_models) > len(initial_models)
        self._check_training_error()
    finally:
        utils.stop_cluster(second_name)

    # Phase 3: let the training run to completion and verify the full grid.
    third_name = "grid-auto-3-py"
    try:
        third_url = utils.start_cluster(third_name, enable_auto_recovery=True)
        print("cluster resumed at %s, waiting for training to finish" % third_url)
        trainer.join()
        print("models after final run:")
        for model_id in sorted(grid.model_ids):
            print(model_id)
        print("Finished grained grid has %d models" % len(grid.model_ids))
        self.assertEqual(
            len(grid.model_ids), expected_total, "The full grid was not trained.")
        self._check_training_error()
        h2o.connection().close()
    finally:
        utils.stop_cluster(third_name)
# ----common init----
# Benchmark script: times the import of a CSV file from HDFS into H2O.
# Usage: <script> <h2o_ip> <h2o_port>
import h2o
import sys
import time
import socket

# Address of the machine running this script; it is used as the HDFS host below.
ip = socket.gethostbyname(socket.gethostname())

# ----1M CSV ----
h2o.connect(ip=sys.argv[1], port=sys.argv[2])
start1Mcsv = time.time()
ds1Mcsv = h2o.import_file("hdfs://" + ip + "/user/hadoop/1M/data1M.csv")
end1Mcsv = time.time()

# Single-argument print() calls behave the same on Python 2 and Python 3;
# the former print statements were a SyntaxError on Python 3.
print('================results for 1M rows===============')
print('HDFS: the import of 1M rows from CSV/HDFS took %s s' % (end1Mcsv - start1Mcsv))
print('================results for 1M rows===============')
datasets = "Audit,Auto,Housing,Iris,Sentiment,Versicolor,Wheat" with_h2o = False if __name__ == "__main__": if len(sys.argv) > 1: datasets = sys.argv[1] if len(sys.argv) > 2: with_h2o = "H2O" in sys.argv[2] datasets = datasets.split(",") if with_h2o: h2o.init() h2o.connect() # # Clustering # wheat_X, wheat_y = load_wheat("Wheat") def kmeans_distance(kmeans, center, X): return numpy.sum(numpy.power(kmeans.cluster_centers_[center] - X, 2), axis = 1) def build_wheat(kmeans, name, with_affinity = True, **pmml_options): mapper = DataFrameMapper([ (wheat_X.columns.values, [ContinuousDomain(dtype = float), IdentityTransformer()]) ]) scaler = ColumnTransformer([
distribution="multinomial", seed=1234) air_model.train(x=myX, y=targetColumnName, training_frame=encodedTrain, validation_frame=encodedValid) variable_importance = air_model._model_json['output']['variable_importances'].as_data_frame() # print(variable_importance) my_gbm_metrics = air_model.model_performance(encodedTest) auc = my_gbm_metrics.auc() sum_of_aucs += auc print("AUC with none(holdout) for seed: " + str(current_seed) + " = " + str(auc)) return sum_of_aucs / len(seeds) if __name__ == "__main__": h2o.connect() titanic = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1) runs = 1 # Set to a bigger value to get more objective resuts. seeds = random.sample(range(1, 10000), runs) without_te = titanic_without_te(titanic, seeds) kfold_strategy = titanic_with_te_kfoldstrategy(titanic, seeds) loo_strategy = titanic_with_te_loostrategy(titanic, seeds) none_strategy = titanic_with_te_nonestrategy(titanic, seeds) print("\n\nReport was generated based on average values from " + str(runs) + " runs that depends on the same set of seeds") print("AUC without te: " + str(without_te)) print("AUC with kfold: " + str(kfold_strategy))