def _get_cluster_status_info_values(self):
    """
    Collect the values displayed in the cluster status table.

    The cached cluster information is refreshed first when it is older than ``REFRESH_INTERVAL``.

    :returns: a list with, in order: uptime, cluster timezone, parsing timezone, version, version age,
        cluster name, number of nodes, free memory, total cores, allowed cores, status text,
        connection url, connection proxy (JSON-encoded), internal security flag, API extensions
        and the Python version.
    """
    info_is_stale = self._retrieved_at + self.REFRESH_INTERVAL < time.time()
    if info_is_stale:
        self._fill_from_h2ocluster(h2o.api("GET /3/Cloud"))

    # Aggregate the per-node figures.
    total_cpus = sum(n["num_cpus"] for n in self.nodes)
    total_allowed_cpus = sum(n["cpus_allowed"] for n in self.nodes)
    total_free_mem = sum(n["free_mem"] for n in self.nodes)
    n_unhealthy = sum(not n["healthy"] for n in self.nodes)

    membership = "locked" if self.locked else "accepting new members"
    if n_unhealthy == 0:
        health = ", healthy"
    else:
        health = ", %d nodes are not healthy" % n_unhealthy
    cluster_status = membership + health

    extensions = self.list_api_extensions()

    uptime = get_human_readable_time(self.cloud_uptime_millis)
    age_marker = "!!!" if self.build_too_old else ""
    version_age = "{} {}".format(self.build_age, age_marker)
    free_memory = get_human_readable_bytes(total_free_mem)
    connection_url = h2o.connection().base_url
    connection_proxy = json.dumps(h2o.connection().proxy)
    python_version = "%d.%d.%d %s" % tuple(sys.version_info[:4])

    return [
        uptime,
        self.cloud_internal_timezone,
        self.datafile_parser_timezone,
        self.version,
        version_age,
        self.cloud_name,
        self.cloud_size,
        free_memory,
        total_cpus,
        total_allowed_cpus,
        cluster_status,
        connection_url,
        connection_proxy,
        self.internal_security_enabled,
        ', '.join(extensions),
        python_version,
    ]
def stop(self, stopSparkContext=False):
    """
    Close the Python H2O connection and invoke the stop method of the JVM-side H2OContext.

    :param stopSparkContext: passed through as the first (stopSpark) argument of the JVM stop method.
    """
    h2o.connection().close()
    # The method is looked up by its name-mangled JVM identifier.
    jvm_stop = getattr(self._jhc, "ai$h2o$sparkling$H2OContext$$stop")
    # stopSpark = stopSparkContext, stopJVM = False, inShutdownHook = False
    jvm_stop(stopSparkContext, False, False)
def stop(self):
    """
    Close the Python H2O connection and invoke the stop method of the JVM-side H2OContext.
    """
    h2o.connection().close()
    # The method is looked up by its name-mangled JVM identifier.
    jvm_stop = getattr(self._jhc, "org$apache$spark$h2o$H2OContext$$stop")
    # stopSpark = False, stopJVM = False, inShutdownHook = False
    jvm_stop(False, False, False)
def test_frame_reload(self):
    """
    Start a grid search with a recovery directory on one cluster, stop that cluster while the grid
    is in progress, then load and resume the grid on a second cluster and check that the full
    grid gets trained.
    """
    name_node = pyunit_utils.hadoop_namenode()
    work_dir = "hdfs://%s%s" % (name_node, utils.get_workdir())
    dataset = "/datasets/iris_wheader.csv"

    ntrees_opts = [100, 120, 130, 140]
    learn_rate_opts = [0.01, 0.02, 0.03, 0.04]
    grid_size = len(ntrees_opts) * len(learn_rate_opts)
    print("max models %s" % grid_size)
    grid_id = "grid_ft_resume"
    hyper_parameters = dict(learn_rate=learn_rate_opts, ntrees=ntrees_opts)

    # Phase 1: train part of the grid, then take the cluster down.
    first_cluster = "grid1-py"
    try:
        first_url = utils.start_cluster(first_cluster)
        h2o.connect(url=first_url)
        train = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
        grid = H2OGridSearch(
            H2OGradientBoostingEstimator,
            grid_id=grid_id,
            hyper_params=hyper_parameters,
            recovery_dir=work_dir,
        )
        print("starting initial grid and sleeping...")
        grid.start(x=list(range(4)), y=4, training_frame=train)

        partial_grid = None
        for _ in range(20):  # poll at most 20 times
            if partial_grid is not None and len(partial_grid.model_ids) != 0:
                break
            time.sleep(5)  # give it time to train some models
            try:
                partial_grid = h2o.get_grid(grid_id)
            except IndexError:
                print("no models trained yet")
        print("done sleeping")
        h2o.connection().close()
    finally:
        utils.stop_cluster(first_cluster)

    # Phase 2: load the saved grid on a fresh cluster and finish it.
    second_cluster = "grid2-py"
    try:
        second_url = utils.start_cluster(second_cluster)
        h2o.connect(url=second_url)
        loaded = h2o.load_grid("%s/%s" % (work_dir, grid_id), load_params_references=True)

        print("models after first run:")
        for model_id in sorted(loaded.model_ids):
            print(model_id)

        loaded.resume()

        print("models after second run:")
        for model_id in sorted(loaded.model_ids):
            print(model_id)

        print("Newly grained grid has %d models" % len(loaded.model_ids))
        self.assertEqual(len(loaded.model_ids), grid_size, "The full grid was not trained.")
        h2o.connection().close()
    finally:
        utils.stop_cluster(second_cluster)
def test_frame_reload(self):
    """
    Save a frame from one cluster, load it into a second cluster and check that the pandas
    representations of the original and the reloaded frame are equal.
    """
    name_node = utils.hadoop_namenode()
    work_dir = utils.get_workdir()
    dataset = "/datasets/mnist/train.csv.gz"

    # Cluster 1: import the dataset and save the frame to the work directory.
    saver = "saver-py"
    try:
        saver_url = utils.start_cluster(saver)
        h2o.connect(url=saver_url)
        original = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
        frame_key = original.key
        original_pd = original.as_data_frame()
        original.save(work_dir)
        h2o.connection().close()
    finally:
        utils.stop_cluster(saver)

    # Cluster 2: load the frame back by key.
    loader = "loader-py"
    try:
        loader_url = utils.start_cluster(loader)
        h2o.connect(url=loader_url)
        reloaded = h2o.load_frame(frame_key, work_dir)
        reloaded_pd = reloaded.as_data_frame()
        h2o.connection().close()
    finally:
        utils.stop_cluster(loader)

    self.assertTrue(original_pd.equals(reloaded_pd))
def stop(self):
    """
    Close the Python H2O connection, invoke the stop method of the JVM-side H2OContext and, when
    the ``spark.ext.h2o.rest.api.based.client`` option is ``"false"`` (its default here), exit the
    Python process.
    """
    h2o.connection().close()
    jvm_context = self._jhc.h2oContext()
    # The method is looked up by its name-mangled JVM identifier.
    jvm_stop = getattr(jvm_context, "org$apache$spark$h2o$H2OContext$$stop")
    # stopSpark = False, stopJVM = False, inShutdownHook = False
    jvm_stop(False, False, False)

    rest_client_option = self._conf.get("spark.ext.h2o.rest.api.based.client", "false")
    if rest_client_option == "false":
        sys.exit()
def test_isna():
    """
    Check ``H2OFrame.isna()``: shape, names and types must be available without issuing any
    request, and the materialised result must flag exactly the missing cells.
    """
    nan = float("nan")
    isna_names = ["isNA(A)", "isNA(B)", "isNA(C)", "isNA(D)"]
    isna_types = dict.fromkeys(isna_names, "int")

    columns = OrderedDict()
    columns["A"] = [1, 0, 3, 4, 8, 4, 7]
    columns["B"] = [2, nan, -1, nan, nan, 9, 0]
    columns["C"] = ["one", "", "two", "", "seventeen", "1", ""]
    columns["D"] = ["oneteen", "", "twoteen", "", "sixteen", "twenteen", ""]
    frame = h2o.H2OFrame.from_python(
        columns,
        na_strings=[""],
        column_types={"C": "enum", "D": "string"},
    )
    assert frame.shape == (7, 4)
    assert frame.names == ["A", "B", "C", "D"]
    assert frame.types == {"A": "int", "B": "int", "C": "enum", "D": "string"}, \
        "Actual types: %r" % frame.types

    isna = frame.isna()
    requests_before = h2o.connection().requests_count
    assert isna.shape == (7, 4)
    assert isna.names == isna_names
    # at some point we'll switch to 'bool' column type
    assert isna.types == isna_types, "Actual types: %r" % isna.types
    assert h2o.connection().requests_count == requests_before, "Frame isna should not be evaluated yet!"

    # Printing forces evaluation; the metadata must not change afterwards.
    print()
    print(isna)
    assert isna.shape == (7, 4)
    assert isna.names == isna_names
    assert isna.types == isna_types

    actual = isna.as_data_frame(use_pandas=False, header=False)
    # One string per row, one character per column.
    expected = [list(row) for row in ("0000", "0111", "0000", "0111", "0100", "0000", "0011")]
    assert actual == expected
def is_running(self):
    """
    Determine if the H2O cluster is running or not.

    :returns: True if the cluster is up; False otherwise
    """
    try:
        if h2o.connection().local_server:
            # A local server is attached to the connection: ask it directly first.
            if not h2o.connection().local_server.is_running():
                return False
        # Any successful response to this request counts as "up".
        h2o.api("GET /")
    except (H2OConnectionError, H2OServerError):
        return False
    return True
def show_status(self, detailed=False):
    """
    Print current cluster status information.

    The cached cluster information is refreshed first when it is older than ``REFRESH_INTERVAL``.

    :param detailed: if True, then also print detailed information about each node.
    """
    info_is_stale = self._retrieved_at + self.REFRESH_INTERVAL < time.time()
    if info_is_stale:
        self._fill_from_h2ocluster(h2o.api("GET /3/Cloud"))

    # Aggregate the per-node figures.
    total_cpus = sum(n["num_cpus"] for n in self.nodes)
    total_allowed_cpus = sum(n["cpus_allowed"] for n in self.nodes)
    total_free_mem = sum(n["free_mem"] for n in self.nodes)
    n_unhealthy = sum(not n["healthy"] for n in self.nodes)

    cluster_status = "locked" if self.locked else "accepting new members"
    if n_unhealthy == 0:
        cluster_status += ", healthy"
    else:
        cluster_status += ", %d nodes are not healthy" % n_unhealthy

    extensions = self.list_api_extensions()
    age_marker = "!!!" if self.build_too_old else ""

    summary_rows = [
        ["H2O cluster uptime:", get_human_readable_time(self.cloud_uptime_millis)],
        ["H2O cluster timezone:", self.cloud_internal_timezone],
        ["H2O data parsing timezone:", self.datafile_parser_timezone],
        ["H2O cluster version:", self.version],
        ["H2O cluster version age:", "{} {}".format(self.build_age, age_marker)],
        ["H2O cluster name:", self.cloud_name],
        ["H2O cluster total nodes:", self.cloud_size],
        ["H2O cluster free memory:", get_human_readable_bytes(total_free_mem)],
        ["H2O cluster total cores:", str(total_cpus)],
        ["H2O cluster allowed cores:", str(total_allowed_cpus)],
        ["H2O cluster status:", cluster_status],
        ["H2O connection url:", h2o.connection().base_url],
        ["H2O connection proxy:", h2o.connection().proxy],
        ["H2O internal security:", self.internal_security_enabled],
        ["H2O API Extensions:", ', '.join(extensions)],
        ["Python version:", "%d.%d.%d %s" % tuple(sys.version_info[:4])],
    ]
    H2ODisplay(summary_rows)

    if not detailed:
        return

    # One table row per node property, one column per node.
    keys = ["h2o", "healthy", "last_ping", "num_cpus", "sys_load", "mem_value_size", "free_mem",
            "pojo_mem", "swap_mem", "free_disk", "max_disk", "pid", "num_keys", "tcps_active",
            "open_fds", "rpcs_active"]
    header = ["Nodes info:"] + ["Node %d" % number for number in range(1, len(self.nodes) + 1)]
    table = [[key] for key in keys]
    for node in self.nodes:
        for row, key in zip(table, keys):
            row.append(node[key])
    H2ODisplay(table=table, header=header)
def h2oconnection():
    """
    Python API test: h2o.connection()

    Calls ``h2o.connection()`` with no arguments and checks the type of the returned object.
    """
    assert_is_type(h2o.connection(), H2OConnection)
def redirect_relative():
    """
    Request the Flow index page without authentication and without following redirects, and check
    that the server answers with a 302/303 redirect whose location starts with ``/login``.

    The test is skipped when the ``JOB_NAME`` environment variable identifies the Kerberos smoke
    pipeline.
    """
    job_name = os.environ.get('JOB_NAME')
    if job_name and job_name.startswith("h2o-3-kerberos-smoke-pipeline/"):
        h2o.log_and_echo(
            "Skipping test 'redirect_relative' on Kerberos pipeline (it is not configured with form_auth)"
        )
        return

    conn = h2o.connection()

    # Start from the connection's default request arguments...
    req_args = conn._request_args()
    browser_headers = req_args["headers"]
    browser_headers["User-Agent"] = "Mozilla/pyunit"
    # ...then invalidate the authentication and put the adjusted headers back.
    req_args["auth"] = None
    req_args["headers"] = browser_headers

    flow_url = conn._base_url + "/flow/index.html"
    response_flow = requests.request("GET", flow_url, allow_redirects=False, **req_args)
    print(response_flow)

    assert response_flow.status_code in (302, 303)
    assert response_flow.headers["location"].startswith("/login")
def _get_ast_str(self, top):
    """
    Build the expression string for this node.

    :param top: when truthy, the expression is always assigned to a temporary key.
    :returns: the cached scalar (as a string) or cached id when one is available; otherwise the
        ``(op args...)`` expression, wrapped in ``(tmp= <id> ...)`` when the node is the top one or
        is referenced from more than one place.

    NOTE: the result depends on ``gc.get_referrers(self)``; do not introduce extra objects that
    hold a reference to ``self`` in this method.
    """
    if not self._cache.is_empty():
        # Data already computed and cached; could be a "false-like" cached value
        if self._cache.is_scalar():
            return str(self._cache._data)
        return self._cache._id
    if self._cache._id is not None:
        # Data already computed under ID, but not cached
        return self._cache._id

    assert isinstance(self._children, tuple)
    exec_str = "({} {})".format(
        self._op, " ".join([ExprNode._arg_to_expr(child) for child in self._children]))

    def is_ast_expr(ref):
        # A list holding at least one ASTId is treated as an AST expression.
        return isinstance(ref, list) and any(isinstance(item, ASTId) for item in ref)

    referrers = gc.get_referrers(self)
    # Removing frames from the referrers to get a consistent behaviour across Py versions,
    # as stack frames don't appear in the referrers from Py 3.7.
    # Also removing the AST expressions built in astfun.py,
    # as they keep a reference to self if the lambda itself is using a free variable.
    proper_ref = [r for r in referrers if not inspect.isframe(r) and not is_ast_expr(r)]
    ref_cnt = len(proper_ref)
    del referrers, proper_ref

    # If this node is referenced by at least one other node (nested expr), then create a tmp frame.
    needs_tmp = top or ref_cnt > 1
    if needs_tmp:
        self._cache._id = _py_tmp_key(append=h2o.connection().session_id)
        exec_str = "(tmp= {} {})".format(self._cache._id, exec_str)
    return exec_str
def rapids(expr):
    """
    Execute a Rapids expression.

    :param expr: The rapids expression (ascii string).

    :returns: The JSON response (as a python dictionary) of the Rapids execution
    """
    payload = {
        "ast": expr,
        "session_id": h2o.connection().session_id,
    }
    return h2o.api("POST /99/Rapids", data=payload)
def show_status(self, detailed=False):
    """
    Print current cluster status information.

    The cached cluster information is refreshed first when it is older than ``REFRESH_INTERVAL``.

    :param detailed: if True, then also print detailed information about each node.
    """
    info_is_stale = self._retrieved_at + self.REFRESH_INTERVAL < time.time()
    if info_is_stale:
        self._fill_from_h2ocluster(h2o.api("GET /3/Cloud"))

    # Aggregate the per-node figures.
    total_cpus = sum(n["num_cpus"] for n in self.nodes)
    total_allowed_cpus = sum(n["cpus_allowed"] for n in self.nodes)
    total_free_mem = sum(n["free_mem"] for n in self.nodes)
    n_unhealthy = sum(not n["healthy"] for n in self.nodes)

    membership = "locked" if self.locked else "accepting new members"
    if n_unhealthy == 0:
        cluster_status = membership + ", healthy"
    else:
        cluster_status = membership + ", %d nodes are not healthy" % n_unhealthy

    age_marker = "!!!" if self.build_too_old else ""
    summary_rows = [
        ["H2O cluster uptime:", get_human_readable_time(self.cloud_uptime_millis)],
        ["H2O cluster version:", self.version],
        ["H2O cluster version age:", "{} {}".format(self.build_age, age_marker)],
        ["H2O cluster name:", self.cloud_name],
        ["H2O cluster total nodes:", self.cloud_size],
        ["H2O cluster free memory:", get_human_readable_bytes(total_free_mem)],
        ["H2O cluster total cores:", str(total_cpus)],
        ["H2O cluster allowed cores:", str(total_allowed_cpus)],
        ["H2O cluster status:", cluster_status],
        ["H2O connection url:", h2o.connection().base_url],
        ["H2O connection proxy:", h2o.connection().proxy],
        ["H2O internal security:", self.internal_security_enabled],
        ["Python version:", "%d.%d.%d %s" % tuple(sys.version_info[:4])],
    ]
    H2ODisplay(summary_rows)

    if not detailed:
        return

    # One table row per node property, one column per node.
    keys = ["h2o", "healthy", "last_ping", "num_cpus", "sys_load", "mem_value_size", "free_mem",
            "pojo_mem", "swap_mem", "free_disk", "max_disk", "pid", "num_keys", "tcps_active",
            "open_fds", "rpcs_active"]
    header = ["Nodes info:"] + ["Node %d" % number for number in range(1, len(self.nodes) + 1)]
    table = [[key] for key in keys]
    for node in self.nodes:
        for row, key in zip(table, keys):
            row.append(node[key])
    H2ODisplay(table=table, header=header)
def h2oconnection():
    """
    Python API test: h2o.connection()

    Calls ``h2o.connection()`` with no arguments and checks the type of the returned object.

    :raises AssertionError: if the call or the type check fails; the message carries the
        underlying exception so the failure can be diagnosed.
    """
    # call with no arguments
    try:
        temp = h2o.connection()
        assert_is_type(temp, H2OConnection)
    except Exception as e:
        # Raise explicitly instead of `assert False` (asserts are stripped under -O), and keep
        # the original cause in the message instead of discarding it.
        raise AssertionError("h2o.connection() command is not working. Cause: %r" % (e,))
def shutdown(self, prompt=False):
    """
    Shut down the server.

    This method checks if the H2O cluster is still running, and if it does shuts it down (via a REST API call).

    :param prompt: A logical value indicating whether to prompt the user before shutting down the H2O server.
    """
    if not self.is_running():
        return
    assert_is_type(prompt, bool)

    answer = "Y"
    if prompt:
        question = "Are you sure you want to shutdown the H2O instance running at %s (Y/N)? " \
                   % h2o.connection().base_url
        # works in Py2 & Py3 because redefined in h2o.utils.compatibility module
        answer = input(question)

    if answer.lower() not in {"y", "yes"}:
        return
    h2o.api("POST /3/Shutdown")
    h2o.connection().close()
def _do_it(self, top):
    """
    Build the expression string for this node.

    :param top: when truthy, the expression is always assigned to a temporary key.
    :returns: the cached scalar (as a string) or cached id when one is available; otherwise the
        ``(op args...)`` expression, wrapped in ``(tmp= <id> ...)`` when the node is the top one or
        its referrer count reaches ``ExprNode.MAGIC_REF_COUNT``.

    NOTE: the result depends on ``len(gc.get_referrers(self))``; do not introduce extra objects or
    call frames that hold a reference to ``self`` in this method.
    """
    if not self._cache.is_empty():
        # Data already computed and cached; could be a "false-like" cached value
        if self._cache.is_scalar():
            return str(self._cache._data)
        return self._cache._id
    if self._cache._id is not None:
        # Data already computed under ID, but not cached
        return self._cache._id

    exec_str = "({} {})".format(
        self._op, " ".join([ExprNode._arg_to_expr(child) for child in self._children]))

    gc_ref_cnt = len(gc.get_referrers(self))
    needs_tmp = top or gc_ref_cnt >= ExprNode.MAGIC_REF_COUNT
    if needs_tmp:
        self._cache._id = _py_tmp_key(append=h2o.connection().session_id)
        exec_str = "(tmp= {} {})".format(self._cache._id, exec_str)
    return exec_str
def _get_ast_str(self, top):
    """
    Build the expression string for this node.

    :param top: when truthy, the expression is always assigned to a temporary key.
    :returns: the cached scalar (as a string) or cached id when one is available; otherwise the
        ``(op args...)`` expression, wrapped in ``(tmp= <id> ...)`` when the node is the top one or
        its referrer count reaches ``ExprNode.MAGIC_REF_COUNT``.

    NOTE: the result depends on ``len(gc.get_referrers(self))``; do not introduce extra objects or
    call frames that hold a reference to ``self`` in this method.
    """
    if not self._cache.is_empty():
        # Data already computed and cached; could be a "false-like" cached value
        if self._cache.is_scalar():
            return str(self._cache._data)
        return self._cache._id
    if self._cache._id is not None:
        # Data already computed under ID, but not cached
        return self._cache._id

    exec_str = "({} {})".format(
        self._op, " ".join([ExprNode._arg_to_expr(child) for child in self._children]))

    gc_ref_cnt = len(gc.get_referrers(self))
    needs_tmp = top or gc_ref_cnt >= ExprNode.MAGIC_REF_COUNT
    if needs_tmp:
        self._cache._id = _py_tmp_key(append=h2o.connection().session_id)
        exec_str = "(tmp= {} {})".format(self._cache._id, exec_str)
    return exec_str
def test_isna():
    """
    Check ``H2OFrame.isna()``: shape, names and types must be available without issuing any
    request, and the materialised result must flag exactly the missing cells.
    """
    nan = float("nan")
    expected_names = ["isNA(A)", "isNA(B)", "isNA(C)", "isNA(D)"]
    expected_types = {name: "int" for name in expected_names}

    data = OrderedDict([
        ("A", [1, 0, 3, 4, 8, 4, 7]),
        ("B", [2, nan, -1, nan, nan, 9, 0]),
        ("C", ["one", "", "two", "", "seventeen", "1", ""]),
        ("D", ["oneteen", "", "twoteen", "", "sixteen", "twenteen", ""]),
    ])
    frame = h2o.H2OFrame.from_python(data, na_strings=[""], column_types={"C": "enum", "D": "string"})
    assert frame.shape == (7, 4)
    assert frame.names == ["A", "B", "C", "D"]
    assert frame.types == {"A": "int", "B": "int", "C": "enum", "D": "string"}, \
        "Actual types: %r" % frame.types

    isna = frame.isna()
    rc = h2o.connection().requests_count
    assert isna.shape == (7, 4)
    assert isna.names == expected_names
    # at some point we'll switch to 'bool' column type
    assert isna.types == expected_types, "Actual types: %r" % isna.types
    assert h2o.connection().requests_count == rc, "Frame isna should not be evaluated yet!"

    # Printing forces evaluation; the metadata must not change afterwards.
    print()
    print(isna)
    assert isna.shape == (7, 4)
    assert isna.names == expected_names
    assert isna.types == expected_types

    df = isna.as_data_frame(use_pandas=False, header=False)
    # Expected flags written column by column (A, B, C, D), then transposed into rows.
    flags_by_column = ("0000000", "0101100", "0101001", "0101001")
    expected_rows = [list(row) for row in zip(*flags_by_column)]
    assert df == expected_rows
def connect_invalid():
    """
    Check that requests made with invalid or missing credentials are rejected with HTTP 401.

    The test is skipped when the current connection carries no authentication. A shallow copy of
    the current connection is used so that the real connection's credentials are not modified.
    """
    current = h2o.connection()
    print(current._auth)
    if current._auth is None:
        print("Skipping test running in non-authenticated environment")
        return

    invalid = copy.copy(current)

    # Not invalid yet - first do sanity check that original connection can be used to make a request
    invalid.request("GET /3/About")

    # NOTE: the previous version also assigned (<current user>, "invalid-password") at this point,
    # but that value was overwritten before any request was made, so it was never tested.
    # The dead assignment is removed; the credentials actually exercised are unchanged.

    def assert_unauthorized(auth):
        # Issue a request with the given credentials and require an HTTP 401 server error.
        invalid._auth = auth
        err = None
        try:
            invalid.request("GET /3/About")
        except H2OServerError as e:
            err = e
        assert err is not None, "Request was expected to be rejected but it succeeded"
        msg = str(err.args[0])
        print("<Error message>")
        print(msg)
        print("</Error Message>")
        assert msg.startswith("HTTP 401")  # Unauthorized

    # Test with invalid basic auth
    assert_unauthorized(("invalid-user", "invalid-password"))

    # Test without any auth
    assert_unauthorized(None)
def test_import_from_long_urls():
    """
    Import a frame through a URL padded with 512 extra characters and check that it equals the
    frame it was downloaded from.
    """
    source_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
    prostate = h2o.import_file(path=source_path)

    padding = "x" * 512

    # get the data from H2O to avoid making calls to public internet
    conn = h2o.connection()
    download_query = "/3/DownloadDataset?frame_id=%s&hex_string=false&padding=" % prostate.frame_id
    url = conn._base_url + download_query + padding

    prostate_from_self = h2o.import_file(url)
    assert_frame_equal(prostate_from_self.as_data_frame(), prostate.as_data_frame())
def to_pojo(self, pojo_name="", path="", get_jar=True):
    """
    Fetch the Java source generated for this assembly and print it or save it to disk.

    :param pojo_name: name used for the generated class and file; when empty, a name of the form
        ``AssemblyPOJO_<uuid4>`` is generated.
    :param path: directory to write ``<pojo_name>.java`` into; when empty, the source is printed
        instead and nothing is written.
    :param get_jar: when True and ``path`` is not empty, also download ``h2o-genmodel.jar`` into
        ``path``.
    """
    from contextlib import closing  # local import: keeps this block self-contained

    if pojo_name == "":
        pojo_name = "AssemblyPOJO_" + str(uuid.uuid4())
    java = h2o.api("GET /99/Assembly.java/%s/%s" % (self.id, pojo_name))

    if path == "":
        print(java)
        return

    file_path = path + "/" + pojo_name + ".java"
    with open(file_path, 'w', encoding="utf-8") as f:
        f.write(java)  # this had better be utf-8 ?

    if get_jar:
        url = h2o.connection().make_url("h2o-genmodel.jar")
        filename = path + "/" + "h2o-genmodel.jar"
        # closing() guarantees the HTTP response is released even if writing the file fails.
        with closing(urlopen()(url)) as response:
            with open(filename, "wb") as f:
                f.write(response.read())
def __init__(self, date=None, n_training_days=20, validation=True):
    """
    Load the validation and training frames and set up the variable dictionary.

    :param date: game day as a ``YYYY-MM-DD`` string; defaults to today.
    :param n_training_days: number of days before ``date`` whose csv files form the training data.
    :param validation: when True, the csv file of ``date`` itself is imported as validation frame.
    """

    def day_stamp(moment):
        # Date part of the default string form of a datetime.
        return str(moment).split()[0]

    def csv_for(day):
        return 'data/rotoguru-' + day + '.csv'

    ## initialize h2o if not already running
    if h2o.connection() is None:
        h2o.init()

    ## default value of date is today
    if date is None:
        date = day_stamp(dt.now())

    ## validation frame
    self.valid = None
    if validation:
        self.valid = h2o.import_file(csv_for(date))

    ## make list of csv files and import as training
    self.dt_gameday = dt.strptime(date, '%Y-%m-%d')
    train_files = [
        csv_for(day_stamp(self.dt_gameday - td(days=days_back)))
        for days_back in range(1, n_training_days + 1)
    ]
    self.train = h2o.import_file(train_files)

    ## output variable
    self.response = 'Fan Points'

    ## recordclass for vars_dict
    self.Var = recordclass('Var', 'include is_cat')
    V = self.Var  ## for readability/to avoid overflow

    # (name, include, is_cat) for every known column
    column_flags = [
        ## rotoguru
        ('Date', 0, 1),
        ('GID', 0, 1),
        ('Pos', 1, 1),
        ('Name', 1, 1),
        ('Starter', 0, 1),
        ('FD Pts', 0, 0),
        ('FD Salary', 1, 0),
        ('Team', 1, 1),
        ('H/A', 1, 1),
        ('Oppt', 1, 1),
        ('Team Score', 0, 0),
        ('Oppt Score', 0, 0),
        ('Minutes', 0, 0),
        ('Stat line', 0, 1),
        ## derived
        ('Fan Points', 0, 0),
        ('Assists', 0, 0),
        ('Rebounds', 0, 0),
        ('Blocks', 0, 0),
        ('Points', 0, 0),
        ('Steals', 0, 0),
        ('Turnovers', 0, 0),
    ]
    known_columns = {name: V(include, is_cat) for name, include, is_cat in column_flags}
    # Unknown columns default to "not included, categorical".
    self.var_dict = defaultdict(lambda: V(0, 1), known_columns)
def check_strict():
    """
    Check that ``h2o.init()`` performs the version check.

    A trace function is installed for the duration of the ``h2o.init()`` call and records whether
    any function named ``version_check`` gets called. The previously installed trace function is
    restored afterwards so that the hook does not leak into code that runs later.
    """
    # We may be either connected to an existing h2o server, or not. If we are, then discover the connection settings
    # so that we don't have to start a new server (starting a new server may be not possible if h2o.jar is located in
    # some unknown to us place in the system).
    hc = h2o.connection()
    url = None
    if hc is not None:
        url = hc.base_url

    out = {"version_check_called": False}

    def tracefunc(frame, event, arg):
        if frame.f_code.co_name == "version_check":
            out["version_check_called"] = True
        # Returning None disables line-level tracing of the scope being entered.
        return None

    previous_trace = sys.gettrace()
    sys.settrace(tracefunc)
    try:
        h2o.init(url=url)
    except H2OConnectionError:
        # Only the version check matters here; a failed connection is not an error for this test.
        pass
    finally:
        sys.settrace(previous_trace)

    assert out["version_check_called"], \
        "Strict version checking got turned off! TURN IT BACK ON NOW YOU JERK!"
def predict(self, test_data, predict_proba = False, pred_class_and_proba = False):
    """ Use pred_class_and_proba to produce both predicted probabilities and predicted classes.
        If this is regression problem, predict_proba and pred_class_and_proba are disregarded.
        Label column should not be present in test_data.

        H2O is shut down before this method returns, and also when prediction fails.

        Returns: Tuple (y_pred, y_prob, inference_time) where any element may be None.
        y_prob is a 2D numpy array of predicted probabilities, where each column represents a class.
        The ith column represents the class found via: self.classes[i]
    """
    h2o_model = self.model
    if self.problem_type == REGRESSION:
        pred_class_and_proba = False
        predict_proba = False
    y_pred = None
    y_prob = None
    try:
        t0 = time.time()
        test = h2o.H2OFrame(test_data)
        preds_df = h2o_model.predict(test).as_data_frame(use_pandas=True)
        t1 = time.time()
        predict_time = t1 - t0

        # Compare by value: identity (`is not`) against a constant is not a reliable test.
        if self.problem_type != REGRESSION:
            self.classes = preds_df.columns.tolist()[1:]
            if self.problem_type in [BINARY, MULTICLASS]:
                self.classes = self.remove_label_prefix_class(self.classes)

        if (not predict_proba) or pred_class_and_proba:
            y_pred = preds_df.iloc[:, 0]
            if self.problem_type in [BINARY, MULTICLASS]:
                y_pred = pd.Series(self.remove_label_prefix_class(list(y_pred.values)), index=y_pred.index)

        if predict_proba or pred_class_and_proba:
            y_prob = preds_df.iloc[:, 1:].values
    finally:
        # Shutdown H2O before returning value (and also when prediction failed):
        conn = h2o.connection()
        if conn:
            h2o.remove_all()
            conn.close()
        # Guard against a missing connection: dereferencing None here would raise AttributeError.
        if conn is not None and conn.local_server:
            conn.local_server.shutdown()

    return (y_pred, y_prob, predict_time)
def test_auto_recovery(self):
    """
    Train a grid in a background thread while the cluster is restarted twice with auto-recovery
    enabled, and check that the number of models grows after the first restart and that the full
    grid is trained in the end.
    """
    name_node = pyunit_utils.hadoop_namenode()
    dataset = "/datasets/iris_wheader.csv"

    ntrees_opts = [100, 120, 130, 140]
    learn_rate_opts = [0.01, 0.02, 0.03, 0.04]
    grid_size = len(ntrees_opts) * len(learn_rate_opts)
    print("max models %s" % grid_size)
    grid_id = "grid_ft_auto_recover"
    hyper_parameters = dict(learn_rate=learn_rate_opts, ntrees=ntrees_opts)

    # Cluster 1: start training in the background and wait for the first models.
    first_cluster = "grid-auto-1-py"
    try:
        first_url = utils.start_cluster(first_cluster, enable_auto_recovery=True, clean_auto_recovery=True)
        print("initial cluster started at %s" % first_url)
        h2o.connect(url=first_url)
        train = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
        grid = H2OGridSearch(H2OGradientBoostingEstimator, grid_id=grid_id, hyper_params=hyper_parameters)
        bg_train_thread = threading.Thread(
            target=self._training_thread,
            kwargs=dict(grid=grid, train=train),
        )
        bg_train_thread.start()
        phase_1_models = self._wait_for_model_to_build(grid_id)
        self._print_models("Initial models", phase_1_models)
        assert len(phase_1_models) > 0
        self._check_training_error()
    finally:
        utils.stop_cluster(first_cluster)

    # Cluster 2: recovery must produce at least one more model.
    second_cluster = "grid-auto-2-py"
    try:
        second_url = utils.start_cluster(second_cluster, enable_auto_recovery=True)
        print("cluster resumed at %s, should unblock background thread" % second_url)
        minimum_models = len(phase_1_models) + 1
        phase_2_models = self._wait_for_model_to_build(grid_id, minimum_models)
        self._print_models("Recovery #1 models", phase_2_models)
        assert len(phase_2_models) > len(phase_1_models)
        self._check_training_error()
    finally:
        utils.stop_cluster(second_cluster)

    # Cluster 3: let the background training run to completion.
    third_cluster = "grid-auto-3-py"
    try:
        third_url = utils.start_cluster(third_cluster, enable_auto_recovery=True)
        print("cluster resumed at %s, waiting for training to finish" % third_url)
        bg_train_thread.join()
        print("models after final run:")
        for model_id in sorted(grid.model_ids):
            print(model_id)
        print("Finished grained grid has %d models" % len(grid.model_ids))
        self.assertEqual(len(grid.model_ids), grid_size, "The full grid was not trained.")
        self._check_training_error()
        h2o.connection().close()
    finally:
        utils.stop_cluster(third_cluster)
def date_munge():
    """
    Derive date-based columns on the Chicago crimes frame and check how many temporaries and REST
    calls the whole pipeline needs.

    NOTE: the frame expressions below are intentionally left exactly as they are; the counts
    asserted at the end depend on how these expressions are built and evaluated.
    """
    crimes_path = pyunit_utils.locate("smalldata/chicago/chicagoCrimes10k.csv.zip")
    hc = h2o.connection()
    tmps0 = pyunit_utils.temp_ctr()

    # GET /3/ImportFiles
    # POST /3/ParseSetup
    # POST /3/Parse
    # GET /3/Job/{job_id} (multiple times)
    # GET /3/Frames/crimes
    crimes = h2o.import_file(path=crimes_path, destination_frame="crimes")
    rest1 = hc.requests_count

    def report_rest_calls():
        # Print the number of requests issued since the import finished.
        print("# of REST calls used: %d" % (hc.requests_count - rest1))

    crimes["Day"] = crimes["Date"].day()
    crimes["Month"] = crimes["Date"].month() + 1  # Since H2O indexes from 0
    crimes["Year"] = crimes["Date"].year() + 1900  # Start of epoch is 1900
    crimes["WeekNum"] = crimes["Date"].week()
    crimes["WeekDay"] = crimes["Date"].dayOfWeek()
    crimes["HourOfDay"] = crimes["Date"].hour()
    report_rest_calls()

    crimes["Weekend"] = (crimes["WeekDay"] == "Sun") | (crimes["WeekDay"] == "Sat")
    report_rest_calls()

    crimes["Season"] = crimes["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
    report_rest_calls()

    crimes = crimes.drop("Date")
    report_rest_calls()

    # POST /4/sessions
    # POST /99/Rapids {ast:(tmp= py8 (cols (append
    #       (tmp= py7 (append
    #        (tmp= py6 (append
    #         (tmp= py5 (append
    #          (tmp= py4 (append
    #           (tmp= py3 (:=
    #            (tmp= py2 (append
    #             (tmp= py1 (append crimes (day (cols_py chicagoCrimes10k.hex "Date")) "Day")
    #             ) (+ (month (cols_py py1 "Date")) 1) "Month"))
    #            (+ (year (cols_py py2 "Date")) 1900) 17 []))
    #           (week (cols_py py3 "Date")) "WeekNum"))
    #          (dayOfWeek (cols_py py4 "Date")) "WeekDay"))
    #         (hour (cols_py py5 "Date")) "HourOfDay"))
    #        (| (== (cols_py py6 "WeekDay") "Sun")
    #           (== (cols_py py6 "WeekDay") "Sat")) "Weekend"))
    #       (cut (cols_py py7 "Month") [0 2 5 7 10 12]
    #            ["Winter" "Spring" "Summer" "Autumn" "Winter"] FALSE TRUE 3) "Season") -3))}
    # GET /3/Frames/py8
    crimes.describe()
    report_rest_calls()

    ntmps = pyunit_utils.temp_ctr() - tmps0
    nrest = pyunit_utils.rest_ctr() - rest1
    print("Number of temps used: %d" % ntmps)
    print("Number of RESTs used: %d" % nrest)
    assert ntmps == 8
    assert nrest == 3
def run(dataset: Dataset, config: TaskConfig):
    """
    Run H2O AutoML on the given dataset and save its predictions and artifacts.

    A local H2O cluster is started with the cores and memory taken from ``config``; framework
    parameters whose name starts with ``_`` are treated as runner options and are not forwarded
    to ``H2OAutoML``. The H2O connection (and its local server, if any) is shut down on exit,
    whether the run succeeded or not.

    :param dataset: benchmark dataset providing train/test paths and the target.
    :param config: task configuration (metric, cores, memory, runtime, seed, framework params).
    :returns: dict with ``models_count`` and ``training_duration``.
    :raises NoResultError: if AutoML produced no leader model in the requested time.
    """
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping.get(config.metric)
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)

    try:
        training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
        nthreads = config.framework_params.get('_nthreads', config.cores)

        log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads, config.max_mem_size_mb)
        h2o.init(
            nthreads=nthreads,
            min_mem_size=str(config.max_mem_size_mb) + "M",
            max_mem_size=str(config.max_mem_size_mb) + "M",
            # log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold))
        )

        # Both train and test data are imported as H2O frames.
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path, destination_frame=frame_name('train', config))
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path, destination_frame=frame_name('test', config))

        log.info("Running model on task %s, fold %s.", config.name, config.fold)
        log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
                  config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError("H2O could not produce any model in the requested time.")

        save_predictions(aml, test, dataset=dataset, config=config)
        save_artifacts(aml, dataset=dataset, config=config)

        return dict(models_count=len(aml.leaderboard),
                    training_duration=training.duration)
    finally:
        conn = h2o.connection()
        if conn:
            h2o.remove_all()
            conn.close()
        # Guard against a missing connection (e.g. h2o.init failed): dereferencing None here
        # would raise AttributeError and hide the original error.
        if conn is not None and conn.local_server:
            conn.local_server.shutdown()
def run(dataset: Dataset, config: TaskConfig):
    """
    Run H2O AutoML on the given dataset and save its predictions and artifacts.

    A local H2O cluster is started with two thirds of the configured memory, on the port given by
    the ``_port`` framework parameter or on one derived from the process id. Framework parameters
    whose name starts with ``_`` are treated as runner options and are not forwarded to
    ``H2OAutoML``. The H2O connection (and its local server, if any) is shut down on exit, whether
    the run succeeded or not.

    :param dataset: benchmark dataset providing train/test paths and the target.
    :param config: task configuration (metric, cores, memory, runtime, seed, framework params).
    :returns: dict with ``models_count`` and ``training_duration``.
    :raises NoResultError: if AutoML produced no leader model in the requested time.
    """
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           r2='r2',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping.get(config.metric)
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)

    try:
        training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
        nthreads = config.framework_params.get('_nthreads', config.cores)
        # leaving 1/3rd of available memory for XGBoost
        jvm_memory = str(round(config.max_mem_size_mb * 2 / 3)) + "M"

        log.info("Starting H2O cluster with %s cores, %s memory.", nthreads, jvm_memory)
        # Derive a default port from the process id, within [min_port_range, max_port_range).
        max_port_range = 49151
        min_port_range = 1024
        rnd_port = os.getpid() % (max_port_range - min_port_range) + min_port_range
        port = config.framework_params.get('_port', rnd_port)

        h2o.init(
            nthreads=nthreads,
            port=port,
            min_mem_size=jvm_memory,
            max_mem_size=jvm_memory,
            strict_version_check=config.framework_params.get('_strict_version_check', True)
            # log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold))
        )

        # Both train and test data are imported as H2O frames.
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path, destination_frame=frame_name('train', config))
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path, destination_frame=frame_name('test', config))

        log.info("Running model on task %s, fold %s.", config.name, config.fold)
        log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
                  config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        max_runtime_secs_per_model=round(config.max_runtime_seconds / 2),  # to prevent timeout on ensembles
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        monitor = (BackendMemoryMonitoring(frequency_seconds=rconfig().monitoring.frequency_seconds,
                                           check_on_exit=True,
                                           verbosity=rconfig().monitoring.verbosity)
                   if config.framework_params.get('_monitor_backend', False)
                   # else contextlib.nullcontext  # Py 3.7+ only
                   else contextlib.contextmanager(iter)([0]))  # stand-in for a no-op context manager
        with Timer() as training:
            with monitor:
                aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError("H2O could not produce any model in the requested time.")

        save_predictions(aml, test, dataset=dataset, config=config)
        save_artifacts(aml, dataset=dataset, config=config)

        return dict(models_count=len(aml.leaderboard),
                    training_duration=training.duration)
    finally:
        conn = h2o.connection()
        if conn:
            # h2o.remove_all()
            conn.close()
        # Guard against a missing connection (e.g. h2o.init failed): dereferencing None here
        # would raise AttributeError and hide the original error.
        if conn is not None and conn.local_server:
            conn.local_server.shutdown()
def run(dataset: Dataset, config: TaskConfig):
    """Train H2O AutoML on one benchmark task and write leaderboard and predictions.

    Starts an H2O cluster using the full ``config.max_mem_size_mb`` as heap,
    imports the train and test files, runs ``H2OAutoML``, writes the
    leaderboard next to the predictions file (``.leaderboard.csv`` extension)
    and saves predictions, probabilities and truth through
    ``save_predictions_to_file``.

    :param dataset: benchmark dataset; ``train.path``, ``test.path`` and
        ``target.index`` are read.
    :param config: task configuration. ``framework_params`` keys starting with
        ``_`` are kept out of the ``H2OAutoML`` arguments; ``_nthreads``
        overrides ``config.cores``.
    :return: dict with ``models_count`` and ``training_duration``.
    :raises NoResultError: if AutoML ends without a leader model.
    """
    log.info("\n**** H2O AutoML ****\n")

    # Benchmark metric name -> H2O leaderboard sort metric.
    h2o_metric_names = {
        'acc': 'mean_per_class_error',
        'auc': 'AUC',
        'logloss': 'logloss',
        'mae': 'mae',
        'mse': 'mse',
        'rmse': 'rmse',
        'rmsle': 'rmsle',
    }
    sort_metric = h2o_metric_names.get(config.metric)
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)

    try:
        params = config.framework_params
        automl_kwargs = {}
        for key, value in params.items():
            if not key.startswith('_'):
                automl_kwargs[key] = value

        nthreads = params.get('_nthreads', config.cores)
        log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads, config.max_mem_size_mb)

        # Min and max heap are pinned to the same value.
        heap_size = "{}M".format(config.max_mem_size_mb)
        log_dir = os.path.join(config.output_dir, 'logs', config.name, str(config.fold))
        h2o.init(nthreads=nthreads,
                 min_mem_size=heap_size,
                 max_mem_size=heap_size,
                 log_dir=log_dir)

        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path)
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path)

        log.info("Running model on task %s, fold %s.", config.name, config.fold)
        log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
                  config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(
            max_runtime_secs=config.max_runtime_seconds,
            sort_metric=sort_metric,
            seed=config.seed,
            **automl_kwargs
        )

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError("H2O could not produce any model in the requested time.")

        # Leaderboard goes to <predictions file> with a '.leaderboard.csv' extension.
        leaderboard = aml.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", leaderboard.to_string())
        leaderboard_path = split_path(config.output_predictions_file)
        leaderboard_path.extension = '.leaderboard.csv'
        write_csv(leaderboard, path_from_split(leaderboard_path))

        # Predictions come back as rows with a header row first.
        raw_preds = aml.predict(test).as_data_frame(use_pandas=False)
        preds = to_data_frame(raw_preds[1:], columns=raw_preds[0])

        raw_truth = test[:, dataset.target.index].as_data_frame(use_pandas=False, header=False)
        truth_frame = to_data_frame(raw_truth)

        # First column holds the predicted value, remaining columns the probabilities.
        save_predictions_to_file(dataset=dataset,
                                 output_file=config.output_predictions_file,
                                 probabilities=preds.iloc[:, 1:].values,
                                 predictions=preds.iloc[:, 0].values,
                                 truth=truth_frame.values)

        return {
            'models_count': len(aml.leaderboard),
            'training_duration': training.duration,
        }

    finally:
        # Drop cluster objects, close the client connection, then stop the local server if any.
        connection = h2o.connection()
        if connection:
            h2o.remove_all()
            connection.close()
            if connection.local_server:
                connection.local_server.shutdown()
def h2oinitfunc(self):
    """Initialise H2O and report the connection state in the response field.

    Calls ``h2o.init()`` and then writes either a success or a failure
    message to ``self.lineEdit_PM_h2o_response``.

    Errors raised by ``h2o.init()`` itself are not caught here and propagate
    to the caller.
    """
    h2o.init()

    # Match only the "<H2OConnection to " prefix of the connection's text form
    # instead of one exact literal: an exact match on
    # "http://localhost:54321, no session" reports a healthy connection as
    # failed whenever the cluster is reached under another host/port or a
    # session already exists.
    # NOTE(review): this still relies on the text format of the connection
    # object - confirm against the installed h2o version.
    connection_text = str(h2o.connection())
    if connection_text.startswith("<H2OConnection to "):
        self.lineEdit_PM_h2o_response.setText("Connection to H2O cluster Successful")
    else:
        self.lineEdit_PM_h2o_response.setText("Connection to H2O cluster Failed")
def date_munge():
    """Derive date-part columns on the Chicago crimes sample and check the cost.

    Imports the crimes file, adds ``Day``, ``Month``, ``Year``, ``WeekNum``,
    ``WeekDay``, ``HourOfDay``, ``Weekend`` and ``Season`` columns computed
    from ``Date``, drops ``Date``, calls ``describe()`` and finally asserts
    that exactly 8 temporaries and 3 REST requests were used since the import.

    The ``# GET`` / ``# POST`` comments list the REST traffic the author
    expected at each point.
    """
    crimes_path = pyunit_utils.locate("smalldata/chicago/chicagoCrimes10k.csv.zip")
    # crimes_path = "smalldata/chicago/chicagoCrimes10k.csv.zip"

    hc = h2o.connection()
    # Baseline for the temporaries counter, taken before any frame work.
    tmps0 = pyunit_utils.temp_ctr()

    # GET /3/ImportFiles
    # POST /3/ParseSetup
    # POST /3/Parse
    # GET /3/Job/{job_id} (multiple times)
    # GET /3/Frames/crimes
    crimes = h2o.import_file(path=crimes_path, destination_frame="crimes")

    # Baseline for the request counter, taken after the import so that the
    # import's own requests are excluded from the totals below.
    rest1 = hc.requests_count

    crimes["Day"] = crimes["Date"].day()
    crimes["Month"] = crimes["Date"].month() + 1  # Since H2O indexes from 0
    crimes["Year"] = crimes["Date"].year() + 1900  # Start of epoch is 1900
    crimes["WeekNum"] = crimes["Date"].week()
    crimes["WeekDay"] = crimes["Date"].dayOfWeek()
    crimes["HourOfDay"] = crimes["Date"].hour()
    print("# of REST calls used: %d" % (hc.requests_count - rest1))

    crimes["Weekend"] = (crimes["WeekDay"] == "Sun") | (crimes["WeekDay"] == "Sat")
    print("# of REST calls used: %d" % (hc.requests_count - rest1))

    crimes["Season"] = crimes["Month"].cut(
        [0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
    print("# of REST calls used: %d" % (hc.requests_count - rest1))

    crimes = crimes.drop("Date")
    print("# of REST calls used: %d" % (hc.requests_count - rest1))

    # POST /4/sessions
    # POST /99/Rapids {ast:(tmp= py8 (cols (append
    #     (tmp= py7 (append
    #     (tmp= py6 (append
    #     (tmp= py5 (append
    #     (tmp= py4 (append
    #     (tmp= py3 (:=
    #     (tmp= py2 (append
    #     (tmp= py1 (append crimes (day (cols_py chicagoCrimes10k.hex "Date")) "Day")
    #     ) (+ (month (cols_py py1 "Date")) 1) "Month"))
    #     (+ (year (cols_py py2 "Date")) 1900) 17 []))
    #     (week (cols_py py3 "Date")) "WeekNum"))
    #     (dayOfWeek (cols_py py4 "Date")) "WeekDay"))
    #     (hour (cols_py py5 "Date")) "HourOfDay"))
    #     (| (== (cols_py py6 "WeekDay") "Sun")
    #        (== (cols_py py6 "WeekDay") "Sat")) "Weekend"))
    #     (cut (cols_py py7 "Month") [0 2 5 7 10 12]
    #          ["Winter" "Spring" "Summer" "Autumn" "Winter"] FALSE TRUE 3) "Season") -3))}
    # GET /3/Frames/py8
    # NOTE(review): the three requests listed above look like the ones the
    # final `nrest == 3` assertion counts - confirm.
    crimes.describe()
    print("# of REST calls used: %d" % (hc.requests_count - rest1))

    ntmps = pyunit_utils.temp_ctr() - tmps0
    # NOTE(review): rest1 was read from hc.requests_count while the end value
    # comes from pyunit_utils.rest_ctr(); presumably both read the same
    # counter - confirm.
    nrest = pyunit_utils.rest_ctr() - rest1
    print("Number of temps used: %d" % ntmps)
    print("Number of RESTs used: %d" % nrest)
    assert ntmps == 8
    assert nrest == 3
def run(dataset, config):
    """Train H2O AutoML on one benchmark task and return the benchmark result.

    Starts an H2O cluster sized from ``config``, imports the train and test
    files as H2O frames (retrying the train import with a different quote
    character when ``verify_loaded_frame`` rejects it), runs ``H2OAutoML``
    within the task's time budget, predicts on the test frame and saves the
    requested artifacts.

    :param dataset: benchmark dataset; ``train.path``, ``test.path`` and
        ``target.index`` are read.
    :param config: task configuration. Keys of ``framework_params`` starting
        with ``_`` steer this integration (``_nthreads``, ``_port``, ``_init``,
        ``_save_artifacts``, ``_monitor_backend``); every other key is
        forwarded to ``H2OAutoML``. ``framework_params`` is not modified.
    :return: whatever ``result(...)`` builds from the predictions, the model
        count and the training/prediction durations.
    :raises FrameworkError: if AutoML ends without a leader model.
    """
    log.info(f"\n**** H2O AutoML [v{h2o.__version__}] ****\n")
    save_metadata(config, version=h2o.__version__)

    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(
        acc='mean_per_class_error',
        auc='AUC',
        logloss='logloss',
        mae='mae',
        mse='mse',
        r2='r2',
        rmse='rmse',
        rmsle='rmsle',
    )
    sort_metric = metrics_mapping.get(config.metric)
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)

    try:
        framework_params = config.framework_params
        training_params = {k: v for k, v in framework_params.items() if not k.startswith('_')}
        nthreads = framework_params.get('_nthreads', config.cores)
        # leaving 1/3rd of available memory for XGBoost
        jvm_memory = str(round(config.max_mem_size_mb * 2 / 3)) + "M"

        log.info("Starting H2O cluster with %s cores, %s memory.", nthreads, jvm_memory)
        # Fallback port is derived from the process id and folded into [1024, 49151).
        max_port_range = 49151
        min_port_range = 1024
        rnd_port = os.getpid() % (max_port_range - min_port_range) + min_port_range
        port = framework_params.get('_port', rnd_port)

        # Work on a copy: adding 'ice_root' below must not leak into the
        # caller's framework_params['_init'] mapping.
        init_params = {**framework_params.get('_init', {})}
        if "logs" in framework_params.get('_save_artifacts', []):
            init_params['ice_root'] = output_subdir("logs", config)

        h2o.init(nthreads=nthreads,
                 port=port,
                 min_mem_size=jvm_memory,
                 max_mem_size=jvm_memory,
                 **init_params)

        import_kwargs = {}
        # Load both train and test as H2O frames
        log.debug("Loading train data from %s.", dataset.train.path)
        train = None
        if version.parse(h2o.__version__) >= version.parse("3.32.0.3"):
            # previous versions may fail to parse correctly some rare arff files using single quotes
            # as enum/string delimiters (pandas also fails on same datasets)
            import_kwargs['quotechar'] = '"'
            train = h2o.import_file(dataset.train.path,
                                    destination_frame=frame_name('train', config),
                                    **import_kwargs)
            if not verify_loaded_frame(train, dataset):
                # Discard the rejected frame and retry below with single quotes.
                h2o.remove(train)
                train = None
                import_kwargs['quotechar'] = "'"

        # Identity check: "no frame loaded yet" must not depend on the
        # truthiness of a loaded frame object.
        if train is None:
            train = h2o.import_file(dataset.train.path,
                                    destination_frame=frame_name('train', config),
                                    **import_kwargs)
            # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        # The test file is parsed with whatever quotechar was settled on for train.
        test = h2o.import_file(dataset.test.path,
                               destination_frame=frame_name('test', config),
                               **import_kwargs)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name, config.fold)
        log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
                  config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        monitor = (BackendMemoryMonitoring(frequency_seconds=config.ext.monitoring.frequency_seconds,
                                           check_on_exit=True,
                                           verbosity=config.ext.monitoring.verbosity)
                   if framework_params.get('_monitor_backend', False)
                   # else contextlib.nullcontext  # Py 3.7+ only
                   else contextlib.contextmanager(iter)([0]))
        with utils.Timer() as training:
            with monitor:
                aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise FrameworkError("H2O could not produce any model in the requested time.")

        with utils.Timer() as predict:
            preds = aml.predict(test)

        preds = extract_preds(preds, test, dataset=dataset)
        save_artifacts(aml, dataset=dataset, config=config)

        return result(
            output_file=config.output_predictions_file,
            predictions=preds.predictions,
            truth=preds.truth,
            probabilities=preds.probabilities,
            probabilities_labels=preds.probabilities_labels,
            models_count=len(aml.leaderboard),
            training_duration=training.duration,
            predict_duration=predict.duration,
        )

    finally:
        # Fetch the connection once; it may be missing if h2o.init() failed,
        # in which case there is nothing to close or shut down.
        connection = h2o.connection()
        if connection:
            # h2o.remove_all()
            connection.close()
            if connection.local_server:
                connection.local_server.shutdown()