Example #1
 def _get_cluster_status_info_values(self):
     if self._retrieved_at + self.REFRESH_INTERVAL < time.time():
         # Info is stale, need to refresh
         new_info = h2o.api("GET /3/Cloud")
         self._fill_from_h2ocluster(new_info)
     ncpus = sum(node["num_cpus"] for node in self.nodes)
     allowed_cpus = sum(node["cpus_allowed"] for node in self.nodes)
     free_mem = sum(node["free_mem"] for node in self.nodes)
     unhealthy_nodes = sum(not node["healthy"] for node in self.nodes)
     status = "locked" if self.locked else "accepting new members"
     if unhealthy_nodes == 0:
         status += ", healthy"
     else:
         status += ", %d nodes are not healthy" % unhealthy_nodes
     api_extensions = self.list_api_extensions()
     values = [
         get_human_readable_time(self.cloud_uptime_millis),
         self.cloud_internal_timezone, self.datafile_parser_timezone,
         self.version,
         "{} {}".format(self.build_age,
                        ("!!!" if self.build_too_old else "")),
         self.cloud_name, self.cloud_size,
         get_human_readable_bytes(free_mem), ncpus, allowed_cpus, status,
         h2o.connection().base_url,
         json.dumps(h2o.connection().proxy), self.internal_security_enabled,
         ', '.join(api_extensions),
         "%d.%d.%d %s" % tuple(sys.version_info[:4])
     ]
     return values
Example #2
 def stop(self, stopSparkContext=False):
     h2o.connection().close()
     scalaStopMethod = getattr(self._jhc,
                               "ai$h2o$sparkling$H2OContext$$stop")
     scalaStopMethod(
         stopSparkContext, False, False
     )  # stopSpark = False, stopJVM = False, inShutdownHook = False
Example #3
 def stop(self):
     h2o.connection().close()
     scalaStopMethod = getattr(self._jhc,
                               "org$apache$spark$h2o$H2OContext$$stop")
     scalaStopMethod(
         False, False, False
     )  # stopSpark = False, stopJVM = False, inShutdownHook = False
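Note: both stop() variants above reach a private Scala method through py4j via getattr, using the JVM-mangled name (the pkg$Class$$method form), since py4j only exposes such methods under their mangled names; the package prefix differs between Sparkling Water generations (ai.h2o.sparkling vs. org.apache.spark.h2o).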
Example #4
    def test_frame_reload(self):
        name_node = pyunit_utils.hadoop_namenode()
        work_dir = "hdfs://%s%s" % (name_node, utils.get_workdir())
        dataset = "/datasets/iris_wheader.csv"

        ntrees_opts = [100, 120, 130, 140]
        learn_rate_opts = [0.01, 0.02, 0.03, 0.04]
        grid_size = len(ntrees_opts) * len(learn_rate_opts)
        print("max models %s" % grid_size)
        grid_id = "grid_ft_resume"
        hyper_parameters = {
            "learn_rate": learn_rate_opts,
            "ntrees": ntrees_opts
        }

        cluster_1_name = "grid1-py"
        try:
            cluster_1 = utils.start_cluster(cluster_1_name)
            h2o.connect(url=cluster_1)
            train = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
            grid = H2OGridSearch(H2OGradientBoostingEstimator,
                                 grid_id=grid_id,
                                 hyper_params=hyper_parameters,
                                 recovery_dir=work_dir)
            print("starting initial grid and sleeping...")
            grid.start(x=list(range(4)), y=4, training_frame=train)
            grid_in_progress = None
            times_waited = 0
            while (times_waited < 20) and (grid_in_progress is None or len(
                    grid_in_progress.model_ids) == 0):
                time.sleep(5)  # give it time to train some models
                times_waited += 1
                try:
                    grid_in_progress = h2o.get_grid(grid_id)
                except IndexError:
                    print("no models trained yet")
            print("done sleeping")
            h2o.connection().close()
        finally:
            utils.stop_cluster(cluster_1_name)

        cluster_2_name = "grid2-py"
        try:
            cluster_2 = utils.start_cluster(cluster_2_name)
            h2o.connect(url=cluster_2)
            loaded = h2o.load_grid("%s/%s" % (work_dir, grid_id),
                                   load_params_references=True)
            print("models after first run:")
            for x in sorted(loaded.model_ids):
                print(x)
            loaded.resume()
            print("models after second run:")
            for x in sorted(loaded.model_ids):
                print(x)
            print("Newly grained grid has %d models" % len(loaded.model_ids))
            self.assertEqual(len(loaded.model_ids), grid_size,
                             "The full grid was not trained.")
            h2o.connection().close()
        finally:
            utils.stop_cluster(cluster_2_name)
Example #5
    def test_frame_reload(self):
        name_node = utils.hadoop_namenode()
        work_dir = utils.get_workdir()
        dataset = "/datasets/mnist/train.csv.gz"

        saver_cluster_name = "saver-py"
        try:
            cluster_1 = utils.start_cluster(saver_cluster_name)
            h2o.connect(url=cluster_1)
            df_orig = h2o.import_file(path="hdfs://%s%s" %
                                      (name_node, dataset))
            df_key = df_orig.key
            df_pd_orig = df_orig.as_data_frame()
            df_orig.save(work_dir)
            h2o.connection().close()
        finally:
            utils.stop_cluster(saver_cluster_name)

        loader_cluster_name = "loader-py"
        try:
            cluster_2 = utils.start_cluster(loader_cluster_name)
            h2o.connect(url=cluster_2)
            df_loaded = h2o.load_frame(df_key, work_dir)
            df_pd_loaded = df_loaded.as_data_frame()
            h2o.connection().close()
        finally:
            utils.stop_cluster(loader_cluster_name)

        self.assertTrue(df_pd_orig.equals(df_pd_loaded))
Example #6
    def stop(self):
        h2o.connection().close()
        hc = self._jhc.h2oContext()
        scalaStopMethod = getattr(hc, "org$apache$spark$h2o$H2OContext$$stop")
        scalaStopMethod(
            False, False, False
        )  # stopSpark = False, stopJVM = False, inShutdownHook = False

        if self._conf.get("spark.ext.h2o.rest.api.based.client",
                          "false") == "false":
            sys.exit()
Example #7
def test_isna():
    nan = float("nan")
    frame = h2o.H2OFrame.from_python(OrderedDict([
        ("A", [1, 0, 3, 4, 8, 4, 7]),
        ("B", [2, nan, -1, nan, nan, 9, 0]),
        ("C", ["one", "", "two", "", "seventeen", "1", ""]),
        ("D", ["oneteen", "", "twoteen", "", "sixteen", "twenteen", ""])
    ]), na_strings=[""], column_types={"C": "enum", "D": "string"})

    assert frame.shape == (7, 4)
    assert frame.names == ["A", "B", "C", "D"]
    assert frame.types == {
        "A": "int",
        "B": "int",
        "C": "enum",
        "D": "string"
    }, "Actual types: %r" % frame.types

    isna = frame.isna()
    rc = h2o.connection().requests_count
    assert isna.shape == (7, 4)
    assert isna.names == ["isNA(A)", "isNA(B)", "isNA(C)", "isNA(D)"]
    # at some point we'll switch to 'bool' column type
    assert isna.types == {"isNA(A)": "int", "isNA(B)": "int", "isNA(C)": "int", "isNA(D)": "int"}, \
        "Actual types: %r" % isna.types
    assert h2o.connection().requests_count == rc, "Frame isna should not be evaluated yet!"

    print()
    print(isna)

    assert isna.shape == (7, 4)
    assert isna.names == ["isNA(A)", "isNA(B)", "isNA(C)", "isNA(D)"]
    assert isna.types == {
        "isNA(A)": "int",
        "isNA(B)": "int",
        "isNA(C)": "int",
        "isNA(D)": "int"
    }

    df = isna.as_data_frame(use_pandas=False, header=False)
    assert df == [
        ["0", "0", "0", "0"],
        ["0", "1", "1", "1"],
        ["0", "0", "0", "0"],
        ["0", "1", "1", "1"],
        ["0", "1", "0", "0"],
        ["0", "0", "0", "0"],
        ["0", "0", "1", "1"],
    ]
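The requests_count counter used above is a simple way to confirm that an H2OFrame operation stays lazy. A minimal standalone sketch of the same check (assuming an active connection and an existing H2OFrame fr):

    conn = h2o.connection()
    before = conn.requests_count
    lazy = fr.isna()                       # only builds an expression tree
    assert conn.requests_count == before   # no REST call issued yet
    print(lazy)                            # forces evaluation via REST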
Example #8
    def is_running(self):
        """
        Determine if the H2O cluster is running or not.

        :returns: True if the cluster is up; False otherwise
        """
        try:
            if h2o.connection().local_server and not h2o.connection().local_server.is_running(): return False
            h2o.api("GET /")
            return True
        except (H2OConnectionError, H2OServerError):
            return False
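A short usage sketch for is_running (assuming the cluster handle comes from h2o.cluster(), as in recent h2o versions):

    import h2o

    h2o.init()
    if h2o.cluster().is_running():
        print("cluster is up at", h2o.connection().base_url)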
Example #10
    def show_status(self, detailed=False):
        """
        Print current cluster status information.

        :param detailed: if True, then also print detailed information about each node.
        """
        if self._retrieved_at + self.REFRESH_INTERVAL < time.time():
            # Info is stale, need to refresh
            new_info = h2o.api("GET /3/Cloud")
            self._fill_from_h2ocluster(new_info)
        ncpus = sum(node["num_cpus"] for node in self.nodes)
        allowed_cpus = sum(node["cpus_allowed"] for node in self.nodes)
        free_mem = sum(node["free_mem"] for node in self.nodes)
        unhealthy_nodes = sum(not node["healthy"] for node in self.nodes)
        status = "locked" if self.locked else "accepting new members"
        if unhealthy_nodes == 0:
            status += ", healthy"
        else:
            status += ", %d nodes are not healthy" % unhealthy_nodes
        api_extensions = self.list_api_extensions()
        H2ODisplay([
            ["H2O cluster uptime:",        get_human_readable_time(self.cloud_uptime_millis)],
            ["H2O cluster timezone:",      self.cloud_internal_timezone],
            ["H2O data parsing timezone:", self.datafile_parser_timezone],
            ["H2O cluster version:",       self.version],
            ["H2O cluster version age:",   "{} {}".format(self.build_age, ("!!!" if self.build_too_old else ""))],
            ["H2O cluster name:",          self.cloud_name],
            ["H2O cluster total nodes:",   self.cloud_size],
            ["H2O cluster free memory:",   get_human_readable_bytes(free_mem)],
            ["H2O cluster total cores:",   str(ncpus)],
            ["H2O cluster allowed cores:", str(allowed_cpus)],
            ["H2O cluster status:",        status],
            ["H2O connection url:",        h2o.connection().base_url],
            ["H2O connection proxy:",      h2o.connection().proxy],
            ["H2O internal security:",     self.internal_security_enabled],
            ["H2O API Extensions:",        ', '.join(api_extensions)],
            ["Python version:",            "%d.%d.%d %s" % tuple(sys.version_info[:4])],
        ])

        if detailed:
            keys = ["h2o", "healthy", "last_ping", "num_cpus", "sys_load", "mem_value_size", "free_mem", "pojo_mem",
                    "swap_mem", "free_disk", "max_disk", "pid", "num_keys", "tcps_active", "open_fds", "rpcs_active"]
            header = ["Nodes info:"] + ["Node %d" % (i + 1) for i in range(len(self.nodes))]
            table = [[k] for k in keys]
            for node in self.nodes:
                for i, k in enumerate(keys):
                    table[i].append(node[k])
            H2ODisplay(table=table, header=header)
Example #11
def h2oconnection():
    """
    Python API test: h2o.connection()
    """
    # call with no arguments
    temp = h2o.connection()
    assert_is_type(temp, H2OConnection)
Example #12
def redirect_relative():
    if os.environ.get('JOB_NAME') and os.environ.get('JOB_NAME').startswith(
            "h2o-3-kerberos-smoke-pipeline/"):
        h2o.log_and_echo(
            "Skipping test 'redirect_relative' on Kerberos pipeline (it is not configured with form_auth)"
        )
        return

    conn = h2o.connection()

    # get default requests arguments
    req_args = conn._request_args()
    headers = req_args["headers"]
    headers["User-Agent"] = "Mozilla/pyunit"

    # invalidate authentication
    req_args["auth"] = None
    req_args["headers"] = headers

    response_flow = requests.request("GET",
                                     conn._base_url + "/flow/index.html",
                                     allow_redirects=False,
                                     **req_args)
    print(response_flow)
    assert response_flow.status_code in [302, 303]
    assert response_flow.headers["location"].startswith("/login")
Example #13
    def _get_ast_str(self, top):
        if not self._cache.is_empty():  # Data already computed and cached; could be a "false-like" cached value
            return str(self._cache._data) if self._cache.is_scalar() else self._cache._id
        if self._cache._id is not None:
            return self._cache._id  # Data already computed under ID, but not cached
        assert isinstance(self._children, tuple)
        exec_str = "({} {})".format(
            self._op,
            " ".join([ExprNode._arg_to_expr(ast) for ast in self._children]))

        def is_ast_expr(ref):
            return isinstance(ref, list) and any(
                map(lambda r: isinstance(r, ASTId), ref))

        referrers = gc.get_referrers(self)
        # removing frames from the referrers to get a consistent behaviour across Py versions
        #  as stack frames don't appear in the referrers from Py 3.7.
        # also removing the AST expressions built in astfun.py
        #  as they keep a reference to self if the lambda itself is using a free variable.
        proper_ref = [
            r for r in referrers if not (inspect.isframe(r) or is_ast_expr(r))
        ]
        ref_cnt = len(proper_ref)
        del referrers, proper_ref
        # if this self node is referenced by at least one other node (nested expr), then create a tmp frame
        if top or ref_cnt > 1:
            self._cache._id = _py_tmp_key(append=h2o.connection().session_id)
            exec_str = "(tmp= {} {})".format(self._cache._id, exec_str)
        return exec_str
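The referrer filtering above compensates for gc.get_referrers() returning stack frames (and, in this codebase, AST wrapper lists) alongside genuine referencing objects. A tiny illustration of the filtering idea, independent of h2o:

    import gc
    import inspect

    x = [1, 2, 3]
    y = [x]  # one genuine referrer
    refs = [r for r in gc.get_referrers(x) if not inspect.isframe(r)]
    assert y in refs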
Example #15
    def rapids(expr):
        """
        Execute a Rapids expression.

        :param expr: The rapids expression (ascii string).

        :returns: The JSON response (as a python dictionary) of the Rapids execution
        """
        return h2o.api("POST /99/Rapids", data={"ast": expr, "session_id": h2o.connection().session_id})
Example #17
    def show_status(self, detailed=False):
        """
        Print current cluster status information.

        :param detailed: if True, then also print detailed information about each node.
        """
        if self._retrieved_at + self.REFRESH_INTERVAL < time.time():
            # Info is stale, need to refresh
            new_info = h2o.api("GET /3/Cloud")
            self._fill_from_h2ocluster(new_info)
        ncpus = sum(node["num_cpus"] for node in self.nodes)
        allowed_cpus = sum(node["cpus_allowed"] for node in self.nodes)
        free_mem = sum(node["free_mem"] for node in self.nodes)
        unhealthy_nodes = sum(not node["healthy"] for node in self.nodes)
        status = "locked" if self.locked else "accepting new members"
        if unhealthy_nodes == 0:
            status += ", healthy"
        else:
            status += ", %d nodes are not healthy" % unhealthy_nodes
        H2ODisplay([
            ["H2O cluster uptime:",        get_human_readable_time(self.cloud_uptime_millis)],
            ["H2O cluster version:",       self.version],
            ["H2O cluster version age:",   "{} {}".format(self.build_age, ("!!!" if self.build_too_old else ""))],
            ["H2O cluster name:",          self.cloud_name],
            ["H2O cluster total nodes:",   self.cloud_size],
            ["H2O cluster free memory:",   get_human_readable_bytes(free_mem)],
            ["H2O cluster total cores:",   str(ncpus)],
            ["H2O cluster allowed cores:", str(allowed_cpus)],
            ["H2O cluster status:",        status],
            ["H2O connection url:",        h2o.connection().base_url],
            ["H2O connection proxy:",      h2o.connection().proxy],
            ["H2O internal security:",     self.internal_security_enabled],
            ["Python version:",            "%d.%d.%d %s" % tuple(sys.version_info[:4])],
        ])

        if detailed:
            keys = ["h2o", "healthy", "last_ping", "num_cpus", "sys_load", "mem_value_size", "free_mem", "pojo_mem",
                    "swap_mem", "free_disk", "max_disk", "pid", "num_keys", "tcps_active", "open_fds", "rpcs_active"]
            header = ["Nodes info:"] + ["Node %d" % (i + 1) for i in range(len(self.nodes))]
            table = [[k] for k in keys]
            for node in self.nodes:
                for i, k in enumerate(keys):
                    table[i].append(node[k])
            H2ODisplay(table=table, header=header)
Example #18
def h2oconnection():
    """
    Python API test: h2o.connection()
    """
    # call with no arguments
    try:
        temp = h2o.connection()
        assert_is_type(temp, H2OConnection)
    except Exception as e:
        assert False, "h2o.connection() command is not working: %s" % e
Example #19
    def shutdown(self, prompt=False):
        """
        Shut down the server.

        This method checks whether the H2O cluster is still running, and if it is, shuts it down (via a REST API call).

        :param prompt: A logical value indicating whether to prompt the user before shutting down the H2O server.
        """
        if not self.is_running(): return
        assert_is_type(prompt, bool)
        if prompt:
            question = "Are you sure you want to shutdown the H2O instance running at %s (Y/N)? " \
                       % h2o.connection().base_url
            response = input(question)  # works in Py2 & Py3 because redefined in h2o.utils.compatibility module
        else:
            response = "Y"
        if response.lower() in {"y", "yes"}:
            h2o.api("POST /3/Shutdown")
            h2o.connection().close()
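Usage sketch for shutdown (assuming the cluster handle comes from h2o.cluster(); note that shutting down is irreversible for that JVM):

    import h2o

    h2o.init()
    h2o.cluster().shutdown(prompt=True)  # asks for confirmation before POST /3/Shutdown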
Example #21
 def _do_it(self, top):
     if not self._cache.is_empty():  # Data already computed and cached; could be a "false-like" cached value
         return str(self._cache._data) if self._cache.is_scalar() else self._cache._id
     if self._cache._id is not None: return self._cache._id  # Data already computed under ID, but not cached
     # assert isinstance(self._children,tuple)
     exec_str = "({} {})".format(self._op, " ".join([ExprNode._arg_to_expr(ast) for ast in self._children]))
     gc_ref_cnt = len(gc.get_referrers(self))
     if top or gc_ref_cnt >= ExprNode.MAGIC_REF_COUNT:
         self._cache._id = _py_tmp_key(append=h2o.connection().session_id)
         exec_str = "(tmp= {} {})".format(self._cache._id, exec_str)
     return exec_str
Example #22
 def _get_ast_str(self, top):
     if not self._cache.is_empty():  # Data already computed and cached; could be a "false-like" cached value
         return str(self._cache._data) if self._cache.is_scalar() else self._cache._id
     if self._cache._id is not None:
         return self._cache._id  # Data already computed under ID, but not cached
     # assert isinstance(self._children,tuple)
     exec_str = "({} {})".format(self._op, " ".join([ExprNode._arg_to_expr(ast) for ast in self._children]))
     gc_ref_cnt = len(gc.get_referrers(self))
     if top or gc_ref_cnt >= ExprNode.MAGIC_REF_COUNT:
         self._cache._id = _py_tmp_key(append=h2o.connection().session_id)
         exec_str = "(tmp= {} {})".format(self._cache._id, exec_str)
     return exec_str
Example #23
def test_isna():
    nan = float("nan")
    frame = h2o.H2OFrame.from_python(OrderedDict([
        ("A", [1, 0, 3, 4, 8, 4, 7]),
        ("B", [2, nan, -1, nan, nan, 9, 0]),
        ("C", ["one", "", "two", "", "seventeen", "1", ""]),
        ("D", ["oneteen", "", "twoteen", "", "sixteen", "twenteen", ""])
    ]), na_strings=[""], column_types={"C": "enum", "D": "string"})

    assert frame.shape == (7, 4)
    assert frame.names == ["A", "B", "C", "D"]
    assert frame.types == {"A": "int", "B": "int", "C": "enum", "D": "string"}, "Actual types: %r" % frame.types

    isna = frame.isna()
    rc = h2o.connection().requests_count
    assert isna.shape == (7, 4)
    assert isna.names == ["isNA(A)", "isNA(B)", "isNA(C)", "isNA(D)"]
    # at some point we'll switch to 'bool' column type
    assert isna.types == {"isNA(A)": "int", "isNA(B)": "int", "isNA(C)": "int", "isNA(D)": "int"}, \
        "Actual types: %r" % isna.types
    assert h2o.connection().requests_count == rc, "Frame isna should not be evaluated yet!"

    print()
    print(isna)

    assert isna.shape == (7, 4)
    assert isna.names == ["isNA(A)", "isNA(B)", "isNA(C)", "isNA(D)"]
    assert isna.types == {"isNA(A)": "int", "isNA(B)": "int", "isNA(C)": "int", "isNA(D)": "int"}

    df = isna.as_data_frame(use_pandas=False, header=False)
    assert df == [
        ["0", "0", "0", "0"],
        ["0", "1", "1", "1"],
        ["0", "0", "0", "0"],
        ["0", "1", "1", "1"],
        ["0", "1", "0", "0"],
        ["0", "0", "0", "0"],
        ["0", "0", "1", "1"],
    ]
Example #24
def connect_invalid():
    current = h2o.connection()

    print(current._auth)

    if current._auth is None:
        print("Skipping test running in non-authenticated environment")
        return

    invalid = copy.copy(current)

    # Not invalid yet - first do sanity check that original connection can be used to make a request
    invalid.request("GET /3/About")

    # fall back to "jenkins" for CI testing when auth is not a (user, password) tuple
    auth_user = invalid._auth[0] if isinstance(invalid._auth, tuple) else "jenkins"
    invalid._auth = (auth_user, "invalid-password")

    # Test with invalid basic auth
    err = None
    try:
        invalid._auth = ("invalid-user", "invalid-password")
        invalid.request("GET /3/About")
    except H2OServerError as e:
        err = e

    assert err is not None

    msg = str(err.args[0])
    print("<Error message>")
    print(msg)
    print("</Error Message>")

    assert msg.startswith("HTTP 401")  # Unauthorized

    # Test without any auth
    err = None
    try:
        invalid._auth = None
        invalid.request("GET /3/About")
    except H2OServerError as e:
        err = e

    assert err is not None

    msg = str(err.args[0])
    print("<Error message>")
    print(msg)
    print("</Error Message>")

    assert msg.startswith("HTTP 401")  # Unauthorized
Example #25
def test_import_from_long_urls():
    prostate_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
    prostate = h2o.import_file(path=prostate_path)

    padding = "x" * 512

    # get the data from H2O to avoid making calls to public internet
    conn = h2o.connection()
    url = conn._base_url + "/3/DownloadDataset?frame_id=%s&hex_string=false&padding=" % prostate.frame_id + padding

    prostate_from_self = h2o.import_file(url)

    assert_frame_equal(prostate_from_self.as_data_frame(),
                       prostate.as_data_frame())
Example #26
 def to_pojo(self, pojo_name="", path="", get_jar=True):
     if pojo_name == "": pojo_name = "AssemblyPOJO_" + str(uuid.uuid4())
     java = h2o.api("GET /99/Assembly.java/%s/%s" % (self.id, pojo_name))
     file_path = path + "/" + pojo_name + ".java"
     if path == "":
         print(java)
     else:
         with open(file_path, 'w', encoding="utf-8") as f:
             f.write(java)  # this had better be utf-8 ?
     if get_jar and path != "":
         url = h2o.connection().make_url("h2o-genmodel.jar")
         filename = path + "/" + "h2o-genmodel.jar"
          response = urlopen()(url)  # urlopen() here is h2o's Py2/Py3 compatibility wrapper returning the actual urlopen function
         with open(filename, "wb") as f:
             f.write(response.read())
Example #28
    def __init__(self, date=None, n_training_days=20, validation=True):
    
        ## initialize h2o if not already running
        if h2o.connection() is None:
            h2o.init()
        
        ## default value of date is today
        if date is None:
            date = str(dt.now()).split()[0]
        
        ## validation frame
        self.valid=None
        if validation:
            self.valid = h2o.import_file('data/rotoguru-'+date+'.csv')

        ## make list of csv files and import as training
        train_files = []
        self.dt_gameday = dt.strptime(date, '%Y-%m-%d')
        for it in range(1,n_training_days+1):
            tmp_date = str((self.dt_gameday-td(days=it))).split()[0]
            train_files.append('data/rotoguru-'+tmp_date+'.csv')

        self.train = h2o.import_file(train_files)

        ## output variable
        self.response = 'Fan Points'

        ## recordclass for vars_dict
        self.Var = recordclass('Var', 'include is_cat')
        V = self.Var ## for readability/to avoid overflow

        self.var_dict = defaultdict(lambda: V(0,1), {
            ## rotoguru
            'Date':       V(0,1), 'GID':       V(0,1),
            'Pos':        V(1,1), 'Name':      V(1,1),
            'Starter':    V(0,1), 'FD Pts':    V(0,0),
            'FD Salary':  V(1,0), 'Team':      V(1,1),
            'H/A':        V(1,1), 'Oppt':      V(1,1),
            'Team Score': V(0,0), 'Oppt Score':V(0,0),
            'Minutes':    V(0,0), 'Stat line': V(0,1),
            ## derived
            'Fan Points': V(0,0), 
            'Assists':    V(0,0), 'Rebounds':  V(0,0),
            'Blocks':     V(0,0), 'Points':    V(0,0),
            'Steals':     V(0,0), 'Turnovers': V(0,0),
        })
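The connection check at the top of __init__ is a common "attach or start" pattern. A reusable sketch (ensure_h2o is a hypothetical helper name):

    import h2o

    def ensure_h2o(url=None):
        # Start (or attach to) an H2O cluster only if no connection exists yet.
        if h2o.connection() is None:
            h2o.init(url=url)
        return h2o.connection()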
Example #29
def check_strict():
    # We may be either connected to an existing h2o server, or not. If we are, then discover the connection settings
    # so that we don't have to start a new server (starting a new server may not be possible if h2o.jar is located
    # somewhere unknown on the system).
    hc = h2o.connection()
    url = None
    if hc is not None:
        url = hc.base_url

    out = {"version_check_called": False}
    def tracefunc(frame, event, arg):
        if frame.f_code.co_name == "version_check":
            out["version_check_called"] = True
        return None
    sys.settrace(tracefunc)
    try:
        h2o.init(url=url)
    except H2OConnectionError:
        pass

    assert out["version_check_called"], \
        "Strict version checking got turned off! TURN IT BACK ON NOW YOU JERK!"
Example #31
    def predict(self, test_data, predict_proba = False, pred_class_and_proba = False):
        """ Use pred_class_and_proba to produce both predicted probabilities and predicted classes.
            If this is a regression problem, predict_proba and pred_class_and_proba are disregarded.
            Label column should not be present in test_data.
            
            Returns: Tuple (y_pred, y_prob, inference_time) where any element may be None.
            y_prob is a 2D numpy array of predicted probabilities, where each column represents a class. The ith column represents the class found via: self.classes[i]
        """
        h2o_model = self.model
        if self.problem_type == REGRESSION:
            pred_class_and_proba = False
            predict_proba = False
        y_pred = None
        y_prob = None
        t0 = time.time()
        test = h2o.H2OFrame(test_data)
        preds_df = h2o_model.predict(test).as_data_frame(use_pandas=True)
        t1 = time.time()
        predict_time = t1 - t0
        if self.problem_type is not REGRESSION:
            self.classes = preds_df.columns.tolist()[1:]
            if self.problem_type in [BINARY, MULTICLASS]:
                self.classes = self.remove_label_prefix_class(self.classes)

        if (not predict_proba) or pred_class_and_proba:
            y_pred = preds_df.iloc[:, 0]
            # print(y_pred[:5])
            if self.problem_type in [BINARY, MULTICLASS]:
                y_pred = pd.Series(self.remove_label_prefix_class(list(y_pred.values)), index=y_pred.index)
            # print(y_pred[:5])

        if predict_proba or pred_class_and_proba:
            y_prob = preds_df.iloc[:, 1:].values
        
        # Shutdown H2O before returning value:
        if h2o.connection():
            h2o.remove_all()
            h2o.connection().close()
        if h2o.connection() and h2o.connection().local_server:
            h2o.connection().local_server.shutdown()
        return (y_pred, y_prob, predict_time)
Example #32
    def test_auto_recovery(self):
        name_node = pyunit_utils.hadoop_namenode()
        dataset = "/datasets/iris_wheader.csv"

        ntrees_opts = [100, 120, 130, 140]
        learn_rate_opts = [0.01, 0.02, 0.03, 0.04]
        grid_size = len(ntrees_opts) * len(learn_rate_opts)
        print("max models %s" % grid_size)
        grid_id = "grid_ft_auto_recover"
        hyper_parameters = {
            "learn_rate": learn_rate_opts,
            "ntrees": ntrees_opts
        }

        cluster_1_name = "grid-auto-1-py"
        try:
            cluster_1 = utils.start_cluster(cluster_1_name,
                                            enable_auto_recovery=True,
                                            clean_auto_recovery=True)
            print("initial cluster started at %s" % cluster_1)
            h2o.connect(url=cluster_1)
            train = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
            grid = H2OGridSearch(H2OGradientBoostingEstimator,
                                 grid_id=grid_id,
                                 hyper_params=hyper_parameters)
            bg_train_thread = threading.Thread(target=self._training_thread,
                                               kwargs={
                                                   "grid": grid,
                                                   "train": train
                                               })
            bg_train_thread.start()
            phase_1_models = self._wait_for_model_to_build(grid_id)
            self._print_models("Initial models", phase_1_models)
            assert len(phase_1_models) > 0
            self._check_training_error()
        finally:
            utils.stop_cluster(cluster_1_name)

        cluster_2_name = "grid-auto-2-py"
        try:
            cluster_2 = utils.start_cluster(cluster_2_name,
                                            enable_auto_recovery=True)
            print("cluster resumed at %s, should unblock background thread" %
                  cluster_2)
            phase_2_models = self._wait_for_model_to_build(
                grid_id,
                len(phase_1_models) + 1)
            self._print_models("Recovery #1 models", phase_2_models)
            assert len(phase_2_models) > len(phase_1_models)
            self._check_training_error()
        finally:
            utils.stop_cluster(cluster_2_name)

        cluster_3_name = "grid-auto-3-py"
        try:
            cluster_3 = utils.start_cluster(cluster_3_name,
                                            enable_auto_recovery=True)
            print("cluster resumed at %s, waiting for training to finish" %
                  cluster_3)
            bg_train_thread.join()
            print("models after final run:")
            for x in sorted(grid.model_ids):
                print(x)
            print("Finished grained grid has %d models" % len(grid.model_ids))
            self.assertEqual(len(grid.model_ids), grid_size,
                             "The full grid was not trained.")
            self._check_training_error()
            h2o.connection().close()
        finally:
            utils.stop_cluster(cluster_3_name)
Example #33
def date_munge():
    crimes_path = pyunit_utils.locate("smalldata/chicago/chicagoCrimes10k.csv.zip")
    # crimes_path = "smalldata/chicago/chicagoCrimes10k.csv.zip"

    hc = h2o.connection()
    tmps0 = pyunit_utils.temp_ctr()

    # GET /3/ImportFiles
    # POST /3/ParseSetup
    # POST /3/Parse
    # GET /3/Job/{job_id}  (multiple times)
    # GET /3/Frames/crimes
    crimes = h2o.import_file(path=crimes_path, destination_frame="crimes")

    rest1 = hc.requests_count

    crimes["Day"] = crimes["Date"].day()
    crimes["Month"] = crimes["Date"].month() + 1    # Since H2O indexes from 0
    crimes["Year"] = crimes["Date"].year() + 1900  # Start of epoch is 1900
    crimes["WeekNum"] = crimes["Date"].week()
    crimes["WeekDay"] = crimes["Date"].dayOfWeek()
    crimes["HourOfDay"] = crimes["Date"].hour()
    print("# of REST calls used: %d" % (hc.requests_count - rest1))

    crimes["Weekend"] = (crimes["WeekDay"] == "Sun") | (crimes["WeekDay"] == "Sat")
    print("# of REST calls used: %d" % (hc.requests_count - rest1))

    crimes["Season"] = crimes["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
    print("# of REST calls used: %d" % (hc.requests_count - rest1))

    crimes = crimes.drop("Date")
    print("# of REST calls used: %d" % (hc.requests_count - rest1))

    # POST /4/sessions
    # POST /99/Rapids  {ast:(tmp= py8 (cols (append
    #                        (tmp= py7 (append
    #                         (tmp= py6 (append
    #                          (tmp= py5 (append
    #                           (tmp= py4 (append
    #                            (tmp= py3 (:=
    #                             (tmp= py2 (append
    #                              (tmp= py1 (append crimes (day (cols_py chicagoCrimes10k.hex "Date")) "Day")
    #                               ) (+ (month (cols_py py1 "Date")) 1) "Month"))
    #                                 (+ (year (cols_py py2 "Date")) 1900) 17 []))
    #                                    (week (cols_py py3 "Date")) "WeekNum"))
    #                                    (dayOfWeek (cols_py py4 "Date")) "WeekDay"))
    #                                    (hour (cols_py py5 "Date")) "HourOfDay"))
    #                                 (| (== (cols_py py6 "WeekDay") "Sun")
    #                                    (== (cols_py py6 "WeekDay") "Sat")) "Weekend"))
    #                         (cut (cols_py py7 "Month") [0 2 5 7 10 12]
    #                           ["Winter" "Spring" "Summer" "Autumn" "Winter"] FALSE TRUE 3) "Season") -3))}
    # GET /3/Frames/py8
    crimes.describe()
    print("# of REST calls used: %d" % (hc.requests_count - rest1))

    ntmps = pyunit_utils.temp_ctr() - tmps0
    nrest = pyunit_utils.rest_ctr() - rest1
    print("Number of temps used: %d" % ntmps)
    print("Number of RESTs used: %d" % nrest)
    assert ntmps == 8
    assert nrest == 3
Example #34
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.",
                    config.metric)

    try:
        training_params = {
            k: v
            for k, v in config.framework_params.items()
            if not k.startswith('_')
        }
        nthreads = config.framework_params.get('_nthreads', config.cores)

        log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads,
                 config.max_mem_size_mb)
        h2o.init(
            nthreads=nthreads,
            min_mem_size=str(config.max_mem_size_mb) + "M",
            max_mem_size=str(config.max_mem_size_mb) + "M",
            # log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold))
        )

        # Load train as an H2O Frame, but test as a Pandas DataFrame
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path,
                                destination_frame=frame_name('train', config))
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path,
                               destination_frame=frame_name('test', config))
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name,
                 config.fold)
        log.debug(
            "Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
            config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError(
                "H2O could not produce any model in the requested time.")

        save_predictions(aml, test, dataset=dataset, config=config)
        save_artifacts(aml, dataset=dataset, config=config)

        return dict(models_count=len(aml.leaderboard),
                    training_duration=training.duration)

    finally:
        if h2o.connection():
            h2o.remove_all()
            h2o.connection().close()
        if h2o.connection() and h2o.connection().local_server:
            h2o.connection().local_server.shutdown()
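The teardown in the finally block above (repeated in Examples #35, #36, and #39) needs to guard each step, since h2o.connection() may be None when init failed. A consolidated sketch (shutdown_h2o is a hypothetical helper name):

    import h2o

    def shutdown_h2o(clean=True):
        conn = h2o.connection()
        if conn is None:
            return
        if clean:
            h2o.remove_all()  # drop all keys before closing
        conn.close()
        if conn.local_server:
            conn.local_server.shutdown()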
Example #35
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           r2='r2',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.",
                    config.metric)

    try:
        training_params = {
            k: v
            for k, v in config.framework_params.items()
            if not k.startswith('_')
        }
        nthreads = config.framework_params.get('_nthreads', config.cores)
        jvm_memory = str(round(config.max_mem_size_mb * 2 / 3)) + "M"  # leaving 1/3rd of available memory for XGBoost

        log.info("Starting H2O cluster with %s cores, %s memory.", nthreads,
                 jvm_memory)
        max_port_range = 49151
        min_port_range = 1024
        rnd_port = os.getpid() % (max_port_range - min_port_range) + min_port_range
        port = config.framework_params.get('_port', rnd_port)

        h2o.init(
            nthreads=nthreads,
            port=port,
            min_mem_size=jvm_memory,
            max_mem_size=jvm_memory,
            strict_version_check=config.framework_params.get(
                '_strict_version_check', True)
            # log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold))
        )

        # Load train as an H2O Frame, but test as a Pandas DataFrame
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path,
                                destination_frame=frame_name('train', config))
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path,
                               destination_frame=frame_name('test', config))
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name,
                 config.fold)
        log.debug(
            "Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
            config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(
            max_runtime_secs=config.max_runtime_seconds,
            max_runtime_secs_per_model=round(config.max_runtime_seconds / 2),  # to prevent timeout on ensembles
            sort_metric=sort_metric,
            seed=config.seed,
            **training_params)

        monitor = (
            BackendMemoryMonitoring(
                frequency_seconds=rconfig().monitoring.frequency_seconds,
                check_on_exit=True,
                verbosity=rconfig().monitoring.verbosity)
            if config.framework_params.get('_monitor_backend', False)
            # else contextlib.nullcontext  # Py 3.7+ only
            else contextlib.contextmanager(iter)([0]))
        with Timer() as training:
            with monitor:
                aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError(
                "H2O could not produce any model in the requested time.")

        save_predictions(aml, test, dataset=dataset, config=config)
        save_artifacts(aml, dataset=dataset, config=config)

        return dict(models_count=len(aml.leaderboard),
                    training_duration=training.duration)

    finally:
        if h2o.connection():
            # h2o.remove_all()
            h2o.connection().close()
        if h2o.connection() and h2o.connection().local_server:
            h2o.connection().local_server.shutdown()
Example #36
def run(dataset: Dataset, config: TaskConfig):
    log.info("\n**** H2O AutoML ****\n")
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(
        acc='mean_per_class_error',
        auc='AUC',
        logloss='logloss',
        mae='mae',
        mse='mse',
        rmse='rmse',
        rmsle='rmsle'
    )
    sort_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.", config.metric)

    try:
        training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
        nthreads = config.framework_params.get('_nthreads', config.cores)

        log.info("Starting H2O cluster with %s cores, %sMB memory.", nthreads, config.max_mem_size_mb)
        h2o.init(nthreads=nthreads,
                 min_mem_size=str(config.max_mem_size_mb)+"M",
                 max_mem_size=str(config.max_mem_size_mb)+"M",
                 log_dir=os.path.join(config.output_dir, 'logs', config.name, str(config.fold)))

        # Load train as an H2O Frame, but test as a Pandas DataFrame
        log.debug("Loading train data from %s.", dataset.train.path)
        train = h2o.import_file(dataset.train.path)
        # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name, config.fold)
        log.debug("Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
                  config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        with Timer() as training:
            aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise NoResultError("H2O could not produce any model in the requested time.")

        lb = aml.leaderboard.as_data_frame()
        log.debug("Leaderboard:\n%s", lb.to_string())
        lbf = split_path(config.output_predictions_file)
        lbf.extension = '.leaderboard.csv'
        lbf = path_from_split(lbf)
        write_csv(lb, lbf)

        h2o_preds = aml.predict(test).as_data_frame(use_pandas=False)
        preds = to_data_frame(h2o_preds[1:], columns=h2o_preds[0])
        y_pred = preds.iloc[:, 0]

        h2o_truth = test[:, dataset.target.index].as_data_frame(use_pandas=False, header=False)
        y_truth = to_data_frame(h2o_truth)

        predictions = y_pred.values
        probabilities = preds.iloc[:, 1:].values
        truth = y_truth.values

        save_predictions_to_file(dataset=dataset,
                                 output_file=config.output_predictions_file,
                                 probabilities=probabilities,
                                 predictions=predictions,
                                 truth=truth)

        return dict(
            models_count=len(aml.leaderboard),
            training_duration=training.duration
        )

    finally:
        if h2o.connection():
            h2o.remove_all()
            h2o.connection().close()
        if h2o.connection() and h2o.connection().local_server:
            h2o.connection().local_server.shutdown()
Example #37
 def h2oinitfunc(self):
     h2o.init()
     if str(h2o.connection()) == "<H2OConnection to http://localhost:54321, no session>":
         self.lineEdit_PM_h2o_response.setText("Connection to H2O cluster Successful")
     else:
         self.lineEdit_PM_h2o_response.setText("Connection to H2O cluster Failed")
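Comparing str(h2o.connection()) against a hard-coded repr is fragile (the URL and session text vary). A sturdier sketch of the same check, assuming h2o.cluster() is available:

    import h2o

    h2o.init()
    if h2o.connection() is not None and h2o.cluster().is_running():
        status = "Connection to H2O cluster Successful"
    else:
        status = "Connection to H2O cluster Failed"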
Example #38
def date_munge():
    crimes_path = pyunit_utils.locate(
        "smalldata/chicago/chicagoCrimes10k.csv.zip")
    # crimes_path = "smalldata/chicago/chicagoCrimes10k.csv.zip"

    hc = h2o.connection()
    tmps0 = pyunit_utils.temp_ctr()

    # GET /3/ImportFiles
    # POST /3/ParseSetup
    # POST /3/Parse
    # GET /3/Job/{job_id}  (multiple times)
    # GET /3/Frames/crimes
    crimes = h2o.import_file(path=crimes_path, destination_frame="crimes")

    rest1 = hc.requests_count

    crimes["Day"] = crimes["Date"].day()
    crimes["Month"] = crimes["Date"].month() + 1  # Since H2O indexes from 0
    crimes["Year"] = crimes["Date"].year() + 1900  # Start of epoch is 1900
    crimes["WeekNum"] = crimes["Date"].week()
    crimes["WeekDay"] = crimes["Date"].dayOfWeek()
    crimes["HourOfDay"] = crimes["Date"].hour()
    print("# of REST calls used: %d" % (hc.requests_count - rest1))

    crimes["Weekend"] = (crimes["WeekDay"] == "Sun") | (crimes["WeekDay"]
                                                        == "Sat")
    print("# of REST calls used: %d" % (hc.requests_count - rest1))

    crimes["Season"] = crimes["Month"].cut(
        [0, 2, 5, 7, 10, 12],
        ["Winter", "Spring", "Summer", "Autumn", "Winter"])
    print("# of REST calls used: %d" % (hc.requests_count - rest1))

    crimes = crimes.drop("Date")
    print("# of REST calls used: %d" % (hc.requests_count - rest1))

    # POST /4/sessions
    # POST /99/Rapids  {ast:(tmp= py8 (cols (append
    #                        (tmp= py7 (append
    #                         (tmp= py6 (append
    #                          (tmp= py5 (append
    #                           (tmp= py4 (append
    #                            (tmp= py3 (:=
    #                             (tmp= py2 (append
    #                              (tmp= py1 (append crimes (day (cols_py chicagoCrimes10k.hex "Date")) "Day")
    #                               ) (+ (month (cols_py py1 "Date")) 1) "Month"))
    #                                 (+ (year (cols_py py2 "Date")) 1900) 17 []))
    #                                    (week (cols_py py3 "Date")) "WeekNum"))
    #                                    (dayOfWeek (cols_py py4 "Date")) "WeekDay"))
    #                                    (hour (cols_py py5 "Date")) "HourOfDay"))
    #                                 (| (== (cols_py py6 "WeekDay") "Sun")
    #                                    (== (cols_py py6 "WeekDay") "Sat")) "Weekend"))
    #                         (cut (cols_py py7 "Month") [0 2 5 7 10 12]
    #                           ["Winter" "Spring" "Summer" "Autumn" "Winter"] FALSE TRUE 3) "Season") -3))}
    # GET /3/Frames/py8
    crimes.describe()
    print("# of REST calls used: %d" % (hc.requests_count - rest1))

    ntmps = pyunit_utils.temp_ctr() - tmps0
    nrest = pyunit_utils.rest_ctr() - rest1
    print("Number of temps used: %d" % ntmps)
    print("Number of RESTs used: %d" % nrest)
    assert ntmps == 8
    assert nrest == 3
Example #39
def run(dataset, config):
    log.info(f"\n**** H2O AutoML [v{h2o.__version__}] ****\n")
    save_metadata(config, version=h2o.__version__)
    # Mapping of benchmark metrics to H2O metrics
    metrics_mapping = dict(acc='mean_per_class_error',
                           auc='AUC',
                           logloss='logloss',
                           mae='mae',
                           mse='mse',
                           r2='r2',
                           rmse='rmse',
                           rmsle='rmsle')
    sort_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
    if sort_metric is None:
        # TODO: Figure out if we are going to blindly pass metrics through, or if we use a strict mapping
        log.warning("Performance metric %s not supported, defaulting to AUTO.",
                    config.metric)

    try:
        training_params = {
            k: v
            for k, v in config.framework_params.items()
            if not k.startswith('_')
        }
        nthreads = config.framework_params.get('_nthreads', config.cores)
        jvm_memory = str(round(config.max_mem_size_mb * 2 / 3)) + "M"  # leaving 1/3rd of available memory for XGBoost

        log.info("Starting H2O cluster with %s cores, %s memory.", nthreads,
                 jvm_memory)
        max_port_range = 49151
        min_port_range = 1024
        rnd_port = os.getpid() % (max_port_range - min_port_range) + min_port_range
        port = config.framework_params.get('_port', rnd_port)

        init_params = config.framework_params.get('_init', {})
        if "logs" in config.framework_params.get('_save_artifacts', []):
            init_params['ice_root'] = output_subdir("logs", config)

        h2o.init(nthreads=nthreads,
                 port=port,
                 min_mem_size=jvm_memory,
                 max_mem_size=jvm_memory,
                 **init_params)

        import_kwargs = {}
        # Load train as an H2O Frame, but test as a Pandas DataFrame
        log.debug("Loading train data from %s.", dataset.train.path)
        train = None
        # Versions before 3.32.0.3 may fail to parse some rare arff files that use single
        # quotes as enum/string delimiters (pandas also fails on the same datasets).
        if version.parse(h2o.__version__) >= version.parse("3.32.0.3"):
            import_kwargs['quotechar'] = '"'
            train = h2o.import_file(dataset.train.path,
                                    destination_frame=frame_name(
                                        'train', config),
                                    **import_kwargs)
            if not verify_loaded_frame(train, dataset):
                h2o.remove(train)
                train = None
                import_kwargs['quotechar'] = "'"

        if not train:
            train = h2o.import_file(dataset.train.path,
                                    destination_frame=frame_name(
                                        'train', config),
                                    **import_kwargs)
            # train.impute(method='mean')
        log.debug("Loading test data from %s.", dataset.test.path)
        test = h2o.import_file(dataset.test.path,
                               destination_frame=frame_name('test', config),
                               **import_kwargs)
        # test.impute(method='mean')

        log.info("Running model on task %s, fold %s.", config.name,
                 config.fold)
        log.debug(
            "Running H2O AutoML with a maximum time of %ss on %s core(s), optimizing %s.",
            config.max_runtime_seconds, config.cores, sort_metric)

        aml = H2OAutoML(max_runtime_secs=config.max_runtime_seconds,
                        sort_metric=sort_metric,
                        seed=config.seed,
                        **training_params)

        monitor = (
            BackendMemoryMonitoring(
                frequency_seconds=config.ext.monitoring.frequency_seconds,
                check_on_exit=True,
                verbosity=config.ext.monitoring.verbosity)
            if config.framework_params.get('_monitor_backend', False)
            # else contextlib.nullcontext  # Py 3.7+ only
            else contextlib.contextmanager(iter)([0]))
        with utils.Timer() as training:
            with monitor:
                aml.train(y=dataset.target.index, training_frame=train)

        if not aml.leader:
            raise FrameworkError(
                "H2O could not produce any model in the requested time.")

        with utils.Timer() as predict:
            preds = aml.predict(test)

        preds = extract_preds(preds, test, dataset=dataset)
        save_artifacts(aml, dataset=dataset, config=config)

        return result(output_file=config.output_predictions_file,
                      predictions=preds.predictions,
                      truth=preds.truth,
                      probabilities=preds.probabilities,
                      probabilities_labels=preds.probabilities_labels,
                      models_count=len(aml.leaderboard),
                      training_duration=training.duration,
                      predict_duration=predict.duration)

    finally:
        if h2o.connection():
            # h2o.remove_all()
            h2o.connection().close()
        if h2o.connection() and h2o.connection().local_server:
            h2o.connection().local_server.shutdown()