def save_model(model, dir="", name="", filename="", force=False): """ Save an H2O Model Object to Disk. In the case of existing files force = TRUE will overwrite the file. Otherwise, the operation will fail. :param dir: string indicating the directory the model will be written to. :param name: string name of the file. :param filename: full path to the file. :param force: logical, indicates how to deal with files that already exist :return: the path of the model (string) """ if not isinstance(dir, str): raise ValueError("`dir` must be a character string") if dir == "": dir = os.getcwd() if not isinstance(name, str): raise ValueError("`name` must be a character string") if name == "": name = model._model_json["model_id"]["name"] if not isinstance(filename, str): raise ValueError("`filename` must be a character string") if not isinstance(force, bool): raise ValueError("`force` must be True or False") path = filename if filename != "" else os.path.join(dir, name) kwargs = dict([("dir", path), ("force", int(force)), ("_rest_version", 99)]) H2OConnection.get("Models.bin/" + model._model_json["model_id"]["name"], **kwargs) return path
def download_all_logs(dirname=".", filename=None): """ Download H2O Log Files to Disk :param dirname: (Optional) A character string indicating the directory that the log file should be saved in. :param filename: (Optional) A string indicating the name that the CSV file should be :return: path of logs written (as a string) """ url = 'http://' + H2OConnection.ip() + ':' + str( H2OConnection.port()) + '/Logs/download' response = urllib2.urlopen(url) if not os.path.exists(dirname): os.mkdir(dirname) if filename == None: for h in response.headers.headers: if 'filename=' in h: filename = h.split("filename=")[1].strip() break path = os.path.join(dirname, filename) with open(path, 'w') as f: response = urllib2.urlopen(url) f.write(response.read()) f.close() print "Writing H2O logs to " + path return path
def remove(key):
    """
    Remove a key from H2O.

    :param key: The key pointing to the object to be removed.
    :return: void
    """
    # DELETE /Remove with the key carried in the request payload.
    H2OConnection.delete("Remove", {"key": key})
def remove_all():
    """
    Remove all objects from H2O.

    :return: None
    """
    # DELETE the entire distributed key/value store on the cluster.
    H2OConnection.delete("DKV")
def _resolve_model(future_model, **kwargs):
    """
    Block until the model-build job finishes, then fetch the model JSON and
    wrap it in the model class matching its category.

    :param future_model: an H2OModelFuture whose job is polled to completion.
    :param kwargs: may contain `_rest_version` to pin the REST API version.
    :return: a typed model wrapper.
    :raises NotImplementedError: for an unrecognized model category.
    """
    future_model.poll()
    endpoint = "Models/" + future_model.job.dest_key
    if "_rest_version" in kwargs:
        model_json = H2OConnection.get_json(endpoint, _rest_version=kwargs["_rest_version"])["models"][0]
    else:
        model_json = H2OConnection.get_json(endpoint)["models"][0]
    # Dispatch table: model category -> wrapper class.
    wrappers = {
        "Binomial": H2OBinomialModel,
        "Clustering": H2OClusteringModel,
        "Regression": H2ORegressionModel,
        "Multinomial": H2OMultinomialModel,
        "AutoEncoder": H2OAutoEncoderModel,
        "DimReduction": H2ODimReductionModel,
    }
    model_type = model_json["output"]["model_category"]
    if model_type not in wrappers:
        raise NotImplementedError(model_type)
    return wrappers[model_type](future_model.job.dest_key, model_json)
def _resolve_model(future_model, **kwargs):
    """
    Block until the model-build job finishes, then fetch and wrap the model
    in the class matching its reported category.
    """
    # Wait for the build job to complete on the cluster.
    future_model.poll()
    # `_rest_version` lets the caller pin a specific REST API version.
    if '_rest_version' in kwargs.keys():
        model_json = H2OConnection.get_json(
            "Models/" + future_model.job.dest_key,
            _rest_version=kwargs['_rest_version'])["models"][0]
    else:
        model_json = H2OConnection.get_json(
            "Models/" + future_model.job.dest_key)["models"][0]
    # Wrap the raw JSON in the model class for its category.
    model_type = model_json["output"]["model_category"]
    if model_type == "Binomial":
        model = H2OBinomialModel(future_model.job.dest_key, model_json)
    elif model_type == "Clustering":
        model = H2OClusteringModel(future_model.job.dest_key, model_json)
    elif model_type == "Regression":
        model = H2ORegressionModel(future_model.job.dest_key, model_json)
    elif model_type == "Multinomial":
        model = H2OMultinomialModel(future_model.job.dest_key, model_json)
    elif model_type == "AutoEncoder":
        model = H2OAutoEncoderModel(future_model.job.dest_key, model_json)
    elif model_type == "DimReduction":
        model = H2ODimReductionModel(future_model.job.dest_key, model_json)
    else:
        raise NotImplementedError(model_type)
    return model
def download_all_logs(dirname=".",filename=None):
    """
    Download H2O Log Files to Disk

    :param dirname: (Optional) A character string indicating the directory that the log file should be saved in.
    :param filename: (Optional) A string indicating the name that the CSV file should be
    :return: path of logs written (as a string)
    """
    url = 'http://' + H2OConnection.ip() + ':' + str(H2OConnection.port()) + '/Logs/download'
    response = urllib2.urlopen(url)
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    if filename == None:  # NOTE(review): `is None` would be the idiomatic test
        # Derive the file name from the Content-Disposition header line.
        for h in response.headers.headers:
            if 'filename=' in h:
                filename = h.split("filename=")[1].strip()
                break
    path = os.path.join(dirname,filename)
    with open(path, 'w') as f:
        # NOTE(review): the URL is fetched a second time here; the first
        # response (opened above only for its headers) could be reused.
        response = urllib2.urlopen(url)
        f.write(response.read())
        f.close()  # NOTE(review): redundant -- the `with` block already closes f
    print "Writing H2O logs to " + path
    return path
def cluster_info():
    """
    Display the current H2O cluster information.

    :return: None
    """
    # Delegates to the connection object, which prints the cluster summary.
    H2OConnection._cluster_info()
def _model_build(x,y,validation_x,validation_y,algo_url,kwargs):
    """
    Send the training/validation frames to the cluster, run the model-build
    job to completion, wrap the result by category, and clean up the
    temporary frames.
    """
    # Basic sanity checking
    if algo_url == "autoencoder":
        if "autoencoder" in kwargs.keys():
            if kwargs["autoencoder"]:
                if y:
                    raise ValueError("`y` should not be specified for autoencoder, remove `y` input.")
                # Autoencoders are served by the deeplearning endpoint.
                algo_url="deeplearning"
    if not x: raise ValueError("Missing features")
    x = _check_frame(x,y,y)
    if validation_x:
        validation_x = _check_frame(validation_x,validation_y,y)

    # Send frame descriptions to H2O cluster
    train_key = x.send_frame()
    kwargs['training_frame']=train_key
    if validation_x is not None:
        valid_key = validation_x.send_frame()
        kwargs['validation_frame']=valid_key

    if y:
        kwargs['response_column']=y._name

    # Drop builder parameters the caller left unset (None).
    kwargs = dict([(k, kwargs[k]) for k in kwargs if kwargs[k] is not None])

    # launch the job and poll
    job = H2OJob(H2OConnection.post_json("ModelBuilders/"+algo_url, **kwargs), job_type=(algo_url+" Model Build")).poll()
    model_json = H2OConnection.get_json("Models/"+job.dest_key)["models"][0]
    model_type = model_json["output"]["model_category"]
    # Local imports avoid a circular dependency with the model classes.
    if model_type=="Binomial":
        from model.binomial import H2OBinomialModel
        model = H2OBinomialModel(job.dest_key,model_json)
    elif model_type=="Clustering":
        from model.clustering import H2OClusteringModel
        model = H2OClusteringModel(job.dest_key,model_json)
    elif model_type=="Regression":
        from model.regression import H2ORegressionModel
        model = H2ORegressionModel(job.dest_key,model_json)
    elif model_type=="Multinomial":
        from model.multinomial import H2OMultinomialModel
        model = H2OMultinomialModel(job.dest_key,model_json)
    elif model_type=="AutoEncoder":
        from model.autoencoder import H2OAutoEncoderModel
        model = H2OAutoEncoderModel(job.dest_key,model_json)
    else:
        print model_type
        raise NotImplementedError
    # Cleanup: the temporary training/validation frames are no longer needed.
    h2o.remove(train_key)
    if validation_x:
        h2o.remove(valid_key)
    return model
def _model_build(x,y,validation_x,validation_y,algo_url,kwargs):
    """
    Send the training/validation frames to the cluster, run the model-build
    job to completion, wrap the result by category, and delete the
    temporary frames.
    """
    # Basic sanity checking
    if algo_url == "autoencoder":
        if "autoencoder" in kwargs.keys():
            if kwargs["autoencoder"]:
                if y:
                    raise ValueError("`y` should not be specified for autoencoder, remove `y` input.")
                # Autoencoders are served by the deeplearning endpoint.
                algo_url="deeplearning"
    if not x: raise ValueError("Missing features")
    x = _check_frame(x,y,y)
    if validation_x:
        validation_x = _check_frame(validation_x,validation_y,y)

    # Send frame descriptions to H2O cluster
    train_key = x.send_frame()
    kwargs['training_frame']=train_key
    if validation_x is not None:
        valid_key = validation_x.send_frame()
        kwargs['validation_frame']=valid_key

    if y:
        kwargs['response_column']=y._name

    # Drop builder parameters the caller left unset (None).
    kwargs = dict([(k, kwargs[k]) for k in kwargs if kwargs[k] is not None])

    # launch the job and poll
    job = H2OJob(H2OConnection.post_json("ModelBuilders/"+algo_url, **kwargs), job_type=(algo_url+" Model Build")).poll()
    model_json = H2OConnection.get_json("Models/"+job.dest_key)["models"][0]
    model_type = model_json["output"]["model_category"]
    # Local imports avoid a circular dependency with the model classes.
    if model_type=="Binomial":
        from model.binomial import H2OBinomialModel
        model = H2OBinomialModel(job.dest_key,model_json)
    elif model_type=="Clustering":
        from model.clustering import H2OClusteringModel
        model = H2OClusteringModel(job.dest_key,model_json)
    elif model_type=="Regression":
        from model.regression import H2ORegressionModel
        model = H2ORegressionModel(job.dest_key,model_json)
    elif model_type=="Multinomial":
        from model.multinomial import H2OMultinomialModel
        model = H2OMultinomialModel(job.dest_key,model_json)
    elif model_type=="AutoEncoder":
        from model.autoencoder import H2OAutoEncoderModel
        model = H2OAutoEncoderModel(job.dest_key,model_json)
    else:
        print model_type
        raise NotImplementedError
    # Cleanup: delete the temporary training/validation frames.
    h2o.delete(train_key)
    if validation_x:
        h2o.delete(valid_key)
    return model
def remove(key):
    """
    Remove key from H2O.

    :param key: The key pointing to the object to be removed.
    :return: Void
    """
    # DELETE /Remove with the key passed as a query parameter.
    H2OConnection.delete("Remove", key=key)
def _upload_raw_data(self, tmp_file_path, column_names):
    """
    Push a local temporary file to the cluster and parse it into this frame.

    :param tmp_file_path: path of the local file holding the raw data.
    :param column_names: column names to apply when parsing.
    :return: None
    """
    # Normalized local path for the multipart upload.
    upload_info = {"file": os.path.abspath(tmp_file_path)}
    # Random destination name for the uploaded data.
    destination = H2OFrame.py_tmp_key()
    # The POST blocks but is "fast" (it does not perform the real data upload).
    H2OConnection.post_json("PostFile", upload_info, destination_frame=destination)
    # Parse the uploaded data and populate self._vecs.
    self._handle_text_key(destination, column_names)
def export_file(frame, path, force=False):
    """
    Export a given H2OFrame to a path on the machine this python session is
    currently connected to. To view the current session, call h2o.cluster_info().

    :param frame: The Frame to save to disk.
    :param path: The path to the save point on disk.
    :param force: Overwrite any preexisting file with the same path
    :return: None
    """
    overwrite = "true" if force else "false"
    H2OConnection.get_json("Frames/{0}/export/{1}/overwrite/{2}".format(frame._id, path, overwrite))
def shutdown(conn=None, prompt=True):
    """
    Shut down the specified instance. All data will be lost.

    This method checks if H2O is running at the specified IP address and port,
    and if it is, shuts down that H2O instance.

    :param conn: An H2OConnection object containing the IP address and port of the server running H2O.
    :param prompt: A logical value indicating whether to prompt the user before shutting down the H2O server.
    :return: None
    """
    if conn is None:  # FIX: identity comparison with None (was `== None`)
        conn = H2OConnection.current_connection()
    H2OConnection._shutdown(conn=conn, prompt=prompt)
def _as_data_frame(id, use_pandas):
    """
    Download frame `id` from the cluster as CSV and convert it locally.

    :param id: the frame id to download (URL-quoted into the request).
    :param use_pandas: if True return a pandas.DataFrame, else a list of CSV rows.
    :return: a pandas.DataFrame or a list of rows.
    """
    url = "http://{0}:{1}/3/DownloadDataset?frame_id={2}&hex_string=false".format(
        H2OConnection.ip(), str(H2OConnection.port()), urllib.quote(id))
    response = urllib2.urlopen(url)
    if use_pandas:
        # Deferred import: pandas is only required on this branch.
        import pandas
        return pandas.read_csv(response, low_memory=False)
    return list(csv.reader(response))
def remove(key):
    """
    Remove key from H2O.

    :param key: The key pointing to the object to be removed.
    :return: Void
    :raises ValueError: if key is None.
    """
    if key is not None:
        H2OConnection.delete("DKV/%s" % key)
    else:
        raise ValueError("remove with no key is not supported, for your protection")
def remove(object):
    """
    Remove object from H2O. This is a "hard" delete of the object and removes
    all subparts.

    :param object: The object pointing to the object to be removed.
    :return: None
    :raises ValueError: if object is None.
    """
    if object is None:
        raise ValueError("remove with no object is not supported, for your protection")
    # Resolve the DKV key from either an H2OFrame or a plain string id.
    if isinstance(object, H2OFrame):
        target = object._id
    elif isinstance(object, str):
        target = object
    else:
        target = None
    if target is not None:
        H2OConnection.delete("DKV/" + target)
def upload_file(path, destination_frame=""):
    """
    Upload a dataset at the path given from the local machine to the H2O cluster.

    :param path: A path specifying the location of the data to upload.
    :param destination_frame: The name of the H2O Frame in the H2O Cluster.
    :return: A new H2OFrame
    """
    # Auto-generate a temporary key when the caller did not supply one.
    if destination_frame == "":
        destination_frame = H2OFrame.py_tmp_key()
    upload_info = {"file": os.path.abspath(path)}
    H2OConnection.post_json(url_suffix="PostFile",
                            file_upload_info=upload_info,
                            destination_frame=destination_frame)
    return H2OFrame(text_key=destination_frame)
def upload_file(path, destination_key=""):
    """
    Upload a dataset at the path given from the local machine to the H2O cluster.

    :param path: A path specifying the location of the data to upload.
    :param destination_key: The name of the H2O Frame in the H2O Cluster.
    :return: A new H2OFrame
    """
    # Normalized local path for the multipart upload.
    fui = {"file": os.path.abspath(path)}
    # Auto-generate a temporary key when the caller did not supply one.
    dest_key = H2OFrame.py_tmp_key() if destination_key == "" else destination_key
    H2OConnection.post_json(url_suffix="PostFile", file_upload_info=fui,destination_key=dest_key)
    return H2OFrame(text_key=dest_key)
def export_file(frame, path, force=False):
    """
    Export a given H2OFrame to a path on the machine this python session is
    currently connected to. To view the current session, call h2o.cluster_info().

    :param frame: The Frame to save to disk.
    :param path: The path to the save point on disk.
    :param force: Overwrite any preexisting file with the same path
    :return: None
    """
    # Materialize the frame on the cluster and get its key.
    frame_key = H2OFrame.send_frame(frame)
    overwrite = "true" if force else "false"
    H2OConnection.get_json("Frames/{0}/export/{1}/overwrite/{2}".format(str(frame_key), path, overwrite))
def log_and_echo(message):
    """
    Log a message on the server-side logs.

    This is helpful when running several pieces of work one after the other on a
    single H2O cluster and you want to make a notation in the H2O server side log
    where one piece of work ends and the next piece of work begins.

    Sends a message to H2O for logging. Generally used for debugging purposes.

    :param message: A character string with the message to write to the log.
    :return: None
    """
    # A None message is logged as the empty string.
    H2OConnection.post_json("LogAndEcho", message="" if message is None else message)
def _as_data_frame(id, use_pandas):
    """
    Download frame `id` from the cluster as CSV and convert it locally.

    :param id: the frame id to download (URL-quoted into the request).
    :param use_pandas: if True return a pandas.DataFrame, else a list of CSV rows.
    """
    url = 'http://' + H2OConnection.ip() + ':' + str(
        H2OConnection.port()) + "/3/DownloadDataset?frame_id=" + urllib.quote(
            id) + "&hex_string=false"
    response = urllib2.urlopen(url)
    if use_pandas:
        # Deferred import: pandas is only required on this branch.
        import pandas
        return pandas.read_csv(response, low_memory=False)
    else:
        cr = csv.reader(response)
        rows = []
        for row in cr:
            rows.append(row)
        return rows
def rapids(expr):
    """
    Fire off a Rapids expression

    :param expr: The rapids expression (ascii string)
    :return: The JSON response of the Rapids execution.
    """
    # URL-encode the AST since it travels as a request parameter.
    return H2OConnection.post_json(url_suffix="Rapids", params={"ast": urllib.quote(expr)})
def _upload_raw_data(self, tmp_file_path, column_names):
    """
    Upload a local temporary file to the cluster and parse it into this frame.

    :param tmp_file_path: path of the local file to upload.
    :param column_names: column names to apply when parsing.
    :return: None
    """
    # file upload info is the normalized path to a local file
    fui = {"file": os.path.abspath(tmp_file_path)}
    # create a random name for the data
    dest_key = H2OFrame.py_tmp_key()
    # params to the URL are the destination key that was just made in the prev step.
    p = {'destination_key': dest_key}
    # do the POST -- blocking, and "fast" (does not perform the real data upload)
    H2OConnection.post_json(url_suffix="PostFile", params=p, file_upload_info=fui)
    # actually parse the data and setup self._vecs
    self._handle_raw_fname(dest_key, column_names=column_names)
def cluster_status():
    """
    TODO: This isn't really a cluster status... it's a node status check for the
    node we're connected to. This is possibly confusing because this can come
    back without warning, but if a user tries to do any remoteSend, they will
    get a "cloud sick warning"

    Retrieve information on the status of the cluster running H2O.

    :return: None
    """
    cluster_json = H2OConnection.get_json("Cloud?skip_ticks=true")
    print "Version: {0}".format(cluster_json['version'])
    print "Cloud name: {0}".format(cluster_json['cloud_name'])
    print "Cloud size: {0}".format(cluster_json['cloud_size'])
    if cluster_json['locked']:
        print "Cloud is locked\n"
    else:
        print "Accepting new members\n"
    if cluster_json['nodes'] == None or len(cluster_json['nodes']) == 0:
        print "No nodes found"
        return
    status = []
    for node in cluster_json['nodes']:
        # Only surface the interesting health/telemetry fields for each node.
        for k, v in zip(node.keys(),node.values()):
            if k in ["h2o", "healthy", "last_ping", "num_cpus", "sys_load", "mem_value_size", "total_value_size", "free_mem", "tot_mem", "max_mem", "free_disk", "max_disk", "pid", "num_keys", "tcps_active", "open_fds", "rpcs_active"]:
                status.append(k+": {0}".format(v))
        print ', '.join(status)
        print
def _model_build(x, y, vx, vy, algo, offsets, weights, fold_column, kwargs):
    """
    Validate the frames/columns, launch the model-build job and, unless
    `do_future` is set in kwargs, block until the model resolves.

    :param x: training features frame.
    :param y: response column (or None).
    :param vx: validation features frame (or None).
    :param vy: validation response (or None).
    :param algo: the ModelBuilders endpoint suffix (algorithm name).
    :param offsets: optional offset column folded into both frames.
    :param weights: optional weights column folded into both frames.
    :param fold_column: optional fold column folded into both frames.
    :param kwargs: remaining builder parameters; None values are dropped.
    :return: an H2OModelFuture if `do_future`, else the resolved model.
    """
    if x is None:
        raise ValueError("Missing features")
    x = _check_frame(x, y, y)
    vx = _check_frame(vx, vy, y)
    # Fold the special columns into both training and validation frames.
    if offsets is not None:
        x, vx = _check_col(x, vx, kwargs["validation_frame"], offsets)
    if weights is not None:
        x, vx = _check_col(x, vx, kwargs["validation_frame"], weights)
    if fold_column is not None:
        x, vx = _check_col(x, vx, kwargs["validation_frame"], fold_column)
    kwargs["training_frame"] = x._id
    if vx is not None:
        kwargs["validation_frame"] = vx._id
    if y is not None:
        kwargs["response_column"] = y._col_names[0]
    # Replace H2OFrame-valued params with their ids and drop unset params.
    kwargs = dict(
        [
            (k, kwargs[k]._frame()._id if isinstance(kwargs[k], H2OFrame) else kwargs[k])
            for k in kwargs
            if kwargs[k] is not None
        ]
    )
    do_future = kwargs.pop("do_future") if "do_future" in kwargs else False
    future_model = H2OModelFuture(
        H2OJob(H2OConnection.post_json("ModelBuilders/" + algo, **kwargs), job_type=(algo + " Model Build")),
        x
    )
    return future_model if do_future else _resolve_model(future_model, **kwargs)
def frames():
    """
    Retrieve all the Frames.

    :return: Meta information on the frames
    """
    # GET /Frames returns metadata for every frame in the cluster.
    return H2OConnection.get_json("Frames")
def interaction(data, factors, pairwise, max_factors, min_occurrence, destination_frame=None):
    """
    Categorical Interaction Feature Creation in H2O.

    Creates a frame in H2O with n-th order interaction features between
    categorical columns, as specified by the user.

    :param data: the H2OFrame that holds the target categorical columns.
    :param factors: factors Factor columns (either indices or column names).
    :param pairwise: Whether to create pairwise interactions between factors
        (otherwise create one higher-order interaction). Only applicable if
        there are 3 or more factors.
    :param max_factors: Max. number of factor levels in pair-wise interaction
        terms (if enforced, one extra catch-all factor will be made)
    :param min_occurrence: Min. occurrence threshold for factor levels in
        pair-wise interaction terms
    :param destination_frame: A string indicating the destination key. If
        empty, this will be auto-generated by H2O.
    :return: H2OFrame
    """
    data._eager()
    # Accept both column indices and column names; normalize to names.
    factor_names = [data.names()[f] if isinstance(f, int) else f for f in factors]
    dest = _py_tmp_key() if destination_frame is None else destination_frame
    payload = {
        "dest": dest,
        "source_frame": data._id,
        "factor_columns": [_quoted(name) for name in factor_names],
        "pairwise": pairwise,
        "max_factors": max_factors,
        "min_occurrence": min_occurrence,
    }
    H2OJob(H2OConnection.post_json("Interaction", **payload), "Interactions").poll()
    return get_frame(dest)
def frame(key):
    """
    Retrieve metadata for a key that points to a Frame.

    :param key: A pointer to a Frame in H2O.
    :return: Meta information on the Frame.
    """
    # NOTE(review): `key` is not URL-quoted here, unlike the frame_id variant.
    return H2OConnection.get_json(url_suffix="Frames/" + key)
def _model_build(x, y, vx, vy, algo, offsets, weights, fold_column, kwargs):
    """
    Validate the frames/columns, launch the model-build job and, unless
    `do_future` is set in kwargs, block until the model resolves.
    """
    if x is None:
        raise ValueError("Missing features")
    x = _check_frame(x, y, y)
    vx = _check_frame(vx, vy, y)
    # Fold the special columns into both training and validation frames.
    if offsets is not None:
        x, vx = _check_col(x, vx, kwargs["validation_frame"], offsets)
    if weights is not None:
        x, vx = _check_col(x, vx, kwargs["validation_frame"], weights)
    if fold_column is not None:
        x, vx = _check_col(x, vx, kwargs["validation_frame"], fold_column)
    kwargs['training_frame'] = x.frame_id
    if vx is not None:
        kwargs['validation_frame'] = vx.frame_id
    if y is not None:
        kwargs['response_column'] = y.names[0]
    # Replace H2OFrame-valued params with their ids and drop unset params.
    kwargs = dict([
        (k, kwargs[k].frame_id if isinstance(kwargs[k], H2OFrame) else kwargs[k])
        for k in kwargs
        if kwargs[k] is not None
    ])
    do_future = kwargs.pop("do_future") if "do_future" in kwargs else False
    future_model = H2OModelFuture(
        H2OJob(H2OConnection.post_json("ModelBuilders/" + algo, **kwargs), job_type=(algo + " Model Build")), x)
    return future_model if do_future else _resolve_model(
        future_model, **kwargs)
def get_model(model_id):
    """
    Return the specified model

    :param model_id: The model identification in h2o
    """
    model_json = H2OConnection.get_json("Models/"+model_id)["models"][0]
    model_type = model_json["output"]["model_category"]
    # Local imports avoid a circular dependency with the model classes.
    if model_type=="Binomial":
        from model.binomial import H2OBinomialModel
        model = H2OBinomialModel(model_id, model_json)
    elif model_type=="Clustering":
        from model.clustering import H2OClusteringModel
        model = H2OClusteringModel(model_id, model_json)
    elif model_type=="Regression":
        from model.regression import H2ORegressionModel
        model = H2ORegressionModel(model_id, model_json)
    elif model_type=="Multinomial":
        from model.multinomial import H2OMultinomialModel
        model = H2OMultinomialModel(model_id, model_json)
    elif model_type=="AutoEncoder":
        from model.autoencoder import H2OAutoEncoderModel
        model = H2OAutoEncoderModel(model_id, model_json)
    else:
        print model_type
        raise NotImplementedError
    return model
def init(ip="localhost", port=54321, size=1, start_h2o=False, enable_assertions=False,
         license=None, max_mem_size_GB=None, min_mem_size_GB=None, ice_root=None,
         strict_version_check=True):
    """
    Initiate an H2O connection to the specified ip and port.

    :param ip: An IP address, default is "localhost"
    :param port: A port, default is 54321
    :param size: The expected number of h2o instances (ignored if start_h2o is True)
    :param start_h2o: A boolean dictating whether this module should start the H2O jvm. An attempt is made anyways if _connect fails.
    :param enable_assertions: If start_h2o, pass `-ea` as a VM option.
    :param license: If not None, is a path to a license file.
    :param max_mem_size_GB: Maximum heap size (jvm option Xmx) in gigabytes.
    :param min_mem_size_GB: Minimum heap size (jvm option Xms) in gigabytes.
    :param ice_root: A temporary directory (default location is determined by tempfile.mkdtemp()) to hold H2O log files.
    :param strict_version_check: Forwarded to H2OConnection (presumably enforces client/cluster version agreement -- confirm in H2OConnection).
    :return: None
    """
    # NOTE(review): `size` is accepted but not forwarded to H2OConnection.
    H2OConnection(ip=ip, port=port, start_h2o=start_h2o, enable_assertions=enable_assertions,
                  license=license, max_mem_size_GB=max_mem_size_GB, min_mem_size_GB=min_mem_size_GB,
                  ice_root=ice_root, strict_version_check=strict_version_check)
    return None
def get_model(model_id):
    """
    Return the specified model

    :param model_id: The model identification in h2o
    """
    model_json = H2OConnection.get_json("Models/" + model_id)["models"][0]
    model_type = model_json["output"]["model_category"]
    # Local imports avoid a circular dependency with the model classes.
    if model_type == "Binomial":
        from model.binomial import H2OBinomialModel
        model = H2OBinomialModel(model_id, model_json)
    elif model_type == "Clustering":
        from model.clustering import H2OClusteringModel
        model = H2OClusteringModel(model_id, model_json)
    elif model_type == "Regression":
        from model.regression import H2ORegressionModel
        model = H2ORegressionModel(model_id, model_json)
    elif model_type == "Multinomial":
        from model.multinomial import H2OMultinomialModel
        model = H2OMultinomialModel(model_id, model_json)
    elif model_type == "AutoEncoder":
        from model.autoencoder import H2OAutoEncoderModel
        model = H2OAutoEncoderModel(model_id, model_json)
    else:
        print model_type
        raise NotImplementedError
    return model
def cluster_status():
    """
    TODO: This isn't really a cluster status... it's a node status check for the
    node we're connected to. This is possibly confusing because this can come
    back without warning, but if a user tries to do any remoteSend, they will
    get a "cloud sick warning"

    Retrieve information on the status of the cluster running H2O.

    :return: None
    """
    cluster_json = H2OConnection.get_json("Cloud?skip_ticks=true")
    print "Version: {0}".format(cluster_json['version'])
    print "Cloud name: {0}".format(cluster_json['cloud_name'])
    print "Cloud size: {0}".format(cluster_json['cloud_size'])
    if cluster_json['locked']:
        print "Cloud is locked\n"
    else:
        print "Accepting new members\n"
    if cluster_json['nodes'] == None or len(cluster_json['nodes']) == 0:
        print "No nodes found"
        return
    status = []
    for node in cluster_json['nodes']:
        # Only surface the interesting health/telemetry fields for each node.
        for k, v in zip(node.keys(), node.values()):
            if k in [
                    "h2o", "healthy", "last_ping", "num_cpus", "sys_load",
                    "mem_value_size", "total_value_size", "free_mem", "tot_mem",
                    "max_mem", "free_disk", "max_disk", "pid", "num_keys",
                    "tcps_active", "open_fds", "rpcs_active"
            ]:
                status.append(k + ": {0}".format(v))
        print ', '.join(status)
        print
def _model_build(x,y,validation_x,validation_y,algo_url,kwargs):
    """
    Validate inputs, launch the model-build job and, unless `do_future` is
    set in kwargs, block until the model resolves.
    """
    # Basic sanity checking
    if algo_url == "autoencoder":
        if "autoencoder" in kwargs.keys():
            if kwargs["autoencoder"]:
                if y:
                    raise ValueError("`y` should not be specified for autoencoder, remove `y` input.")
                # Autoencoders are served by the deeplearning endpoint.
                algo_url="deeplearning"
    if not x: raise ValueError("Missing features")
    x = _check_frame(x,y,y)
    if validation_x is not None:
        validation_x = _check_frame(validation_x,validation_y,y)
    # Fold any special columns into both the training and validation frames.
    if "weights_column" in kwargs.keys():
        x, validation_x = _add_col_to_x_and_validation_x(kwargs["weights_column"],x, validation_x, kwargs)
    if "offset_column" in kwargs.keys():
        x, validation_x = _add_col_to_x_and_validation_x(kwargs["offset_column"], x, validation_x, kwargs)
    if "fold_column" in kwargs.keys():
        x, validation_x = _add_col_to_x_and_validation_x(kwargs["fold_column"], x, validation_x, kwargs, xval=True)

    # Send frame descriptions to H2O cluster
    kwargs['training_frame']=x._id
    if validation_x is not None:
        kwargs['validation_frame']=validation_x._id

    if y is not None:
        kwargs['response_column']=y._col_names[0]

    # Replace H2OFrame-valued params with their ids and drop unset params.
    kwargs = dict([(k, kwargs[k]._frame()._id if isinstance(kwargs[k], H2OFrame) else kwargs[k]) for k in kwargs if kwargs[k] is not None])

    # launch the job (only resolve the model if do_future is False)
    do_future = "do_future" in kwargs.keys() and kwargs["do_future"]
    if "do_future" in kwargs.keys():
        kwargs.pop("do_future")
    future_model = H2OModelFuture(H2OJob(H2OConnection.post_json("ModelBuilders/"+algo_url, **kwargs), job_type=(algo_url+" Model Build")), x)
    if do_future:
        return future_model
    else:
        return _resolve_model(future_model, **kwargs)
def download_csv(data, filename):
    """
    Download an H2O data set to a CSV file on the local disk.

    Warning: Files located on the H2O server may be very large! Make sure you
    have enough hard drive space to accommodate the entire file.

    :param data: an H2OFrame object to be downloaded.
    :param filename: A string indicating the name that the CSV file should be saved to.
    :return: None
    :raises ValueError: if `data` is not an H2OFrame.
    """
    if not isinstance(data, H2OFrame):
        # BUG FIX: the original `raise(ValueError, "..." + type(data))` raised
        # a tuple, and the str + type concatenation itself raised TypeError
        # before the raise could even execute. Format the message properly.
        raise ValueError("`data` argument must be an H2OFrame, but got %s" % type(data))
    url = "http://{0}:{1}/3/DownloadDataset?frame_id={2}".format(
        H2OConnection.ip(), H2OConnection.port(), data._id)
    with open(filename, 'w') as f:
        # The redundant f.close() inside the `with` block is dropped; the
        # context manager closes the file.
        f.write(urllib2.urlopen(url).read())
def frame(frame_id):
    """
    Retrieve metadata for a id that points to a Frame.

    :param frame_id: A pointer to a Frame in H2O.
    :return: Meta information on the frame
    """
    # URL-quote the id: frame ids may contain reserved characters.
    return H2OConnection.get_json("Frames/" + urllib.quote(frame_id))
def frame(key):
    """
    Retrieve metadata for a key that points to a Frame.

    :param key: A pointer to a Frame in H2O.
    :return: Meta information on the frame
    """
    # NOTE(review): `key` is not URL-quoted here, unlike the frame_id variant.
    return H2OConnection.get_json("Frames/" + key)
def _model_build(x, y, validation_x, validation_y, algo_url, kwargs):
    """
    Send the training/validation frames to the cluster, run the model-build
    job to completion, wrap the result by category, and clean up the
    temporary frames.
    """
    # Basic sanity checking
    if not x:
        raise ValueError("Missing features")
    x = _check_frame(x, y, y)
    if validation_x:
        validation_x = _check_frame(validation_x, validation_y, y)

    # Send frame descriptions to H2O cluster
    train_key = x.send_frame()
    kwargs['training_frame'] = train_key
    if validation_x:
        valid_key = validation_x.send_frame()
        kwargs['validation_frame'] = valid_key

    if y:
        kwargs['response_column'] = y._name

    # launch the job and poll
    job = H2OJob(H2OConnection.post_json("ModelBuilders/" + algo_url, **kwargs), job_type=(algo_url + " Model Build")).poll()
    model_json = H2OConnection.get_json("Models/" + job.dest_key)["models"][0]
    model_type = model_json["output"]["model_category"]
    # Local imports avoid a circular dependency with the model classes.
    if model_type == "Binomial":
        from model.binomial import H2OBinomialModel
        model = H2OBinomialModel(job.dest_key, model_json)
    elif model_type == "Clustering":
        from model.clustering import H2OClusteringModel
        model = H2OClusteringModel(job.dest_key, model_json)
    elif model_type == "Regression":
        from model.regression import H2ORegressionModel
        model = H2ORegressionModel(job.dest_key, model_json)
    else:
        print model_type
        raise NotImplementedError
    # Cleanup: drop the temporary frames now that the model is built.
    h2o.remove(train_key)
    if validation_x:
        h2o.remove(valid_key)
    return model
def init(ip="localhost", port=54321):
    """
    Initiate an H2O connection to the specified ip and port

    :param ip: An IP address, default is "localhost"
    :param port: A port, default is 54321
    :return: None
    """
    # Constructing the connection object registers it as the current session.
    H2OConnection(ip=ip, port=port)
    return None
def parse_setup(raw_frames, column_types=None):
    """
    Request a ParseSetup for a collection of imported file frames.

    :param raw_frames: A collection of imported file frames
    :param column_types: optional per-column type overrides.
    :return: A ParseSetup "object"
    """
    # The H2O backend only accepts quoted identifiers; a single unicode id is
    # promoted to a one-element list first.
    if isinstance(raw_frames, unicode):
        raw_frames = [raw_frames]
    request = {"url_suffix": "ParseSetup",
               "source_frames": [_quoted(frame_id) for frame_id in raw_frames]}
    if column_types is not None:
        request["column_types"] = [_quoted(ct) for ct in column_types]
    return H2OConnection.post_json(**request)
def get_frame(frame_id):
    """
    Fetch an existing Frame from the cluster and wrap it as an H2OFrame.

    :param frame_id: id of the Frame to fetch; must not be None.
    :return: an H2OFrame backed by the fetched vecs.
    :raises ValueError: if frame_id is None.
    """
    if frame_id is None:
        raise ValueError("frame_id must not be None")
    payload = H2OConnection.get_json("Frames/" + urllib.quote(frame_id))["frames"][0]
    labels = [col["label"] for col in payload["columns"]]
    vecs = H2OVec.new_vecs(zip(labels, payload["vec_ids"]), payload["rows"])
    return H2OFrame(vecs=vecs)
def get_frame(frame_id):
    """
    Fetch an existing Frame from the cluster and wrap it as an H2OFrame.

    :param frame_id: id of the Frame to fetch; must not be None.
    :return: an H2OFrame backed by the fetched vecs.
    :raises ValueError: if frame_id is None.
    """
    if frame_id is None:
        raise ValueError("frame_id must not be None")
    res = H2OConnection.get_json("Frames/"+urllib.quote(frame_id))
    res = res["frames"][0]
    # Pair each column label with its backing vec id.
    colnames = [v["label"] for v in res["columns"]]
    veckeys = res["vec_ids"]
    vecs=H2OVec.new_vecs(zip(colnames, veckeys), res["rows"])
    return H2OFrame(vecs=vecs)
def parse(setup, h2o_name, first_line_is_header=(-1, 0, 1)): """ Trigger a parse; blocking; removeFrame just keep the Vecs. :param setup: The result of calling parse_setup. :param h2o_name: The name of the H2O Frame on the back end. :param first_line_is_header: -1 means data, 0 means guess, 1 means header. :return: A new parsed object """ # Parse parameters (None values provided by setup) p = { 'destination_frame': h2o_name, 'parse_type': None, 'separator': None, 'single_quotes': None, 'check_header': None, 'number_columns': None, 'chunk_size': None, 'delete_on_done': True, 'blocking': True, 'remove_frame': True } if isinstance(first_line_is_header, tuple): first_line_is_header = setup["check_header"] if setup["column_names"]: setup["column_names"] = [ _quoted(name) for name in setup["column_names"] ] p["column_names"] = None if setup["column_types"]: setup["column_types"] = [ _quoted(name) for name in setup["column_types"] ] p["column_types"] = None if setup["na_strings"]: setup["na_strings"] = [[_quoted(na) for na in col] if col is not None else [] for col in setup["na_strings"]] p["na_strings"] = None # update the parse parameters with the parse_setup values p.update({k: v for k, v in setup.iteritems() if k in p}) p["check_header"] = first_line_is_header # Extract only 'name' from each src in the array of srcs p['source_frames'] = [ _quoted(src['name']) for src in setup['source_frames'] ] # Request blocking parse j = H2OJob(H2OConnection.post_json(url_suffix="Parse", **p), "Parse").poll() return j.jobs
def load_model(path):
    """
    Load a saved H2O model from disk.

    :param path: The full path of the H2O Model to be imported.
    :return: the model
    :raises ValueError: if `path` is not a string.
    """
    if not isinstance(path, str):
        raise ValueError("`path` must be a non-empty character string")
    # Idiom: plain dict literal instead of dict([(k, v), ...]).
    kwargs = {"dir": path, "_rest_version": 99}
    res = H2OConnection.post("Models.bin/", **kwargs)
    return get_model(res.json()['models'][0]['model_id']['name'])
def frame_summary(key): """ Retrieve metadata and summary information for a key that points to a Frame/Vec :param key: A pointer to a Frame/Vec in H2O :return: Meta and summary info on the frame """ # frames_meta = H2OConnection.get_json("Frames/" + key) frame_summary = H2OConnection.get_json("Frames/" + urllib.quote(key) + "/summary") return frame_summary
def parse_setup(raw_frames):
    """
    Request a ParseSetup for a collection of imported file frames.

    :param raw_frames: A collection of imported file frames
    :return: A ParseSetup "object"
    """
    # A single unicode id is promoted to a one-element list; the backend
    # additionally requires every identifier to be quoted.
    frames = [raw_frames] if isinstance(raw_frames, unicode) else raw_frames
    quoted = [_quoted(frame_id) for frame_id in frames]
    return H2OConnection.post_json(url_suffix="ParseSetup", source_frames=quoted)
def rapids(expr):
    """
    Fire off a Rapids expression.

    :param expr: The rapids expression (ascii string).
    :return: The JSON response of the Rapids execution
    :raises EnvironmentError: if the server reports an evaluation error.
    """
    result = H2OConnection.post_json("Rapids", ast=urllib.quote(expr))
    error = result['error']
    if error is not None:
        raise EnvironmentError("rapids expression not evaluated: {0}".format(str(error)))
    return result
def parse_setup(rawkey):
    """
    Request a ParseSetup for a collection of imported file keys.

    :param rawkey: A collection of imported file keys
    :return: A ParseSetup "object"
    :raises ValueError: if the returned setup is not valid.
    """
    # The H2O backend only accepts quoted identifiers; a single unicode key is
    # promoted to a one-element list first.
    keys = [rawkey] if isinstance(rawkey, unicode) else rawkey
    setup = H2OConnection.post_json(url_suffix="ParseSetup",
                                    source_keys=[_quoted(k) for k in keys])
    if not setup['is_valid']:
        raise ValueError("ParseSetup not Valid", setup)
    return setup