def _handle_raw_fname(self, raw_fname, column_names=None):
    """
    Parse a raw uploaded key and store the resulting columns on this frame.

    Handles the result of upload_file; part of the H2OFrame constructor.

    :param raw_fname: A raw key
    :param column_names: Optional column names, used only when the parse
        result does not carry any names of its own.
    :return: None. The parsed columns are stored in ``self._vecs``.
    """
    # perform the parse setup
    setup = h2o.parse_setup(raw_fname)

    # blocking parse, first line is always a header (since "we" wrote the data out)
    parse = h2o.parse(setup, H2OFrame.py_tmp_key(), first_line_is_header=1)

    # NOTE(review): the other constructors in this file read 'column_names' and
    # 'vec_ids' from the parse result; confirm which key spelling the server
    # actually returns before relying on this method.
    veckeys = parse['vecKeys']
    rows = parse['rows']

    # a hack to get the column names correct since "parse" does not provide
    # them: names from the parse win, then the caller-supplied names
    cols = parse['columnNames'] or column_names
    if not cols:
        # Neither source produced names; generate C1..Cn so the columns are
        # still built instead of failing (or being dropped) in zip()/len().
        cols = ["C" + str(x) for x in range(1, len(veckeys) + 1)]

    # create a new vec[] array
    self._vecs = H2OVec.new_vecs(zip(cols, veckeys), rows)

    # print some information on the *uploaded* data; the single-argument,
    # parenthesised form prints the same text on Python 2 and Python 3
    print("Uploaded %s into cluster with %s rows and %s cols"
          % (raw_fname, rows, len(cols)))
    print("")
def _handle_text_key(self, text_key, column_names):
    """
    Parse an uploaded text key and store the resulting columns on this frame.

    Handles the result of upload_file; part of the H2OFrame constructor.

    :param text_key: A key pointing to raw text to be parsed
    :param column_names: Column names to fall back on when the parse result
        carries none. They are used only if there is exactly one name per
        parsed column; may be None.
    :return: None. The parsed columns are stored in ``self._vecs``.
    """
    # perform the parse setup
    setup = h2o.parse_setup(text_key)

    # blocking parse, first line is always a header (since "we" wrote the data out)
    parse = h2o.parse(setup, H2OFrame.py_tmp_key(), first_line_is_header=1)

    # set the vector keys and the rows
    veckeys = parse['vec_ids']
    rows = parse['rows']

    # a hack to get the column names correct since "parse" does not provide
    # them: parse result first, then the caller's names, then generated ones
    cols = parse['column_names']
    if not cols and column_names and len(column_names) == len(veckeys):
        # The length check keeps zip() below from silently dropping columns
        # when the caller's list does not line up with the parsed vecs.
        cols = column_names
    if not cols:
        cols = ["C" + str(x) for x in range(1, len(veckeys) + 1)]

    # create a new vec[] array
    self._vecs = H2OVec.new_vecs(zip(cols, veckeys), rows)

    # print some information on the *uploaded* data; the single-argument,
    # parenthesised form prints the same text on Python 2 and Python 3
    print("Uploaded %s into cluster with %s rows and %s cols"
          % (text_key, rows, len(cols)))
def __init__(self, python_obj=None, local_fname=None, remote_fname=None, vecs=None, text_key=None):
    """
    Create a new H2OFrame object by passing a file path or a list of H2OVecs.

    The arguments are checked in this order and the first one that is set
    wins: `python_obj`, `remote_fname`, `local_fname`, `vecs`, `text_key`.

    If `remote_fname` is not None, then a REST call will be made to import the
    data specified at the location `remote_fname`. This path is relative to the
    H2O cluster, NOT the local Python process.

    If `local_fname` is not None, then the data is not imported into the H2O
    cluster at the time of object creation; the CSV file is read into the local
    Python process instead, with its first line taken as the column names.

    If `python_obj` is not None, then an attempt to upload the python object to
    H2O will be made. A valid python object has type `list`, or `dict`.

    For more information on the structure of the input for the various native
    python data types ("native" meaning non-H2O), please see the general
    documentation for this object.

    :param python_obj: A "native" python object - list, dict, tuple.
    :param local_fname: A local path to a data source. Data is python-process-local.
    :param remote_fname: A remote path to a data source. Data is cluster-local.
    :param vecs: A list of H2OVec objects.
    :param text_key: A raw key resulting from an upload_file.
    :return: An instance of an H2OFrame object.
    :raises ValueError: If `vecs` contains something that is not an H2OVec, if
        the vecs differ in length, or if no data source was given at all.
    """
    self.local_fname = local_fname
    self.remote_fname = remote_fname
    self._vecs = None

    # "is not None" avoids the "truth value of an array is ambiguous" err
    if python_obj is not None:
        self._upload_python_object(python_obj)
        return

    # Import the data into H2O cluster
    if remote_fname:
        rawkey = h2o.import_file(remote_fname)
        setup = h2o.parse_setup(rawkey)
        parse = h2o.parse(setup, H2OFrame.py_tmp_key())  # create a new key
        veckeys = parse['vec_ids']
        rows = parse['rows']
        # fall back to generated names C1..Cn when the parse provides none
        cols = parse['column_names']
        if not cols:
            cols = ["C" + str(x) for x in range(1, len(veckeys) + 1)]
        self._vecs = H2OVec.new_vecs(zip(cols, veckeys), rows)
        print("Imported %s into cluster with %s rows and %s cols"
              % (remote_fname, rows, len(cols)))

    # Read data locally into python process
    elif local_fname:
        with open(local_fname, 'rb') as csvfile:
            reader = csv.reader(csvfile)
            # Read the header through the same csv reader as the data rows so
            # quoted names that contain commas are not split apart. An empty
            # file or blank first line still yields one unnamed column.
            header = next(reader, None) or [""]
            self._vecs = [H2OVec(name.rstrip(), Expr([])) for name in header]
            for row in reader:
                for i, data in enumerate(row):
                    self._vecs[i].append(data)
        print("Imported %s into local python process" % (local_fname,))

    # Construct from an array of Vecs already passed in
    elif vecs:
        # Type-check the first element before calling len() on it, so input
        # that is not a Vec raises the documented ValueError.
        if not isinstance(vecs[0], H2OVec):
            raise ValueError("Not a list of Vecs")
        vlen = len(vecs[0])
        for v in vecs:
            if not isinstance(v, H2OVec):
                raise ValueError("Not a list of Vecs")
            if len(v) != vlen:
                raise ValueError("Vecs not the same size: " + str(vlen) + " != " + str(len(v)))
        self._vecs = vecs

    elif text_key:
        self._handle_text_key(text_key, None)

    else:
        raise ValueError("Frame made from CSV file or an array of Vecs only")