Beispiel #1
0
    def _handle_raw_fname(self, raw_fname, column_names=None):
        """
        Handle result of upload_file
        :param raw_fname: A raw key
        :return: Part of the H2OFrame constructor.
        """

        # perform the parse setup
        setup = h2o.parse_setup(raw_fname)

        # blocking parse, first line is always a header (since "we" wrote the data out)
        parse = h2o.parse(setup, H2OFrame.py_tmp_key(), first_line_is_header=1)

        # a hack to get the column names correct since "parse" does not provide them
        if column_names and not parse["columnNames"]:
            cols = column_names
        else:
            cols = parse['columnNames']

        # set the rows
        rows = parse['rows']

        # set the vector keys
        veckeys = parse['vecKeys']

        # create a new vec[] array
        self._vecs = H2OVec.new_vecs(zip(cols, veckeys), rows)

        # print some information on the *uploaded* data
        print "Uploaded", raw_fname, "into cluster with", \
            rows, "rows and", len(cols), "cols"
        print
Beispiel #2
0
 def _handle_text_key(self, text_key, column_names):
   """
   Handle result of upload_file
   :param test_key: A key pointing to raw text to be parsed
   :return: Part of the H2OFrame constructor.
   """
   # perform the parse setup
   setup = h2o.parse_setup(text_key)
   # blocking parse, first line is always a header (since "we" wrote the data out)
   parse = h2o.parse(setup, H2OFrame.py_tmp_key(), first_line_is_header=1)
   # a hack to get the column names correct since "parse" does not provide them
   cols = parse['column_names'] if parse["column_names"] else ["C" + str(x) for x in range(1,len(parse['vec_ids'])+1)]
   # set the rows
   rows = parse['rows']
   # set the vector keys
   veckeys = parse['vec_ids']
   # create a new vec[] array
   self._vecs = H2OVec.new_vecs(zip(cols, veckeys), rows)
   # print some information on the *uploaded* data
   print "Uploaded", text_key, "into cluster with", rows, "rows and", len(cols), "cols"
Beispiel #3
0
  def __init__(self, python_obj=None, local_fname=None, remote_fname=None, vecs=None, text_key=None):
    """
    Create a new H2OFrame object by passing a file path or a list of H2OVecs.

    If `remote_fname` is not None, then a REST call will be made to import the
    data specified at the location `remote_fname`.  This path is relative to the
    H2O cluster, NOT the local Python process

    If `local_fname` is not None, then the data is not imported into the H2O cluster
    at the time of object creation.

    If `python_obj` is not None, then an attempt to upload the python object to H2O
    will be made. A valid python object has type `list`, or `dict`.

    For more information on the structure of the input for the various native python
    data types ("native" meaning non-H2O), please see the general documentation for
    this object.

    :param python_obj: A "native" python object - list, dict, tuple.
    :param local_fname: A local path to a data source. Data is python-process-local.
    :param remote_fname: A remote path to a data source. Data is cluster-local.
    :param vecs: A list of H2OVec objects.
    :param text_key: A raw key resulting from an upload_file.
    :return: An instance of an H2OFrame object.
    """
    self.local_fname = local_fname
    self.remote_fname = remote_fname
    self._vecs = None

    if python_obj is not None:  # avoids the truth value of an array is ambiguous err
      self._upload_python_object(python_obj)
      return

    # Import the data into H2O cluster
    if remote_fname:
      rawkey = h2o.import_file(remote_fname)
      setup = h2o.parse_setup(rawkey)
      parse = h2o.parse(setup, H2OFrame.py_tmp_key())  # create a new key
      veckeys = parse['vec_ids']
      rows = parse['rows']
      cols = parse['column_names'] if parse["column_names"] else ["C" + str(x) for x in range(1,len(veckeys)+1)]
      self._vecs = H2OVec.new_vecs(zip(cols, veckeys), rows)
      print "Imported", remote_fname, "into cluster with", rows, "rows and", len(cols), "cols"

    # Read data locally into python process
    elif local_fname:
      with open(local_fname, 'rb') as csvfile:
        self._vecs = []
        for name in csvfile.readline().split(','):
          self._vecs.append(H2OVec(name.rstrip(), Expr([])))
        for row in csv.reader(csvfile):
          for i, data in enumerate(row):
            self._vecs[i].append(data)
      print "Imported", local_fname, "into local python process"

    # Construct from an array of Vecs already passed in
    elif vecs:
      vlen = len(vecs[0])
      for v in vecs:
        if not isinstance(v, H2OVec):
          raise ValueError("Not a list of Vecs")
        if len(v) != vlen:
          raise ValueError("Vecs not the same size: " + str(vlen) + " != " + str(len(v)))
      self._vecs = vecs

    elif text_key:
      self._handle_text_key(text_key, None)

    else:
      raise ValueError("Frame made from CSV file or an array of Vecs only")