Example #1
 def show(self, noprint=False):
     """
     Evaluate and print.

     :return: None
     """
     self.eager()
     if noprint:
         if isinstance(self._data, unicode):
             j = h2o.frame(self._data)
             data = j['frames'][0]['columns'][0]['data'][0:10]
             return data
         return self._data
     else:
         if isinstance(self._data, unicode):
             j = h2o.frame(self._data)
             data = [c['data'] for c in j['frames'][0]['columns'][:]]
         elif isinstance(self._data, (int, float, str, list)):
             print self._data
             print
             return
         else:
             data = [self._data]
         t_data = map(list, zip(*data))
         for didx, d in enumerate(t_data):
             t_data[didx].insert(0, didx)
         headers = ["Row ID"]
         for i in range(len(t_data[0])):
             headers.append('')
         print "Displaying first " + str(len(t_data)) + " row(s)"
         print tabulate.tabulate(t_data, headers=headers)
         print
Example #2
  def describe(self):
    """
    Generate an in-depth description of this H2OFrame.

    The description is a tabular print of the type, min, max, sigma, number of zeros,
    and number of missing elements for each H2OVec in this H2OFrame.

    :return: None (print to stdout)
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    thousands_sep = h2o.H2ODisplay.THOUSANDS
    print "Rows:", thousands_sep.format(len(self._vecs[0])), "Cols:", thousands_sep.format(len(self))
    headers = [vec._name for vec in self._vecs]
    table = [
      self._row('type', None),
      self._row('mins', 0),
      self._row('mean', None),
      self._row('maxs', 0),
      self._row('sigma', None),
      self._row('zero_count', None),
      self._row('missing_count', None)
    ]
    chunk_summary_tmp_key = H2OFrame.send_frame(self)
    chunk_summary = h2o.frame(chunk_summary_tmp_key)["frames"][0]["chunk_summary"]
    dist_summary = h2o.frame(chunk_summary_tmp_key)["frames"][0]["distribution_summary"]
    h2o.delete(chunk_summary_tmp_key)
    chunk_summary.show()
    dist_summary.show()
    h2o.H2ODisplay(table, [""] + headers, "Column-by-Column Summary")
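As a usage note, here is a minimal sketch of driving describe() from user code; it assumes a running H2O cluster and the legacy h2o-py client shown above, and the CSV path is illustrative.

import h2o

h2o.init()  # connect to a running H2O cluster (or start a local one)
fr = h2o.import_file("smalldata/iris/iris.csv")  # illustrative path
fr.describe()  # prints row/column counts plus type, mins, mean, maxs,
               # sigma, zero_count and missing_count for every column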
Example #3
def import_folder():

    tol_time = 200  # comparing in ms or ns
    tol_numeric = 1e-5  # tolerance for comparing other numeric fields
    numElements2Compare = 100  # choose number of elements per column to compare.  Save test time.

    # compressed the whole directory of files.
    multi_file_gzip_comp = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/hexdev_497/milsongs_csv.zip"))

    # directory containing the gzip version of csv files here.
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/hexdev_497/milsongs_csv_gzip"))

    try:
        # make sure the two agree
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare, tol_time,
                                           tol_numeric, True), "H2O frames parsed from the csv files and from " \
                                                               "the compressed archive are different!"
    except:  # in case the files are listed differently, we can always just check to see if the summary agrees.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(
            multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

        multi_file_csv.summary()
        csv_summary = h2o.frame(
            multi_file_csv.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
Example #4
 def show(self, noprint=False):
     """
     Evaluate and print.

     :return: None
     """
     self.eager()
     if noprint:
         if isinstance(self._data, unicode):
             j = h2o.frame(self._data)
             data = j['frames'][0]['columns'][0]['data'][0:10]
             return data
         return self._data
     else:
         if isinstance(self._data, unicode):
             j = h2o.frame(self._data)
             data = j['frames'][0]['columns'][0]['data'][0:10]
         elif isinstance(self._data, int):
             print self._data
             return
         else:
             data = [self._data]
         header = self._vecname + " (first " + str(len(data)) + " row(s))"
         rows = range(1, len(data) + 1, 1)
         print tabulate.tabulate(zip(rows, data),
                                 headers=["Row ID", header])
         print
Example #5
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        hdfs_orc_file = "/datasets/orc_parser/air05_orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        hdfs_csv_file = "/datasets/orc_parser/air05_csv"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        startcsv = time.time()
        multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
        endcsv = time.time()

        csv_type_dict = multi_file_csv.types

        multi_file_csv.summary()
        csv_summary = h2o.frame(
            multi_file_csv.frame_id)["frames"][0]["columns"]

        col_ind_name = dict()
        # map the default column names (C1, C2, ...) to zero-based indices so col_types lines up with column order
        for key_name in list(csv_type_dict):
            col_ind = key_name.split('C')
            new_ind = int(str(col_ind[1])) - 1
            col_ind_name[new_ind] = key_name

        col_types = []
        for ind in range(len(col_ind_name)):
            col_types.append(csv_type_dict[col_ind_name[ind]])

        startorc1 = time.time()
        multi_file_orc1 = h2o.import_file(url_orc)
        endorc1 = time.time()
        h2o.remove(multi_file_orc1)

        startorc = time.time()
        multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
        endorc = time.time()

        multi_file_orc.summary()
        orc_summary = h2o.frame(
            multi_file_orc.frame_id)["frames"][0]["columns"]

        print("************** CSV parse time is {0}".format(endcsv - startcsv))
        print(
            "************** ORC (without column type forcing) parse time is {0}"
            .format(endorc1 - startorc1))
        print(
            "************** ORC (with column type forcing) parse time is {0}".
            format(endorc - startorc))
        # compare the frame parsed from orc (with forced column types) against the csv summary
        pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
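The loop over csv_type_dict above maps H2O's default column names (C1, C2, ...) back to zero-based indices so that the col_types list lines up with the column order; a tiny standalone sketch of the same transformation, with illustrative values:

csv_type_dict = {"C1": "real", "C3": "enum", "C2": "int"}  # illustrative column types
col_ind_name = {int(k.split('C')[1]) - 1: k for k in csv_type_dict}
col_types = [csv_type_dict[col_ind_name[i]] for i in range(len(col_ind_name))]
print(col_types)  # -> ['real', 'int', 'enum']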
Example #6
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
          "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            multi_file_csv = h2o.import_file(url_csv)
            multi_file_orc = h2o.import_file(url_orc)

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
Example #7
 def show(self, noprint=False):
   """
   Evaluate and print.
   :return: None
   """
   self.eager()
   if noprint:
     if isinstance(self._data, unicode):
       j = h2o.frame(self._data)
       data = j['frames'][0]['columns'][0]['data'][0:10]
       return data
     return self._data
   else:
     if isinstance(self._data, unicode):
       j = h2o.frame(self._data)
       data = j['frames'][0]['columns'][0]['data'][0:10]
     elif isinstance(self._data, int):
       print self._data
       return
     else:
       data = [self._data]
     header = self._vecname + " (first " + str(len(data)) + " row(s))"
     rows = range(1, len(data) + 1, 1)
     print tabulate.tabulate(zip(rows, data), headers=["Row ID", header])
     print
Example #8
def additional_parameters():

    #col_types as list
    dest_frame="dev29&hex%"
    c_names = ["a", "b", "c"]
    c_types = ["enum", "enum", "string"]

    fhex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
                           destination_frame=dest_frame,
                           col_names=c_names,
                           col_types=c_types)
    fhex.describe()

    assert fhex._id == dest_frame.replace("%",".").replace("&",".")
    assert fhex.col_names == c_names
    col_summary = h2o.frame(fhex._id)["frames"][0]["columns"]
    for i in range(len(col_summary)):
        assert col_summary[i]["type"] == c_types[i]

    #col_types as dictionary
    dest_frame="dev29&hex%"
    c_names = ["a", "b", "c"]
    c_types = {"c":"string", "a":"enum", "b": "enum"}

    fhex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
                           destination_frame=dest_frame,
                           col_names=c_names,
                           col_types=c_types)
    fhex.describe()

    assert fhex._id == dest_frame.replace("%",".").replace("&",".")
    assert fhex.col_names == c_names
    col_summary = h2o.frame(fhex._id)["frames"][0]["columns"]
    for i in range(len(col_summary)):
      assert col_summary[i]["type"] == c_types[c_names[i]]
Example #9
def import_folder():

    tol_time = 200  # comparing in ms or ns for timestamp columns
    tol_numeric = 1e-5  # tolerance for comparing other numeric fields
    numElements2Compare = 0  # choose number of elements per column to compare.  Save test time.

    multi_file_csv = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/parser/hexdev_497/airlines_first_header"))
    multi_file_gzip_comp = \
      h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

    try:
        # make sure the two agree
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare, tol_time,
                                           tol_numeric, True), "H2O frames parsed from the csv files and from " \
                                                               "the compressed archive are different!"
    except:  # in case the files are listed differently, we can always just check to see if the summary agrees.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(
            multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

        multi_file_csv.summary()
        csv_summary = h2o.frame(
            multi_file_csv.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
Example #10
  def show(self, noprint=False):
    """
    Evaluate and print.

    :return: None
    """
    self.eager()
    if noprint:
      if isinstance(self._data, unicode):
        j = h2o.frame(self._data)
        data = [c['data'] for c in j['frames'][0]['columns'][:]]
        data = map(list, zip(*data))
        return data[0:min(10,len(data))]
      return self._data
    else:
      if isinstance(self._data, unicode):
        j = h2o.frame(self._data)
        data = [c['data'] for c in j['frames'][0]['columns'][:]]
      elif isinstance(self._data, (int, float, str, list)):
        print self._data
        print
        return
      else:
        data = [self._data]
      t_data = map(list, zip(*data))
      t_data = t_data[0:min(10,len(t_data))]
      for didx,d in enumerate(t_data): t_data[didx].insert(0,didx)
      headers = ["Row ID"]
      for i in range(len(t_data[0])): headers.append('')
      print "Displaying first " + str(len(t_data)) + " row(s)"
      print tabulate.tabulate(t_data, headers=headers)
      print
Example #11
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        multi_file_csv = h2o.import_file(url_csv)
        multi_file_orc = h2o.import_file(url_orc)

        multi_file_csv.summary()
        csv_summary = h2o.frame(
            multi_file_csv.frame_id)["frames"][0]["columns"]

        multi_file_orc.summary()
        orc_summary = h2o.frame(
            multi_file_orc.frame_id)["frames"][0]["columns"]

        pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
Example #12
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            multi_file_csv = h2o.import_file(url_csv)
            multi_file_orc = h2o.import_file(url_orc)

            multi_file_csv.summary()
            csv_summary = h2o.frame(
                multi_file_csv.frame_id)["frames"][0]["columns"]

            multi_file_orc.summary()
            orc_summary = h2o.frame(
                multi_file_orc.frame_id)["frames"][0]["columns"]

            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
Example #13
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_airline_05_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/air05_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/air05_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
            endcsv = time.time()

            csv_type_dict = multi_file_csv.types

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            col_ind_name = dict()
            # map the default column names (C1, C2, ...) to zero-based indices so col_types lines up with column order
            for key_name in list(csv_type_dict):
                col_ind = key_name.split('C')
                new_ind = int(str(col_ind[1]))-1
                col_ind_name[new_ind] = key_name

            col_types = []
            for ind in range(len(col_ind_name)):
                col_types.append(csv_type_dict[col_ind_name[ind]])

            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)

            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc,col_types=col_types)
            endorc = time.time()

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            print("************** CSV parse time is {0}".format(endcsv-startcsv))
            print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1))
            print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc))
            # compare the frame parsed from orc (with forced column types) against the csv summary
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
Example #14
def import_folder():
  multi_file_csv = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header"))
  multi_file_gzip_comp = \
    h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

  multi_file_gzip_comp.summary()
  zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

  multi_file_csv.summary()
  csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
  pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
Example #15
def import_folder():
    """
    This test builds an H2O frame by importing the files in bigdata/laptop/parser/orc/airlines_05p_orc_csv
    and builds another H2O frame with the multi-file orc parser, using the multiple orc files
    saved in the directory bigdata/laptop/parser/orc/airlines_05p_orc.  It then compares the two frames
    to make sure they are equal.
    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    startcsv = time.time()
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/orc/pubdev_3200/air05_csv"),
                                     na_strings=['\\N'])
    endcsv = time.time()

    csv_type_dict = multi_file_csv.types

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

    col_ind_name = dict()
    # map the default column names (C1, C2, ...) to zero-based indices so col_types lines up with column order
    for key_name in list(csv_type_dict):
        col_ind = key_name.split('C')
        new_ind = int(str(col_ind[1])) - 1
        col_ind_name[new_ind] = key_name

    col_types = []
    for ind in range(len(col_ind_name)):
        col_types.append(csv_type_dict[col_ind_name[ind]])

    startorc1 = time.time()
    multi_file_orc1 = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/orc/pubdev_3200/air05_orc"))
    endorc1 = time.time()
    h2o.remove(multi_file_orc1)

    startorc = time.time()
    multi_file_orc = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/orc/pubdev_3200/air05_orc"),
                                     col_types=col_types)
    endorc = time.time()

    multi_file_orc.summary()
    orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

    print("************** CSV parse time is {0}".format(endcsv - startcsv))
    print("************** ORC (without column type forcing) parse time is {0}".
          format(endorc1 - startorc1))
    print("************** ORC (with column type forcing) parse time is {0}".
          format(endorc - startorc))
    # compare the frame parsed from orc (with forced column types) against the csv summary
    pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
Example #16
def import_folder():
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/parser/hexdev_497/airlines_first_header"))
    multi_file_gzip_comp = \
      h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

    multi_file_gzip_comp.summary()
    zip_summary = h2o.frame(
        multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
    pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
Example #17
def parquet_parse_simple():
    """
    Tests Parquet parser by comparing the summary of the original csv frame with the h2o parsed Parquet frame.
    Basic use case of importing files with auto-detection of column types.
    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    csv = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    parquet = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/parquet/airlines-simple.snappy.parquet"))

    csv.summary()
    csv_summary = h2o.frame(csv.frame_id)["frames"][0]["columns"]

    parquet.summary()
    parquet_summary = h2o.frame(parquet.frame_id)["frames"][0]["columns"]

    pyunit_utils.compare_frame_summary(csv_summary, parquet_summary)
Example #18
 def summary(self):
     self.eager()
     if self.is_local():
         x = self._data[0]
         t = 'int' if isinstance(x, int) else ('enum' if isinstance(x, str) else 'real')
         mins = [min(self._data)]
         maxs = [max(self._data)]
         n = len(self._data)
         mean = sum(self._data) / n if t != 'enum' else None
         ssq = 0
         zeros = 0
         missing = 0
         for x in self._data:
             if t != 'enum':  # mean is None for enum columns, so only accumulate for numeric data
                 delta = x - mean
                 ssq += delta * delta
             if x == 0: zeros += 1
             if x is None or (t != 'enum' and isnan(x)): missing += 1
         stddev = sqrt(ssq / (n - 1)) if t != 'enum' else None
         return {
             'type': t,
             'mins': mins,
             'maxs': maxs,
             'mean': mean,
             'sigma': stddev,
             'zeros': zeros,
             'missing': missing
         }
     if self._summary: return self._summary
     j = h2o.frame(self._data)
     self._summary = j['frames'][0]['columns'][0]
     return self._summary
Example #19
  def ddply(self,cols,fun):
    """
    :param cols: Column names used to control grouping
    :param fun: Function to execute on each group.  Right now limited to textual Rapids expression
    :return: New frame with 1 row per-group, of results from 'fun'
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    # Confirm all names present in dataset; collect column indices
    rapids_series = "(llist #"+" #".join([str(self._find_idx(name)) for name in cols])+")"

    # Eagerly eval and send the cbind'd frame over
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    expr = "(= !{} (h2o.ddply %{} {} {}))".format(tmp_key,key,rapids_series,fun)
    h2o.rapids(expr) # ddply in h2o
    # Remove h2o temp frame after ddply
    h2o.remove(key)
    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key) # Fetch the frame as JSON
    fr = j['frames'][0]    # Just the first (only) frame
    rows = fr['rows']      # Row count
    veckeys = fr['vec_ids']# List of h2o vec keys
    cols = fr['columns']   # List of columns
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
Example #20
  def cbind(self,data):
    """
    :param data: H2OFrame or H2OVec
    :return: new H2OFrame with data cbinded to the end
    """
    # Check data type
    vecs = []
    if isinstance(data,H2OFrame):
      vecs.append(self)
      vecs.extend(data._vecs)
    elif isinstance(data,H2OVec):
      vecs = [self, data]
    else:
      raise ValueError("data parameter must be H2OVec or H2OFrame")
    names = [vec.name() for vec in vecs]

    fr = H2OFrame.py_tmp_key()
    cbind = "(= !" + fr + " (cbind %"
    cbind += " %".join([vec._expr.eager() for vec in vecs]) + "))"
    h2o.rapids(cbind)

    j = h2o.frame(fr)
    fr = j['frames'][0]
    rows = fr['rows']
    veckeys = fr['vec_ids']
    cols = fr['columns']
    colnames = [col['label'] for col in cols]
    result = H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
    result.setNames(names)
    return result
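A minimal usage sketch for cbind(); fr and other are illustrative frames (or vecs) with matching row counts.

combined = fr.cbind(other)  # columns of `other` appended after the columns of `fr`
combined.show()             # the result keeps the original column names via setNames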
Example #21
  def predict(self, test_data):
    """
    Predict on a dataset.

    :param test_data: Data to be predicted on.
    :return: A new H2OFrame filled with predictions.
    """
    if not test_data: raise ValueError("Must specify test data")
    # cbind the test_data vecs together and produce a temp key
    test_data_key = H2OFrame.send_frame(test_data)
    # get the predictions
    # this job call is blocking
    j = H2OConnection.post_json("Predictions/models/" + self._key + "/frames/" + test_data_key)
    # toast the cbound frame
    h2o.removeFrameShallow(test_data_key)
    # retrieve the prediction frame
    prediction_frame_key = j["model_metrics"][0]["predictions"]["frame_id"]["name"]
    # get the actual frame meta data
    pred_frame_meta = h2o.frame(prediction_frame_key)["frames"][0]
    # toast the prediction frame
    h2o.removeFrameShallow(prediction_frame_key)
    # collect the vec_ids
    vec_ids = pred_frame_meta["vec_ids"]
    # get the number of rows
    rows = pred_frame_meta["rows"]
    # get the column names
    cols = [col["label"] for col in pred_frame_meta["columns"]]
    # create a set of H2OVec objects
    vecs = H2OVec.new_vecs(zip(cols, vec_ids), rows)
    # return a new H2OFrame object
    return H2OFrame(vecs=vecs)
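A hedged usage sketch for predict(); the training call is a placeholder (h2o.gbm is assumed here purely for illustration), and train/test are illustrative frames.

model = h2o.gbm(x=train[1:], y=train[0])  # hypothetical training call
preds = model.predict(test)               # returns a new H2OFrame of predictions
preds.show()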
Example #22
  def quantile(self, prob=None, combine_method="interpolate"):
    """
    Compute quantiles over a given H2OFrame.

    :param prob: A list of probabilties, default is [0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99]. You may provide any sequence of any length.
    :param combine_method: For even samples, how to combine quantiles. Should be one of ["interpolate", "average", "low", "hi"]
    :return: an H2OFrame containing the quantiles and probabilities.
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    if len(self) == 0: return self
    if not prob: prob=[0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99]
    if not isinstance(prob, list): raise ValueError("prob must be a list")
    probs = "(dlist #"+" #".join([str(p) for p in prob])+")"
    if combine_method not in ["interpolate","average","low","high"]:
      raise ValueError("combine_method must be one of: [" + ",".join(["interpolate","average","low","high"])+"]")

    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()
    expr = "(= !{} (quantile '{}' {} '{}'".format(tmp_key,key,probs,combine_method)
    h2o.rapids(expr)
    # Remove h2o temp frame after groupby
    h2o.remove(key)
    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]       # Just the first (only) frame
    rows = fr['rows']         # Row count
    veckeys = fr['vec_ids']  # List of h2o vec keys
    cols = fr['columns']      # List of columns
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
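A usage sketch with an explicit probability list; fr is an illustrative frame.

q = fr.quantile(prob=[0.1, 0.25, 0.5, 0.75, 0.9], combine_method="interpolate")
q.show()  # one row per requested probability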
Example #23
  def __getitem__(self, i):
    if isinstance(i, int):   return self._vecs[i]
    if isinstance(i, str):   return self._find(i)
    # Slice; return a Frame not a Vec
    if isinstance(i, slice): return H2OFrame(vecs=self._vecs[i])
    # Row selection from a boolean Vec
    if isinstance(i, H2OVec):
      self._len_check(i)
      return H2OFrame(vecs=[x.row_select(i) for x in self._vecs])

    # have a list/tuple of numbers or strings
    if isinstance(i, list) or (isinstance(i, tuple) and len(i) != 2):
      vecs = []
      for it in i:
        if isinstance(it, int):    vecs.append(self._vecs[it])
        elif isinstance(it, str):  vecs.append(self._find(it))
        else:                      raise NotImplementedError
      return H2OFrame(vecs=vecs)

    # multi-dimensional slicing via 2-tuple
    if isinstance(i, tuple):
      j = h2o.frame(self.send_frame())
      fr = j['frames'][0]
      veckeys = [str(v['name']) for v in fr['vec_keys']]
      left = Expr(veckeys)
      rite = Expr((i[0], i[1]))
      return Expr("[", left, rite, length=2)

    raise NotImplementedError("Slicing by unknown type: "+str(type(i)))
Example #24
    def predict(self, test_data):
        """
        Predict on a dataset.

        :param test_data: Data to be predicted on.
        :return: A new H2OFrame filled with predictions.
        """
        if not test_data: raise ValueError("Must specify test data")
        # cbind the test_data vecs together and produce a temp key
        test_data_key = H2OFrame.send_frame(test_data)
        # get the predictions
        # this job call is blocking
        j = H2OConnection.post_json("Predictions/models/" + self._key +
                                    "/frames/" + test_data_key)
        # retrieve the prediction frame
        prediction_frame_key = j["model_metrics"][0]["predictions"][
            "frame_id"]["name"]
        # get the actual frame meta data
        pred_frame_meta = h2o.frame(prediction_frame_key)["frames"][0]
        # collect the vec_ids
        vec_ids = pred_frame_meta["vec_ids"]
        # get the number of rows
        rows = pred_frame_meta["rows"]
        # get the column names
        cols = [col["label"] for col in pred_frame_meta["columns"]]
        # create a set of H2OVec objects
        vecs = H2OVec.new_vecs(zip(cols, vec_ids), rows)
        # toast the cbound frame
        h2o.delete(test_data_key)
        # return a new H2OFrame object
        return H2OFrame(vecs=vecs)
Example #25
  def describe(self):
    """
    Generate an in-depth description of this H2OFrame.

    The description is a tabular print of the type, min, max, sigma, number of zeros,
    and number of missing elements for each H2OVec in this H2OFrame.

    :return: None (print to stdout)
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    print "Rows:", len(self._vecs[0]), "Cols:", len(self)
    headers = [vec._name for vec in self._vecs]
    table = [
      self._row('type', None),
      self._row('mins', 0),
      self._row('mean', None),
      self._row('maxs', 0),
      self._row('sigma', None),
      self._row('zero_count', None),
      self._row('missing_count', None)
    ]

    chunk_summary_tmp_key = H2OFrame.send_frame(self)

    chunk_summary = h2o.frame(chunk_summary_tmp_key)["frames"][0]["chunk_summary"]

    h2o.remove(chunk_summary_tmp_key)

    print tabulate.tabulate(table, headers)
    print
    print chunk_summary
    print
Example #26
    def deepfeatures(self, test_data, layer):
        """
        Return hidden layer details.

        :param test_data: Data to create a feature space on
        :param layer: 0 index hidden layer
        """
        if not test_data: raise ValueError("Must specify test data")
        # create test_data by cbinding vecs
        test_data_key = H2OFrame.send_frame(test_data)
        # get the deepfeatures of the dataset
        j = H2OConnection.post_json("Predictions/models/" + self._key +
                                    "/frames/" + test_data_key,
                                    deep_features_hidden_layer=layer)
        # retrieve the frame data
        deepfeatures_frame_key = j["predictions_frame"]["name"]
        df_frame_meta = h2o.frame(deepfeatures_frame_key)["frames"][0]
        # create vecs by extracting vec_ids, col length, and col names
        vec_ids = df_frame_meta["vec_ids"]
        rows = df_frame_meta["rows"]
        cols = [col["label"] for col in df_frame_meta["columns"]]
        vecs = H2OVec.new_vecs(zip(cols, vec_ids), rows)
        # remove test data from kv
        h2o.delete(test_data_key)
        # finally return frame
        return H2OFrame(vecs=vecs)
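A usage sketch for deepfeatures(); model is an illustrative trained deep learning model and test_data an illustrative frame.

features = model.deepfeatures(test_data, 0)  # activations of the first hidden layer
features.show()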
Example #27
def import_folder():
    """
    This test builds an H2O frame by importing the files in bigdata/laptop/parser/orc/airlines_05p_orc_csv
    and builds another H2O frame with the multi-file orc parser, using the multiple orc files
    saved in the directory bigdata/laptop/parser/orc/airlines_05p_orc.  It then compares the two frames
    to make sure they are equal.
    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    startcsv = time.time()
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_csv"),
                                     na_strings=['\\N'])
    endcsv = time.time()

    csv_type_dict = multi_file_csv.types

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

    col_ind_name = dict()
    # map the default column names (C1, C2, ...) to zero-based indices so col_types lines up with column order
    for key_name in list(csv_type_dict):
        col_ind = key_name.split('C')
        new_ind = int(str(col_ind[1]))-1
        col_ind_name[new_ind] = key_name

    col_types = []
    for ind in range(len(col_ind_name)):
        col_types.append(csv_type_dict[col_ind_name[ind]])

    startorc1 = time.time()
    multi_file_orc1 = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc"))
    endorc1 = time.time()
    h2o.remove(multi_file_orc1)

    startorc = time.time()
    multi_file_orc = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc"),
                                     col_types=col_types)
    endorc = time.time()

    multi_file_orc.summary()
    orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

    print("************** CSV parse time is {0}".format(endcsv-startcsv))
    print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1))
    print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc))
    # compare the frame parsed from orc (with forced column types) against the csv summary
    pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
Example #28
def parquet_parse_simple():
    """
    Tests Parquet parser by comparing the summary of the original csv frame with the h2o parsed Parquet frame.
    Basic use case of importing files with auto-detection of column types.
    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    csv = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    parquet = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/parser/parquet/airlines-simple.snappy.parquet"))

    csv.summary()
    csv_summary = h2o.frame(csv.frame_id)["frames"][0]["columns"]

    parquet.summary()
    parquet_summary = h2o.frame(parquet.frame_id)["frames"][0]["columns"]

    pyunit_utils.compare_frame_summary(csv_summary, parquet_summary)
Example #29
def import_folder():
    """
    This test builds an H2O frame by importing the files in bigdata/laptop/parser/orc/milsongs_orc_csv
    and builds another H2O frame with the multi-file orc parser, using the multiple orc files
    saved in the directory bigdata/laptop/parser/orc/milsongs_orc.  It then compares the two frames
    to make sure they are equal.
    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/milsongs_orc_csv"))
    multi_file_orc = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/milsongs_orc"))

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

    multi_file_orc.summary()
    orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

    pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
Example #30
def h2oframe():
    """
    Python API test: h2o.frame(frame_id)
    """
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    frame_summary = h2o.frame(training_data.frame_id)
    assert_is_type(frame_summary, H2OResponse)
    assert frame_summary["frames"][0]['rows']==training_data.nrow, "h2o.frame() command is not working."
    assert frame_summary["frames"][0]['column_count']==training_data.ncol, "h2o.frame() command is not working."
Example #31
 def dim(self):
   """
   Eagerly evaluate the Expr. If it's an H2OFrame, return the number of rows and columns.
   :return: The number of rows and columns in the H2OFrame as a list [rows, cols].
   """
   self.eager()
   if isinstance(self._data, unicode):
     frame = h2o.frame(self._data)
     return [frame['frames'][0]['rows'], len(frame['frames'][0]['columns'])]
   raise ValueError("data must be a (unicode) key")
Example #32
 def row_select(self, vec):
   """
   Boolean column select lookup
   :param vec: An H2OVec.
   :return: A new H2OVec.
   """
   e = Expr("[", self, vec)
   j = h2o.frame(e.eager())
   e.set_len(j['frames'][0]['rows'])
   return H2OVec(self._name, e)
Example #33
 def dim(self):
   """
   Eagerly evaluate the Expr. If it's an H2OFrame, return the number of rows and columns.
   :return: The number of rows and columns in the H2OFrame as a list [rows, cols].
   """
   self.eager()
   if isinstance(self._data, unicode):
     frame = h2o.frame(self._data)
     return [frame['frames'][0]['rows'], len(frame['frames'][0]['columns'])]
   raise ValueError("data must be a (unicode) key")
Example #34
    def show(self, noprint=False):
        """
        Evaluate and print.

        :return: None
        """
        self.eager()
        if noprint:
            if isinstance(self._data, unicode):
                j = h2o.frame(self._data)
                data = [
                    c['data'] if c['type'] != "string" else c["string_data"]
                    for c in j['frames'][0]['columns'][:]
                ]
                domains = [c['domain'] for c in j['frames'][0]['columns']]
                for i in range(len(data)):
                    if domains[i] is not None:
                        for j in range(len(data[i])):
                            if data[i][j] == "NaN": continue
                            data[i][j] = domains[i][int(data[i][j])]
                data = map(list, zip(*data))
                return data[0:min(10, len(data))]
            return self._data
        else:
            if isinstance(self._data, unicode):
                j = h2o.frame(self._data)
                data = [c['data'] for c in j['frames'][0]['columns'][:]]
            elif isinstance(self._data, (int, float, str, list)):
                print self._data
                print
                return
            else:
                data = [self._data]
            t_data = map(list, zip(*data))
            t_data = t_data[0:min(10, len(t_data))]
            for didx, d in enumerate(t_data):
                t_data[didx].insert(0, didx)
            headers = ["Row ID"]
            for i in range(len(t_data[0])):
                headers.append('')
            print "Displaying first " + str(len(t_data)) + " row(s)"
            h2o.H2ODisplay(t_data, headers)
Example #35
def parquet_parse_dates():
    parquet_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/parser/parquet/parquet-file-with-date-column.snappy.parquet"
    ))

    parquet_data.summary()
    parquet_summary = h2o.frame(parquet_data.frame_id)["frames"][0]["columns"]
    date_converted_column_type = parquet_summary[2]['type']
    assert date_converted_column_type == "time"

    date_string_rows = parquet_data[:, "date_string"]
    date_converted_rows = parquet_data[:, "date_converted"]
    pyunit_utils.compare_frames(date_string_rows, date_converted_rows, 1)
Example #36
  def dim(self):
    """
    Eagerly evaluate the Expr. If it's an H2OFrame, return the number of rows and columns.

    :return: The number of rows and columns in the H2OFrame as a list [rows, cols].
    """
    self.eager()
    if self.is_remote(): # potentially big data
      frame = h2o.frame(self._data)
      return [frame['frames'][0]['rows'], len(frame['frames'][0]['columns'])]
    elif self.is_local(): # small data
      return [1,1] if not hasattr(self._data, '__len__') else [1,len(self._data)]
    raise ValueError("data must be local or remote")
Example #37
def import_folder():

  tol_time = 200              # comparing in ms or ns for timestamp columns
  tol_numeric = 1e-5          # tolerance for comparing other numeric fields
  numElements2Compare = 0   # choose number of elements per column to compare.  Save test time.

  multi_file_csv = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header"))
  multi_file_gzip_comp = \
    h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

  try:
    # make sure the two agree
    assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare, tol_time,
                                       tol_numeric, True), "H2O frames parsed from the csv files and from " \
                                                           "the compressed archive are different!"
  except: # in case the files are listed differently, we can always just check to see if the summary agrees.
    multi_file_gzip_comp.summary()
    zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
    pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
Example #38
  def dim(self):
    """
    Eagerly evaluate the Expr. If it's an H2OFrame, return the number of rows and columns.

    :return: The number of rows and columns in the H2OFrame as a list [rows, cols].
    """
    self.eager()
    if self.is_remote(): # potentially big data
      frame = h2o.frame(self._data)
      return [frame['frames'][0]['rows'], len(frame['frames'][0]['columns'])]
    elif self.is_local(): # small data
      return [1,1] if not hasattr(self._data, '__len__') else [1,len(self._data)]
    raise ValueError("data must be local or remote")
Example #39
  def show(self, noprint=False):
    """
    Evaluate and print.

    :return: None
    """
    self.eager()
    if noprint:
      if isinstance(self._data, unicode):
        j = h2o.frame(self._data)
        data = [c['data'] if c['type']!="string" else c["string_data"] for c in j['frames'][0]['columns'][:]]
        domains  = [c['domain'] for c in j['frames'][0]['columns']]
        for i in range(len(data)):
          if domains[i] is not None:
            for j in range(len(data[i])):
              if data[i][j] == "NaN": continue
              data[i][j] = domains[i][int(data[i][j])]
        data = map(list, zip(*data))
        return data[0:min(10,len(data))]
      return self._data
    else:
      if isinstance(self._data, unicode):
        j = h2o.frame(self._data)
        data = [c['data'] for c in j['frames'][0]['columns'][:]]
      elif isinstance(self._data, (int, float, str, list)):
        print self._data
        print
        return
      else:
        data = [self._data]
      t_data = map(list, zip(*data))
      t_data = t_data[0:min(10,len(t_data))]
      for didx,d in enumerate(t_data): t_data[didx].insert(0,didx)
      headers = ["Row ID"]
      for i in range(len(t_data[0])): headers.append('')
      print "Displaying first " + str(len(t_data)) + " row(s)"
      h2o.H2ODisplay(t_data,headers)
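The domain-decoding loop above turns categorical level indices back into string labels; the same idea in a tiny standalone form, with an illustrative domain:

domain = ["setosa", "versicolor", "virginica"]  # illustrative enum domain
raw = [0, 2, "NaN", 1]                          # level indices as they appear in c['data']
decoded = [v if v == "NaN" else domain[int(v)] for v in raw]
print(decoded)  # -> ['setosa', 'virginica', 'NaN', 'versicolor']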
Example #40
  def row_select(self, vec):
    """
    Boolean column select lookup

    :param vec: An H2OVec.
    :return: A new H2OVec.
    """
    e = Expr("[", self, vec)
    r = e.eager()
    if isinstance(r, (float,int)):
      e.set_len(1)
    else:
      j = h2o.frame(r)
      e.set_len(j['frames'][0]['rows'])
    return H2OVec(self._name, e)
Example #41
def import_folder():

  tol_time = 200              # comparing in ms or ns
  tol_numeric = 1e-5          # tolerance for comparing other numeric fields
  numElements2Compare = 100   # choose number of elements per column to compare.  Save test time.

  # compressed the whole directory of files.
  multi_file_gzip_comp = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/hexdev_497/milsongs_csv.zip"))

  # directory containing the gzip version of csv files here.
  multi_file_csv = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/hexdev_497/milsongs_csv_gzip"))

  try:
    # make sure the two agree
    assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare, tol_time,
                                       tol_numeric, True), "H2O frames parsed from the csv files and from " \
                                                           "the compressed archive are different!"
  except: # in case the files are listed differently, we can always just check to see if the summary agrees.
    multi_file_gzip_comp.summary()
    zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
    pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
Example #42
  def group_by(self,cols,a):
    """
    GroupBy
    :param cols: The columns to group on.
    :param a: A dictionary of aggregates having the following shape:
              {"colname":[aggregate, column, naMethod]}
              e.g.: {"bikes":["count", 0, "all"]}

    The naMethod is one of "all", "ignore", or "rm", which specifies how to handle
    NAs that appear in columns that are being aggregated.

    "all" - include NAs
    "rm"  - exclude NAs
    "ignore" - ignore NAs in aggregates, but count them (e.g. in denominators for mean, var, sd, etc.)
    :return: The group by frame.
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    rapids_series = "(llist #"+" #".join([str(self._find_idx(name)) for name in cols])+")"
    aggregates = copy.deepcopy(a)
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()

    aggs = []

    # transform cols in aggregates to their indices...
    for k in aggregates:
      if isinstance(aggregates[k][1],str):
        aggregates[k][1] = '#'+str(self._find_idx(aggregates[k][1]))
      else:
        aggregates[k][1] = '#'+str(aggregates[k][1])
      aggs+=["\"{1}\" {2} \"{3}\" \"{0}\"".format(str(k),*aggregates[k])]
    aggs = "(agg {})".format(" ".join(aggs))

    expr = "(= !{} (GB %{} {} {}))".format(tmp_key,key,rapids_series,aggs)
    h2o.rapids(expr)  # group by
    # Remove h2o temp frame after groupby
    h2o.delete(key)
    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]       # Just the first (only) frame
    rows = fr['rows']         # Row count
    veckeys = fr['vec_ids']  # List of h2o vec keys
    cols = fr['columns']      # List of columns
    colnames = [col['label'] for col in cols]
    vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows) # Peel the Vecs out of the returned Frame
    h2o.delete(tmp_key)
    return H2OFrame(vecs=vecs)
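A usage sketch of the aggregates dictionary described in the docstring; fr and the column names are illustrative.

# count the "bikes" column within each "Days" group, including NAs
grouped = fr.group_by(["Days"], {"bikes": ["count", 0, "all"]})
grouped.show()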
Example #43
 def quantile(self, prob=None):
   if len(self) == 0: return self
   if not prob: prob=[0.01,0.1,0.25,0.333,0.5,0.667,0.75,0.9,0.99]
   if not isinstance(prob, list): raise ValueError("prob must be a list")
   probs = "(dlist #"+" #".join([str(p) for p in prob])+")"
   key = self.send_frame()
   tmp_key = H2OFrame.py_tmp_key()
   expr = "(= !{} (quantile '{}' {}".format(tmp_key, key, probs)
   h2o.rapids(expr)
   j = h2o.frame(tmp_key)
   fr = j['frames'][0]       # Just the first (only) frame
   rows = fr['rows']         # Row count
   veckeys = fr['vec_keys']  # List of h2o vec keys
   cols = fr['columns']      # List of columns
   colnames = [col['label'] for col in cols]
   return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
Example #44
def additional_parameters(ip, port):
    dest_frame="29&devhex%"
    c_names = ["a", "b", "c"]
    c_types = ["enum", "enum", "enum"]

    fhex = h2o.import_file(h2o.locate("smalldata/jira/hexdev_29.csv"),
                           destination_frame=dest_frame,
                           col_names=c_names,
                           col_types=c_types)
    fhex.describe()

    assert fhex._id == dest_frame.replace("%",".").replace("&",".")
    assert fhex._col_names == c_names
    col_summary = h2o.frame(fhex._id)["frames"][0]["columns"]
    for i in range(len(col_summary)):
        assert col_summary[i]["type"] == c_types[i]
Example #45
 def var(self):
   """
   :return: The covariance matrix of the columns in this H2OFrame.
   """
   key = self.send_frame()
   tmp_key = H2OFrame.py_tmp_key()
   expr = "(= !{} (var %{} \"null\" %FALSE \"everything\"))".format(tmp_key,key)
   h2o.rapids(expr)
   # Remove h2o temp frame after var
   h2o.remove(key)
   j = h2o.frame(tmp_key)
   fr = j['frames'][0]
   rows = fr['rows']
   veckeys = fr['vec_keys']
   cols = fr['columns']
   colnames = [col['label'] for col in cols]
   return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
Example #46
  def group_by(self,cols,a):
    """
    GroupBy
    :param cols: The columns to group on.
    :param a: A dictionary of aggregates having the following shape:
              {"colname":[aggregate, column, naMethod]}
              e.g.: {"bikes":["count", 0, "all"]}

    The naMethod is one of "all", "ignore", or "rm", which specifies how to handle
    NAs that appear in columns that are being aggregated.

    "all" - include NAs
    "rm"  - exclude NAs
    "ignore" - ignore NAs in aggregates, but count them (e.g. in denominators for mean, var, sd, etc.)
    :return: The group by frame.
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    rapids_series = "(llist #"+" #".join([str(self._find_idx(name)) for name in cols])+")"
    aggregates = copy.deepcopy(a)
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()

    aggs = []

    # transform cols in aggregates to their indices...
    for k in aggregates:
      if isinstance(aggregates[k][1],str):
        aggregates[k][1] = '#'+str(self._find_idx(aggregates[k][1]))
      else:
        aggregates[k][1] = '#'+str(aggregates[k][1])
      aggs+=["\"{1}\" {2} \"{3}\" \"{0}\"".format(str(k),*aggregates[k])]
    aggs = "(agg {})".format(" ".join(aggs))

    expr = "(= !{} (GB %{} {} {}))".format(tmp_key,key,rapids_series,aggs)
    h2o.rapids(expr)  # group by
    # Remove h2o temp frame after groupby
    h2o.remove(key)
    # Make backing H2OVecs for the remote h2o vecs
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]       # Just the first (only) frame
    rows = fr['rows']         # Row count
    veckeys = fr['vec_ids']  # List of h2o vec keys
    cols = fr['columns']      # List of columns
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
Example #47
 def biases(self, vector_id=0):
   """
   Return the frame for the respective bias vector
   :param vector_id: an integer, ranging from 0 to the number of layers, that specifies the bias vector to return.
   :return: an H2OFrame which represents the bias vector identified by vector_id
   """
   num_bias_vectors = len(self._model_json['output']['biases'])
   if vector_id not in range(num_bias_vectors):
     raise ValueError("Bias vector does not exist. Model has {0} bias vectors (0-based indexing), but vector {1} "
                      "was requested.".format(num_bias_vectors, vector_id))
   j = h2o.frame(self._model_json['output']['biases'][vector_id]['URL'].split('/')[3])
   fr = j['frames'][0]
   rows = fr['rows']
   vec_ids = fr['vec_ids']
   cols = fr['columns']
   colnames = [col['label'] for col in cols]
   result = H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, vec_ids), rows))
   return result
Example #48
 def weights(self, matrix_id=0):
   """
   Return the frame for the respective weight matrix
   :param matrix_id: an integer, ranging from 0 to the number of layers, that specifies the weight matrix to return.
   :return: an H2OFrame which represents the weight matrix identified by matrix_id
   """
   num_weight_matrices = len(self._model_json['output']['weights'])
   if matrix_id not in range(num_weight_matrices):
     raise ValueError("Weight matrix does not exist. Model has {0} weight matrices (0-based indexing), but matrix {1} "
                      "was requested.".format(num_weight_matrices, matrix_id))
   j = h2o.frame(self._model_json['output']['weights'][matrix_id]['URL'].split('/')[3])
   fr = j['frames'][0]
   rows = fr['rows']
   vec_ids = fr['vec_ids']
   cols = fr['columns']
   colnames = [col['label'] for col in cols]
   result = H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, vec_ids), rows))
   return result
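A usage sketch for the two accessors above; model is an illustrative trained deep learning model.

b0 = model.biases(0)   # bias vector feeding the first hidden layer, as an H2OFrame
w0 = model.weights(0)  # weight matrix between the input and the first hidden layer
b0.show()
w0.show()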
Example #49
  def groupby(self,cols,a):
    """
    GroupBy
    :param cols: The columns to group on.
    :param a: A dictionary of aggregates having the following shape:
              {"colname":[aggregate, column, naMethod]}
              e.g.: {"bikes":["count", 0, "all"]}

              The naMethod is one of "all", "ignore", or "rm", which specifies how to handle
              NAs that appear in columns that are being aggregated.

              "all" - include NAs
              "rm"  - exclude NAs
              "ignore" - ignore NAs in aggregates, but count them (e.g. in denominators for mean, var, sd, etc.)
    :return: The group by frame.
    """
    colnums = [str(self._find_idx(name)) for name in cols]
    rapids_series = "{"+";".join(colnums)+"}"
    aggregates = copy.deepcopy(a)
    key = self.send_frame()
    tmp_key = H2OFrame.py_tmp_key()

    nAggs = len(aggregates)
    aggs = []

    # transform cols in aggregates to their indices...
    for k in aggregates:
      if isinstance(aggregates[k][1],str):
        aggregates[k][1] = '#'+str(self._find_idx(aggregates[k][1]))
      else:
        aggregates[k][1] = '#'+str(aggregates[k][1])
      aggs+=["\"{1}\" {2} \"{3}\" \"{0}\"".format(str(k),*aggregates[k])]
    aggs = "(agg #{} {})".format(nAggs, " ".join(aggs))

    expr = "(= !{} (GB %{} {} {}))".format(tmp_key,key,rapids_series,aggs)
    h2o.rapids(expr)  # group by
    j = h2o.frame(tmp_key)
    fr = j['frames'][0]    # Just the first (only) frame
    rows = fr['rows']      # Row count
    veckeys = fr['vec_keys']# List of h2o vec keys
    cols = fr['columns']   # List of columns
    colnames = [col['label'] for col in cols]
    return H2OFrame(vecs=H2OVec.new_vecs(zip(colnames, veckeys), rows))
Example #50
  def __getitem__(self, i):
    """
    Column selection via integer, string(name)
    Column selection via slice returns a subset of the H2OFrame

    :param i: An int, str, slice, H2OVec, or list/tuple
    :return: An H2OVec, an H2OFrame, or scalar depending on the input slice.
    """
    if self._vecs is None or self._vecs == []:
      raise ValueError("Frame Removed")
    if isinstance(i, int):   return self._vecs[i]
    if isinstance(i, str):   return self._find(i)
    # Slice; return a Frame not a Vec
    if isinstance(i, slice): return H2OFrame(vecs=self._vecs[i])
    # Row selection from a boolean Vec
    if isinstance(i, H2OVec):
      self._len_check(i)
      return H2OFrame(vecs=[x.row_select(i) for x in self._vecs])

    # have a list/tuple of numbers or strings
    if isinstance(i, list) or (isinstance(i, tuple) and len(i) != 2):
      vecs = []
      for it in i:
        if isinstance(it, int):    vecs.append(self._vecs[it])
        elif isinstance(it, str):  vecs.append(self._find(it))
        else:                      raise NotImplementedError
      return H2OFrame(vecs=vecs)

    # multi-dimensional slicing via 2-tuple
    if isinstance(i, tuple):
      veckeys = [str(v._expr._data) for v in self._vecs]
      left = Expr(veckeys)
      rite = Expr((i[0], i[1]))
      res = Expr("[", left, rite, length=2)
      if not isinstance(i[0], int) or not isinstance(i[1], int): return res # possible big data
      # small data (single value)
      res.eager()
      if res.is_local(): return res._data
      j = h2o.frame(res._data) # data is remote
      return map(list, zip(*[c['data'] for c in j['frames'][0]['columns'][:]]))[0][0]

    raise NotImplementedError("Slicing by unknown type: "+str(type(i)))