Example #1
def pubdev_6339():

    cluster = h2o.cluster()
    # number of nodes
    cloud_size = cluster.cloud_size
    # number of CPUs
    cores = sum(node["num_cpus"] for node in cluster.nodes)

    # path to file
    file_paths = [
        pyunit_utils.locate("smalldata/arcene/arcene_train.data"),
        pyunit_utils.locate("smalldata/census_income/adult_data.csv"),
        pyunit_utils.locate("smalldata/chicago/chicagoAllWeather.csv"),
        pyunit_utils.locate("smalldata/gbm_test/alphabet_cattest.csv"),
        pyunit_utils.locate("smalldata/wa_cannabis/raw/Dashboard_Usable_Sales_w_Weight_Daily.csv")
    ]

    for file_path in file_paths:
        # import the data without parsing, then run parse setup to get the number of columns
        data_raw = h2o.import_file(path=file_path, parse=False)
        setup = h2o.parse_setup(data_raw)

        # get number of columns from setup
        num_cols = setup['number_columns']
        # get the chunk size
        chunk_size = calculate_chunk_size(file_path, num_cols, cores, cloud_size)

        # compare against the chunk size H2O itself computed
        result_size = setup['chunk_size']
        assert chunk_size == result_size, "Calculated chunk size is incorrect!"
        print("chunk size for file", file_path, "is:", chunk_size)

    data_raw = h2o.import_file(path=file_paths[1], parse=False)
    setup = h2o.parse_setup(data_raw)
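
The test above relies on h2o.parse_setup() returning a guess object whose fields can be read before anything is parsed. A minimal sketch of that inspection step (the file path is hypothetical):

import h2o

h2o.init()
raw = h2o.import_file("some_file.csv", parse=False)   # import without parsing
setup = h2o.parse_setup(raw)                          # H2O's parse guess, a dict-like response
print(setup["number_columns"], setup["chunk_size"])   # the fields the test above checks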
Example #2
def import_svmlight(path, headers=""):
    raw = h2o.lazy_import(path)
    if settings.debug and len(headers) < 100:
        print utils.time() + "import with headers: " + str(headers)
    # parsesetup = h2o.parse_setup(raw, column_names=headers)
    # Issue: H2O 3.8 tests the header length vs. columns, but still imports the "pseudotarget" additionally
    parsesetup = h2o.parse_setup(raw)
    parsesetup['parse_type'] = 'SVMLight'
    loaded_frame = h2o.parse_raw(parsesetup)
    if settings.debug:
        print "......Header length: " + str(len(headers))
        print "......Frame imported: " + str(loaded_frame.ncol)
    if len(headers) > loaded_frame.ncol:
        n = len(headers) - loaded_frame.ncol
        print "Remove last " + str(n) + " header entries"
        del headers[-n:]
    loaded_frame.set_names(headers)  # workaround: set the names now
    print "First column: " + loaded_frame.names[0]  # needed because names are set lazily
    if settings.debug and len(headers) < 100: loaded_frame.head(show=True)
    loaded_frame.pop(0)  # remove the first ('pseudotarget') column
    #if loaded_frame.ncol>len(headers)-1: #workaround: H2O reads info from svmlight into columns -> remove everything that is not in headers
    #    delete = []
    #    for i in xrange(len(headers)-1,loaded_frame.ncol):
    #        delete.append(loaded_frame.names[i])
    #    loaded_frame = remove_vecs(loaded_frame,delete)
    if settings.debug and len(headers) < 100: loaded_frame.head(show=True)
    return loaded_frame
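
Stripped of the debug scaffolding, the essential pattern in this example is lazy import, parse-type override, raw parse. A minimal sketch (the path is hypothetical, and h2o.parse_raw stands in for the wrapped parse call):

raw = h2o.lazy_import("data.svmlight")
setup = h2o.parse_setup(raw)
setup["parse_type"] = "SVMLight"   # force the SVMLight parser, as above
frame = h2o.parse_raw(setup)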
Example #3
    def _handle_raw_fname(self, raw_fname, column_names=None):
        """
        Handle result of upload_file
        :param raw_fname: A raw key
        :return: Part of the H2OFrame constructor.
        """

        # perform the parse setup
        setup = h2o.parse_setup(raw_fname)

        # blocking parse, first line is always a header (since "we" wrote the data out)
        parse = h2o.parse(setup, H2OFrame.py_tmp_key(), first_line_is_header=1)

        # a hack to get the column names correct since "parse" does not provide them
        if column_names and not parse["columnNames"]:
            cols = column_names
        else:
            cols = parse['columnNames']

        # set the rows
        rows = parse['rows']

        # set the vector keys
        veckeys = parse['vecKeys']

        # create a new vec[] array
        self._vecs = H2OVec.new_vecs(zip(cols, veckeys), rows)

        # print some information on the *uploaded* data
        print "Uploaded", raw_fname, "into cluster with", \
            rows, "rows and", len(cols), "cols"
        print
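
This helper patches the column names after parsing; with parse_setup's own column_names argument (see Example #4 below) the names can be supplied up front. A sketch under that assumption, with hypothetical names:

setup = h2o.parse_setup(raw_fname, column_names=["C1", "C2", "C3"])   # hypothetical names
frame = h2o.parse_raw(setup, first_line_is_header=1)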
Example #4
def h2oparse_setup():
    """
    Python API test: h2o.parse_setup(raw_frames, destination_frame=None, header=0, separator=None, column_names=None,
     column_types=None, na_strings=None)
    """
    try:
        col_types = [
            'enum', 'numeric', 'enum', 'enum', 'enum', 'numeric', 'numeric',
            'numeric'
        ]
        col_headers = [
            "CAPSULE", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"
        ]
        hex_key = "training_data.hex"

        fraw = h2o.import_file(
            pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"),
            parse=False)
        setup = h2o.parse_setup(fraw,
                                destination_frame=hex_key,
                                header=1,
                                separator=',',
                                column_names=col_headers,
                                column_types=col_types,
                                na_strings=["NA"])
        assert_is_type(setup, H2OResponse)
        assert setup["number_columns"] == len(
            col_headers), "h2o.parse_setup() command is not working."
    except Exception as e:
        assert False, "h2o.parse_setup() command is not working: " + str(e)
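
parse_setup() only builds the parse configuration; to materialize a frame, the setup is handed to h2o.parse_raw(), as later examples do. A sketch continuing from the setup above:

frame = h2o.parse_raw(setup, id=hex_key, first_line_is_header=1)
assert frame.ncol == len(col_headers)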
Example #5
def parse_false():

    fraw = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"), parse=False)
    assert isinstance(fraw, list)

    fhex = h2o.parse_raw(h2o.parse_setup(fraw))
    fhex.summary()
    assert fhex.__class__.__name__ == "H2OFrame"
Example #6
def parse_false():

    fraw = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
                           parse=False)
    assert isinstance(fraw, list)

    fhex = h2o.parse_raw(h2o.parse_setup(fraw))
    fhex.summary()
    assert fhex.__class__.__name__ == "H2OFrame"
Example #7
    def get_data(self, src_bucket="cargo.ml.training", obj_name="training_sample.csv"):
        # boto3.setup_default_session(region_name='us-west-2')
        # s3_client = boto3.client('s3', aws_access_key_id=ACCESS_KID, aws_secret_access_key=ACCESS_KEY)
        input_path = os.path.join(INPUT_PATH, 'training_sample_input/training_sample.csv')
        # s3_client.download_file(src_bucket, obj_name, input_path)

        df_raw = h2o.import_file(input_path, parse=False)
        setup = h2o.parse_setup(df_raw,
                                destination_frame="training.hex",
                                header=1,
                                column_names=self.col_headers,
                                column_types=self.col_types)
        # parse the raw data using the configured setup
        df = h2o.parse_raw(setup,
                           id='training.csv',
                           first_line_is_header=1)

        print("Input dataframe: ", df)
        return df
Example #8
 def fromRawText(text_key, check_header=None):
   """
   Handle result of upload_file
   :param text_key: A key pointing to raw text to be parsed
   :return: Part of the H2OFrame constructor.
   """
   setup = h2o.parse_setup(text_key)
   if check_header is not None: setup["check_header"] = check_header
   res = H2OFrame._parse_raw(setup)
   print "Uploaded {} into cluster with {:,} rows and {:,} cols".format(text_key, res.nrow, res.ncol)
   return res
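
The check_header override steers header detection. A minimal sketch, assuming H2O's usual ParseSetup convention (1 forces a header row, -1 forces none, 0 lets H2O guess):

setup = h2o.parse_setup(text_key)
setup["check_header"] = 1   # 1 = header row, -1 = no header, 0 = guess
frame = h2o.parse_raw(setup)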
Example #9
def h2oparse_raw():
    """
    Python API test: h2o.parse_raw(setup, id=None, first_line_is_header=0)

    copied from pyunit_hexdev_29_parse_false.py
    """
    fraw = h2o.import_file(pyunit_utils.locate("smalldata/jira/hexdev_29.csv"), parse=False)
    assert isinstance(fraw, list)

    fhex = h2o.parse_raw(h2o.parse_setup(fraw), id='hexdev_29.hex', first_line_is_header=0)
    fhex.summary()
    assert_is_type(fhex, H2OFrame)
Example #10
def hexdev_394():
  path = tests.locate("smalldata/covtype/covtype.20k.data")
  trainraw = h2o.lazy_import(path)
  tsetup = h2o.parse_setup(trainraw)
  tsetup["column_types"][10] = "ENUM"
  tsetup["column_types"][11] = "ENUM"
  tsetup["column_types"][12] = "ENUM"
  train = h2o.parse_raw(tsetup)
  
  cols = train.col_names  # this previously returned a space for the first column name
  x_cols = [colname for colname in cols if colname != "C55"]
  x_cols  # no-op; evaluates the list without using it
  
  
  splits = train.split_frame()
  newtrain = splits[0]
  newvalid = splits[1]
  newtrain_x = newtrain[x_cols]
  newtrain_y = newtrain[54].asfactor()
  newvalid_x = newvalid[x_cols]
  newvalid_y = newvalid[54].asfactor()
  
  
  my_gbm = h2o.gbm(y=newtrain_y,
                   validation_y=newvalid_y,
                   x=newtrain_x,
                   validation_x=newvalid_x,
                   distribution="multinomial",
                   ntrees=100,
                   learn_rate=0.1,
                   max_depth=6)
  
  split1, split2 = train.split_frame()
  
  newtrain_x = split1[x_cols]
  newtrain_y = split1[54].asfactor()
  newvalid_x = split2[x_cols]
  newvalid_y = split2[54].asfactor()
  
  my_gbm = h2o.gbm(y=newtrain_y,
                   validation_y=newvalid_y,
                   x=newtrain_x,
                   validation_x=newvalid_x,
                   distribution="multinomial",
                   ntrees=100,
                   learn_rate=0.1,
                   max_depth=6) 

  print "KEEPING FRAME???"
  print train._keep
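
h2o.gbm() is the pre-3.x functional API. A rough modern equivalent with H2OGradientBoostingEstimator, sketched under the assumption that the train frame and x_cols from above are available:

from h2o.estimators.gbm import H2OGradientBoostingEstimator

new_train, new_valid = train.split_frame()
new_train["C55"] = new_train["C55"].asfactor()
new_valid["C55"] = new_valid["C55"].asfactor()
gbm = H2OGradientBoostingEstimator(distribution="multinomial",
                                   ntrees=100,
                                   learn_rate=0.1,
                                   max_depth=6)
gbm.train(x=x_cols, y="C55", training_frame=new_train, validation_frame=new_valid)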
Example #11
def continuous_or_categorical(ip, port):
    fraw = h2o.import_file(h2o.locate("smalldata/jira/hexdev_29.csv"), parse=False)
    fsetup = h2o.parse_setup(fraw)
    fsetup["column_types"][0] = "ENUM"
    fsetup["column_types"][1] = "ENUM"
    fsetup["column_types"][2] = "ENUM"

    df_hex = h2o.parse_raw(fsetup)

    df_hex.summary()

    assert df_hex["h1"].isfactor()
    assert df_hex["h2"].isfactor()
    assert df_hex["h3"].isfactor()
Example #12
def continuous_or_categorical():
    fraw = h2o.lazy_import(tests.locate("smalldata/jira/hexdev_29.csv"))
    fsetup = h2o.parse_setup(fraw)
    fsetup["column_types"][0] = "ENUM"
    fsetup["column_types"][1] = "ENUM"
    fsetup["column_types"][2] = "ENUM"

    df_hex = h2o.parse_raw(fsetup)

    df_hex.summary()

    assert df_hex['h1'].isfactor()
    assert df_hex['h2'].isfactor()
    assert df_hex['h3'].isfactor()
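
The same result can be had by declaring the types at import time instead of mutating the setup dict, assuming import_file's col_types parameter accepts a name-to-type mapping:

fhex = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"),
                       col_types={"h1": "enum", "h2": "enum", "h3": "enum"})
assert fhex["h1"].isfactor()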
Example #13
def h2oparse_setup():
    """
    Python API test: h2o.parse_setup(raw_frames, destination_frame=None, header=0, separator=None, column_names=None,
     column_types=None, na_strings=None)
    """
    col_types = ['enum', 'numeric', 'enum', 'enum', 'enum', 'numeric', 'numeric', 'numeric']
    col_headers = ["CAPSULE", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
    hex_key = "training_data.hex"

    fraw = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"), parse=False)
    setup = h2o.parse_setup(fraw, destination_frame=hex_key, header=1, separator=',', column_names=col_headers,
                            column_types=col_types, na_strings=["NA"])
    assert_is_type(setup, H2OResponse)
    assert setup["number_columns"]==len(col_headers), "h2o.parse_setup() command is not working."
Example #14
def continuous_or_categorical():
  fraw = h2o.lazy_import(h2o.locate("smalldata/jira/hexdev_29.csv"))
  fsetup = h2o.parse_setup(fraw)
  fsetup["column_types"][0] = "ENUM"
  fsetup["column_types"][1] = "ENUM"
  fsetup["column_types"][2] = "ENUM"

  df_hex = h2o.parse_raw(fsetup)

  df_hex.summary()

  assert df_hex['h1'].isfactor()
  assert df_hex['h2'].isfactor()
  assert df_hex['h3'].isfactor()
Example #15
def pubdev_6339():

    cluster = h2o.cluster()
    # number of nodes
    cloud_size = cluster.cloud_size
    # number of CPUs
    cores = sum(node["num_cpus"] for node in cluster.nodes)

    # path to file
    file_paths = [
        pyunit_utils.locate("smalldata/arcene/arcene_train.data"),
        pyunit_utils.locate("smalldata/census_income/adult_data.csv"),
        pyunit_utils.locate("smalldata/chicago/chicagoAllWeather.csv"),
        pyunit_utils.locate("smalldata/gbm_test/alphabet_cattest.csv"),
        pyunit_utils.locate(
            "smalldata/wa_cannabis/raw/Dashboard_Usable_Sales_w_Weight_Daily.csv"
        )
    ]

    for file_path in file_paths:
        # read data and parse setup to get number of columns
        data_raw = h2o.import_file(path=file_path, parse=False)
        setup = h2o.parse_setup(data_raw)

        # get number of columns from setup
        num_cols = setup['number_columns']
        # get the chunk size
        chunk_size = calculate_chunk_size(file_path, num_cols, cores,
                                          cloud_size)

        # get chunk size to compare if calculation is correct
        result_size = setup['chunk_size']
        assert chunk_size == result_size, "Calculated chunk size is incorrect!"
        print("chunk size for file", file_path, "is:", chunk_size)

    data_raw = h2o.import_file(path=file_paths[1], parse=False)
    setup = h2o.parse_setup(data_raw)
Example #16
def hexdev_394():
    path = tests.locate("smalldata/covtype/covtype.20k.data")
    trainraw = h2o.lazy_import(path)
    tsetup = h2o.parse_setup(trainraw)
    tsetup["column_types"][10] = "ENUM"
    tsetup["column_types"][11] = "ENUM"
    tsetup["column_types"][12] = "ENUM"
    train = h2o.parse_raw(tsetup)

    cols = train.col_names  # this previously returned a space for the first column name
    x_cols = [colname for colname in cols if colname != "C55"]
    x_cols  # no-op; evaluates the list without using it

    splits = train.split_frame()
    newtrain = splits[0]
    newvalid = splits[1]
    newtrain_x = newtrain[x_cols]
    newtrain_y = newtrain[54].asfactor()
    newvalid_x = newvalid[x_cols]
    newvalid_y = newvalid[54].asfactor()

    my_gbm = h2o.gbm(y=newtrain_y,
                     validation_y=newvalid_y,
                     x=newtrain_x,
                     validation_x=newvalid_x,
                     distribution="multinomial",
                     ntrees=100,
                     learn_rate=0.1,
                     max_depth=6)

    split1, split2 = train.split_frame()

    newtrain_x = split1[x_cols]
    newtrain_y = split1[54].asfactor()
    newvalid_x = split2[x_cols]
    newvalid_y = split2[54].asfactor()

    my_gbm = h2o.gbm(y=newtrain_y,
                     validation_y=newvalid_y,
                     x=newtrain_x,
                     validation_x=newvalid_x,
                     distribution="multinomial",
                     ntrees=100,
                     learn_rate=0.1,
                     max_depth=6)

    print "KEEPING FRAME???"
    print train._keep
Example #17
 def _handle_text_key(self, text_key, column_names):
   """
   Handle result of upload_file
   :param test_key: A key pointing to raw text to be parsed
   :return: Part of the H2OFrame constructor.
   """
   # perform the parse setup
   setup = h2o.parse_setup(text_key)
   # blocking parse, first line is always a header (since "we" wrote the data out)
   parse = h2o.parse(setup, H2OFrame.py_tmp_key(), first_line_is_header=1)
   # a hack to get the column names correct since "parse" does not provide them
   cols = parse['column_names'] if parse["column_names"] else ["C" + str(x) for x in range(1,len(parse['vec_ids'])+1)]
   # set the rows
   rows = parse['rows']
   # set the vector keys
   veckeys = parse['vec_ids']
   # create a new vec[] array
   self._vecs = H2OVec.new_vecs(zip(cols, veckeys), rows)
   # print some information on the *uploaded* data
   print "Uploaded", text_key, "into cluster with", rows, "rows and", len(cols), "cols"
Example #18
 def _parse(rawkey, destination_frame="", header=None, separator=None, column_names=None, column_types=None, na_strings=None):
   setup = h2o.parse_setup(rawkey, destination_frame, header, separator, column_names, column_types, na_strings)
   return H2OFrame._parse_raw(setup)
Example #19
  def __init__(self, python_obj=None, local_fname=None, remote_fname=None, vecs=None, text_key=None):
    """
    Create a new H2OFrame object by passing a file path or a list of H2OVecs.

    If `remote_fname` is not None, then a REST call will be made to import the
    data specified at the location `remote_fname`. This path is relative to the
    H2O cluster, NOT the local Python process.

    If `local_fname` is not None, then the data is not imported into the H2O cluster
    at the time of object creation; it is read locally into the python process.

    If `python_obj` is not None, then an attempt is made to upload the python object
    to H2O. Valid python objects are of type `list`, `dict`, or `tuple`.

    For more information on the structure of the input for the various native python
    data types ("native" meaning non-H2O), please see the general documentation for
    this object.

    :param python_obj: A "native" python object - list, dict, tuple.
    :param local_fname: A local path to a data source. Data is python-process-local.
    :param remote_fname: A remote path to a data source. Data is cluster-local.
    :param vecs: A list of H2OVec objects.
    :param text_key: A raw key resulting from an upload_file.
    :return: An instance of an H2OFrame object.
    """
    self.local_fname = local_fname
    self.remote_fname = remote_fname
    self._vecs = None

    if python_obj is not None:  # explicit None check avoids "truth value of an array is ambiguous" errors
      self._upload_python_object(python_obj)
      return

    # Import the data into H2O cluster
    if remote_fname:
      rawkey = h2o.import_file(remote_fname)
      setup = h2o.parse_setup(rawkey)
      parse = h2o.parse(setup, H2OFrame.py_tmp_key())  # create a new key
      veckeys = parse['vec_ids']
      rows = parse['rows']
      cols = parse['column_names'] if parse["column_names"] else ["C" + str(x) for x in range(1,len(veckeys)+1)]
      self._vecs = H2OVec.new_vecs(zip(cols, veckeys), rows)
      print "Imported", remote_fname, "into cluster with", rows, "rows and", len(cols), "cols"

    # Read data locally into python process
    elif local_fname:
      with open(local_fname, 'rb') as csvfile:
        self._vecs = []
        for name in csvfile.readline().split(','):
          self._vecs.append(H2OVec(name.rstrip(), Expr([])))
        for row in csv.reader(csvfile):
          for i, data in enumerate(row):
            self._vecs[i].append(data)
      print "Imported", local_fname, "into local python process"

    # Construct from an array of Vecs already passed in
    elif vecs:
      vlen = len(vecs[0])
      for v in vecs:
        if not isinstance(v, H2OVec):
          raise ValueError("Not a list of Vecs")
        if len(v) != vlen:
          raise ValueError("Vecs not the same size: " + str(vlen) + " != " + str(len(v)))
      self._vecs = vecs

    elif text_key:
      self._handle_text_key(text_key, None)

    else:
      raise ValueError("Frame made from CSV file or an array of Vecs only")