Example #1
0
def import_svmlight(path, headers=""):
    """Import an SVMLight file into an H2O frame and name its columns.

    The file is lazily imported, parsed with the parse type forced to
    ``'SVMLight'``, renamed with ``headers`` and returned without its first
    ('pseudotarget') column.

    :param path: location of the SVMLight file, handed to ``h2o.lazy_import``.
    :param headers: column names for the parsed frame. Trailing entries
        beyond the frame's column count are ignored; the caller's sequence
        is never modified.
    :return: the parsed frame with its first column removed.
    """
    raw = h2o.lazy_import(path)
    if settings.debug and len(headers) < 100:
        print(utils.time() + "import with headers: " + str(headers))
    # Column names are deliberately NOT passed to parse_setup: H2O 3.8 tests
    # the header length against the column count, but still imports the
    # "pseudotarget" column additionally.
    parsesetup = h2o.parse_setup(raw)
    parsesetup['parse_type'] = 'SVMLight'
    loaded_frame = h2o.parse_raw(parsesetup)
    if settings.debug:
        print("......HEader length: " + str(len(headers)))
        print("......Frame imported: " + str(loaded_frame.ncol))
    surplus = len(headers) - loaded_frame.ncol
    if surplus > 0:
        print("Remove last " + str(surplus) + " header entries")
        # Slice into a new object rather than deleting in place, so the
        # caller's list stays intact and non-list sequences work as well.
        headers = headers[:-surplus]
    loaded_frame.set_names(headers)  # Workaround: set the names now
    # Reading a name back is needed because names are set lazily.
    print("First column: " + loaded_frame.names[0])
    if settings.debug and len(headers) < 100:
        loaded_frame.head(show=True)
    loaded_frame.pop(0)  # remove first ('pseudotarget') column
    # An earlier workaround that dropped every column not listed in headers
    # (H2O reads extra info from the SVMLight file into columns) is disabled.
    if settings.debug and len(headers) < 100:
        loaded_frame.head(show=True)
    return loaded_frame
Example #2
0
def h2olazy_import():
    """
    Python API test: h2o.lazy_import(path)

    Checks that the value handed back for the prostate CSV is a list.
    """
    csv_path = pyunit_utils.locate("smalldata/prostate/prostate_cat.csv")
    imported = h2o.lazy_import(csv_path)
    assert_is_type(imported, list)
Example #3
0
    def __init__(self, mojo_path=None):
        """
        Create a new H2OMojoPipeline object.

        :param mojo_path: path to a MOJO file.
        """
        # NOTE(review): the default is None, yet the value is type-checked
        # against str right away - presumably callers must always pass a
        # path; confirm.
        assert_is_type(mojo_path, str)

        imported = h2o.lazy_import(mojo_path)
        self.pipeline_id = imported
Example #4
0
def h2olazy_import():
    """
    Python API test: h2o.lazy_import(path)

    Passes when the import call completes without raising. Any exception is
    reported as a test failure that includes the underlying cause.
    """
    try:
        h2o.lazy_import(
            pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    except Exception as e:
        # Raise explicitly: `assert False` is stripped under `python -O`,
        # which would let a broken import pass silently, and the original
        # message discarded the actual error.
        raise AssertionError(
            "h2o.lazy_import() command is not working. Cause: {0!r}".format(e))
Example #5
0
def continuous_or_categorical():
    """Force the first three columns to ENUM and check they parse as factors.

    Parses smalldata/jira/hexdev_29.csv with the parse setup overridden for
    columns 0-2, prints the frame summary and asserts that columns 'h1',
    'h2' and 'h3' are factors.
    """
    csv_path = tests.locate("smalldata/jira/hexdev_29.csv")
    setup = h2o.parse_setup(h2o.lazy_import(csv_path))
    for idx in range(3):
        setup["column_types"][idx] = "ENUM"

    frame = h2o.parse_raw(setup)

    frame.summary()

    for name in ("h1", "h2", "h3"):
        assert frame[name].isfactor()
Example #6
0
    def from_file(file=str):
        """
        Build a Generic model from an existing embedded model file, e.g. an H2O MOJO.
        The imported model must be supported by H2O.

        :param file: A string containing path to the file to create the model from
        :return: H2OGenericEstimator instance representing the generic model
        """
        # NOTE(review): the default value of `file` is the builtin `str` type
        # itself; this looks like an intended annotation - confirm.
        from h2o import get_frame, lazy_import

        imported_keys = lazy_import(file)
        # Only the first imported key is used to fetch the frame.
        mojo_frame = get_frame(imported_keys[0])
        generic_model = H2OGenericEstimator(model_key=mojo_frame)
        generic_model.train()

        return generic_model
Example #7
0
def hexdev_394():
  """Train two multinomial GBMs on splits of covtype and print train._keep.

  The covtype file is parsed with columns 10-12 forced to ENUM. Predictors
  are all columns except "C55"; the response is column index 54 converted
  to a factor. The frame is split twice and a GBM is trained on each pair
  of splits, after which the frame's ``_keep`` attribute is printed.
  """
  path = tests.locate("smalldata/covtype/covtype.20k.data")
  trainraw = h2o.lazy_import(path)
  tsetup = h2o.parse_setup(trainraw)
  tsetup["column_types"][10] = "ENUM"
  tsetup["column_types"][11] = "ENUM"
  tsetup["column_types"][12] = "ENUM"
  train = h2o.parse_raw(tsetup)

  cols = train.col_names  # This returned space for first column name
  x_cols = [colname for colname in cols if colname != "C55"]

  # First training: the splits are addressed by index.
  splits = train.split_frame()
  newtrain = splits[0]
  newvalid = splits[1]
  newtrain_x = newtrain[x_cols]
  newtrain_y = newtrain[54].asfactor()
  newvalid_x = newvalid[x_cols]
  newvalid_y = newvalid[54].asfactor()

  my_gbm = h2o.gbm(y=newtrain_y,
                   validation_y=newvalid_y,
                   x=newtrain_x,
                   validation_x=newvalid_x,
                   distribution="multinomial",
                   ntrees=100,
                   learn_rate=0.1,
                   max_depth=6)

  # Second training: the splits are tuple-unpacked instead.
  # NOTE(review): the two runs differ only in how the splits are obtained;
  # presumably deliberate - confirm before merging them.
  split1, split2 = train.split_frame()

  newtrain_x = split1[x_cols]
  newtrain_y = split1[54].asfactor()
  newvalid_x = split2[x_cols]
  newvalid_y = split2[54].asfactor()

  my_gbm = h2o.gbm(y=newtrain_y,
                   validation_y=newvalid_y,
                   x=newtrain_x,
                   validation_x=newvalid_x,
                   distribution="multinomial",
                   ntrees=100,
                   learn_rate=0.1,
                   max_depth=6)

  # Single-argument print calls behave the same on Python 2 and Python 3.
  print("KEEPING FRAME???")
  print(train._keep)
def continuous_or_categorical():
    """Force the first three columns to ENUM and check they parse as factors.

    Parses smalldata/jira/hexdev_29.csv with the parse setup overridden for
    columns 0-2, prints the frame summary and asserts that columns 'h1',
    'h2' and 'h3' are factors.
    """
    csv_path = h2o.locate("smalldata/jira/hexdev_29.csv")
    setup = h2o.parse_setup(h2o.lazy_import(csv_path))
    for idx in range(3):
        setup["column_types"][idx] = "ENUM"

    frame = h2o.parse_raw(setup)

    frame.summary()

    for name in ("h1", "h2", "h3"):
        assert frame[name].isfactor()
Example #9
0
 def read_csv(file_path, destination_frame, header=(-1,0,1), separator="", column_names=None, column_types=None, na_strings=None):
   """
   Build an H2OFrame from parsing a CSV at file_path.  This path is relative to
   the H2O cluster, NOT the local Python process

   :param file_path:  A remote path to a data source.  Data is cluster-local.
     When it is not a ``str`` it is enumerated as a collection of files.
   :param destination_frame:  The result *Key* name in the H2O cluster
   :param header: forwarded unchanged to ``H2OFrame._parse``.
   :param separator: forwarded unchanged to ``H2OFrame._parse``.
   :param column_names: forwarded unchanged to ``H2OFrame._parse``.
   :param column_types: forwarded unchanged to ``H2OFrame._parse``.
   :param na_strings: forwarded unchanged to ``H2OFrame._parse``.
   :return: the result of ``H2OFrame._parse``.
   """
   rawkey = h2o.lazy_import(file_path)
   res = H2OFrame._parse(rawkey, destination_frame, header, separator, column_names, column_types, na_strings)
   # Row and column counts are rendered with thousands separators.
   summary = "Parsed {:,} rows and {:,} cols".format(res.nrow, res.ncol)
   if isinstance(file_path, str):
     # Single-argument print call: identical output on Python 2 and 3.
     print("Imported {}. ".format(file_path) + summary)
   else:
     # Several files: show a numbered file table with the summary attached.
     h2o.H2ODisplay([["File"+str(i+1),f] for i,f in enumerate(file_path)],None, summary)
   return res
def test(x, y, output_test, strip_part, algo_name, generic_algo_name):
    """Round-trip a deep learning model through MOJO export and generic import.

    Trains a one-epoch deep learning model on the airlines data, downloads
    its MOJO, loads it back as a generic model (via a lazily imported frame
    and via ``H2OGenericEstimator.from_file``), and checks parameters,
    printed output, predictions and the model summary. Finally the generic
    model is re-exported and its file size compared with the original MOJO.

    :param x: predictor columns used for training.
    :param y: response column used for training.
    :param output_test: callable comparing the captured outputs.
    :param strip_part: forwarded to ``output_test``.
    :param algo_name: forwarded to ``output_test``.
    :param generic_algo_name: forwarded to ``output_test``.
    """
    airlines_path = pyunit_utils.locate("smalldata/testng/airlines_train.csv")
    airlines = h2o.import_file(path=airlines_path)

    def score_and_check(candidate):
        # Score the airlines frame and make sure a model summary is present.
        scored = candidate.predict(airlines)
        assert scored is not None
        assert scored.nrows == 24421
        summary = candidate._model_json["output"]["model_summary"]
        assert summary is not None
        assert len(summary._cell_values) > 0
        return scored

    deep_model = H2ODeepLearningEstimator(epochs=1)
    deep_model.train(x=x, y=y, training_frame=airlines,
                     validation_frame=airlines)
    print(deep_model)
    with Capturing() as original_output:
        deep_model.show()

    # download_mojo returns the path of the file written into the temp dir.
    mojo_path = deep_model.download_mojo(tempfile.mkdtemp())

    imported_keys = h2o.lazy_import(mojo_path)
    mojo_frame = h2o.get_frame(imported_keys[0])
    generic_model = H2OGenericEstimator(model_key=mojo_frame)
    generic_model.train()
    compare_params(deep_model, generic_model)
    print(generic_model)
    with Capturing() as generic_output:
        generic_model.show()

    output_test(str(original_output), str(generic_output), strip_part,
                algo_name, generic_algo_name)

    predictions = score_and_check(generic_model)

    # Test constructor generating the model from existing MOJO file
    model_from_file = H2OGenericEstimator.from_file(mojo_path)
    assert model_from_file is not None
    predictions = score_and_check(model_from_file)

    export_dir = tempfile.mkdtemp("zip", "genericMojo")
    exported_path = model_from_file.download_mojo(path=export_dir)
    assert os.path.getsize(exported_path) == os.path.getsize(mojo_path)
Example #11
0
def stackedensemble_mojo_model_test():
    """Round-trip a stacked ensemble through MOJO export and generic import.

    Trains a GBM and a random forest with 2-fold cross-validation on the
    iris training data, stacks them, downloads the ensemble's MOJO and loads
    it back as a generic model (via a lazily imported frame and via
    ``H2OGenericEstimator.from_file``). Both generic models must produce
    predictions, and the re-exported MOJO must match the original in size.
    """
    iris_train = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_train.csv"))
    iris_test = h2o.import_file(
        pyunit_utils.locate("smalldata/iris/iris_test.csv"))
    predictors = iris_train.columns
    response = "species"

    # Both base models share the same cross-validation configuration.
    cv_settings = dict(nfolds=2,
                       fold_assignment="Modulo",
                       keep_cross_validation_predictions=True)
    gbm = H2OGradientBoostingEstimator(**cv_settings)
    gbm.train(x=predictors, y=response, training_frame=iris_train)
    rf = H2ORandomForestEstimator(**cv_settings)
    rf.train(x=predictors, y=response, training_frame=iris_train)

    ensemble = H2OStackedEnsembleEstimator(
        training_frame=iris_train,
        validation_frame=iris_test,
        base_models=[gbm.model_id, rf.model_id])
    ensemble.train(x=predictors, y=response, training_frame=iris_train)
    print(ensemble)
    with Capturing() as original_output:
        ensemble.show()

    # download_mojo returns the path of the file written into the temp dir.
    mojo_path = ensemble.download_mojo(tempfile.mkdtemp())

    imported_keys = h2o.lazy_import(mojo_path)
    mojo_frame = h2o.get_frame(imported_keys[0])
    generic_model = H2OGenericEstimator(model_key=mojo_frame)
    generic_model.train()
    compare_params(ensemble, generic_model)

    predictions = generic_model.predict(iris_test)
    assert predictions is not None

    # Test constructor generating the model from existing MOJO file
    model_from_file = H2OGenericEstimator.from_file(mojo_path)
    assert model_from_file is not None
    predictions = model_from_file.predict(iris_test)
    assert predictions is not None

    export_dir = tempfile.mkdtemp("zip", "genericMojo")
    exported_path = model_from_file.download_mojo(path=export_dir)
    assert os.path.getsize(exported_path) == os.path.getsize(mojo_path)
def test(x, y, output_test, strip_part, algo_name, generic_algo_name):
    """Round-trip an XGBoost model through MOJO export and generic import.

    Trains a one-tree XGBoost model with 3-fold cross-validation on the
    airlines data, downloads its MOJO, loads it back as a generic model
    (via a lazily imported frame and via ``H2OGenericEstimator.from_file``),
    and checks printed output, predictions, variable importances and the
    model summary. Finally the generic model is re-exported and its file
    size compared with the original MOJO.

    :param x: predictor columns used for training.
    :param y: response column used for training.
    :param output_test: callable comparing the captured outputs.
    :param strip_part: forwarded to ``output_test``.
    :param algo_name: forwarded to ``output_test``.
    :param generic_algo_name: forwarded to ``output_test``.
    """
    airlines_path = pyunit_utils.locate("smalldata/testng/airlines_train.csv")
    airlines = h2o.import_file(path=airlines_path)

    def score_and_check(candidate):
        # Score the airlines frame, then make sure both the variable
        # importances and the model summary tables are populated.
        scored = candidate.predict(airlines)
        assert scored is not None
        assert scored.nrows == 24421
        output = candidate._model_json["output"]
        assert output["variable_importances"] is not None
        assert len(output["variable_importances"]._cell_values) > 0
        assert output["model_summary"] is not None
        assert len(output["model_summary"]._cell_values) > 0
        return scored

    booster = H2OXGBoostEstimator(ntrees=1, nfolds=3)
    booster.train(x=x, y=y, training_frame=airlines,
                  validation_frame=airlines)
    print(booster)
    with Capturing() as original_output:
        booster.show()

    # download_mojo returns the path of the file written into the temp dir.
    mojo_path = booster.download_mojo(tempfile.mkdtemp())

    imported_keys = h2o.lazy_import(mojo_path)
    mojo_frame = h2o.get_frame(imported_keys[0])
    model = H2OGenericEstimator(model_key=mojo_frame)
    model.train()
    print(model)
    with Capturing() as generic_output:
        model.show()

    output_test(str(original_output), str(generic_output), strip_part,
                algo_name, generic_algo_name)

    predictions = score_and_check(model)

    # Test constructor generating the model from existing MOJO file
    model = H2OGenericEstimator.from_file(mojo_path)
    assert model is not None
    predictions = score_and_check(model)

    export_dir = tempfile.mkdtemp("zip", "genericMojo")
    exported_path = model.download_mojo(path=export_dir)
    assert os.path.getsize(exported_path) == os.path.getsize(mojo_path)
Example #13
0
def hexdev_394():
    """Train two multinomial GBMs on splits of covtype and print train._keep.

    The covtype file is parsed with columns 10-12 forced to ENUM. Predictors
    are all columns except "C55"; the response is column index 54 converted
    to a factor. The frame is split twice and a GBM is trained on each pair
    of splits, after which the frame's ``_keep`` attribute is printed.
    """
    path = tests.locate("smalldata/covtype/covtype.20k.data")
    trainraw = h2o.lazy_import(path)
    tsetup = h2o.parse_setup(trainraw)
    tsetup["column_types"][10] = "ENUM"
    tsetup["column_types"][11] = "ENUM"
    tsetup["column_types"][12] = "ENUM"
    train = h2o.parse_raw(tsetup)

    cols = train.col_names  # This returned space for first column name
    x_cols = [colname for colname in cols if colname != "C55"]

    # First training: the splits are addressed by index.
    splits = train.split_frame()
    newtrain = splits[0]
    newvalid = splits[1]
    newtrain_x = newtrain[x_cols]
    newtrain_y = newtrain[54].asfactor()
    newvalid_x = newvalid[x_cols]
    newvalid_y = newvalid[54].asfactor()

    my_gbm = h2o.gbm(y=newtrain_y,
                     validation_y=newvalid_y,
                     x=newtrain_x,
                     validation_x=newvalid_x,
                     distribution="multinomial",
                     ntrees=100,
                     learn_rate=0.1,
                     max_depth=6)

    # Second training: the splits are tuple-unpacked instead.
    # NOTE(review): the two runs differ only in how the splits are obtained;
    # presumably deliberate - confirm before merging them.
    split1, split2 = train.split_frame()

    newtrain_x = split1[x_cols]
    newtrain_y = split1[54].asfactor()
    newvalid_x = split2[x_cols]
    newvalid_y = split2[54].asfactor()

    my_gbm = h2o.gbm(y=newtrain_y,
                     validation_y=newvalid_y,
                     x=newtrain_x,
                     validation_x=newvalid_x,
                     distribution="multinomial",
                     ntrees=100,
                     learn_rate=0.1,
                     max_depth=6)

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("KEEPING FRAME???")
    print(train._keep)
def mojo_model_test():
    """Round-trip a GBM through MOJO export and generic import.

    Trains a one-tree GBM on the airlines data, downloads its MOJO and loads
    it back as a generic model (via a lazily imported frame and via
    ``H2OGenericEstimator.from_file``). Each generic model must score the
    airlines frame and expose variable importances and a model summary. The
    re-exported MOJO must match the original file in size.
    """
    # GBM
    airlines_path = pyunit_utils.locate("smalldata/testng/airlines_train.csv")
    airlines = h2o.import_file(path=airlines_path)

    def score_and_check(candidate):
        # Score the airlines frame, then make sure both the variable
        # importances and the model summary tables are populated.
        scored = candidate.predict(airlines)
        assert scored is not None
        assert scored.nrows == 24421
        output = candidate._model_json["output"]
        assert output["variable_importances"] is not None
        assert len(output["variable_importances"]._cell_values) > 0
        assert output["model_summary"] is not None
        assert len(output["model_summary"]._cell_values) > 0
        return scored

    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    # download_mojo returns the path of the file written into the temp dir.
    mojo_path = gbm.download_mojo(tempfile.mkdtemp())

    imported_keys = h2o.lazy_import(mojo_path)
    mojo_frame = h2o.get_frame(imported_keys[0])
    model = H2OGenericEstimator(model_key=mojo_frame)
    model.train()
    predictions = score_and_check(model)

    # Test constructor generating the model from existing MOJO file
    model = H2OGenericEstimator.from_file(mojo_path)
    assert model is not None
    predictions = score_and_check(model)

    export_dir = tempfile.mkdtemp("zip", "genericMojo")
    exported_path = model.download_mojo(path=export_dir)
    assert os.path.getsize(exported_path) == os.path.getsize(mojo_path)
def mojo_model_test():
    """Round-trip a GBM through MOJO export and generic import.

    Trains a one-tree GBM on the airlines data, downloads its MOJO and loads
    it back as a generic model (via a lazily imported frame and via
    ``H2OGenericEstimator.from_file``). Each generic model must score the
    airlines frame and expose variable importances and a model summary. The
    re-exported MOJO must match the original file in size.
    """
    # GBM
    airlines_path = pyunit_utils.locate("smalldata/testng/airlines_train.csv")
    airlines = h2o.import_file(path=airlines_path)

    def score_and_check(candidate):
        # Score the airlines frame, then make sure both the variable
        # importances and the model summary tables are populated.
        scored = candidate.predict(airlines)
        assert scored is not None
        assert scored.nrows == 24421
        output = candidate._model_json["output"]
        assert output["variable_importances"] is not None
        assert len(output["variable_importances"]._cell_values) > 0
        assert output["model_summary"] is not None
        assert len(output["model_summary"]._cell_values) > 0
        return scored

    gbm = H2OGradientBoostingEstimator(ntrees=1)
    gbm.train(x=["Origin", "Dest"], y="IsDepDelayed", training_frame=airlines)

    # download_mojo returns the path of the file written into the temp dir.
    mojo_path = gbm.download_mojo(tempfile.mkdtemp())

    imported_keys = h2o.lazy_import(mojo_path)
    mojo_frame = h2o.get_frame(imported_keys[0])
    model = H2OGenericEstimator(model_key=mojo_frame)
    model.train()
    predictions = score_and_check(model)

    # Test constructor generating the model from existing MOJO file
    model = H2OGenericEstimator.from_file(mojo_path)
    assert model is not None
    predictions = score_and_check(model)

    export_dir = tempfile.mkdtemp("zip", "genericMojo")
    exported_path = model.download_mojo(path=export_dir)
    assert os.path.getsize(exported_path) == os.path.getsize(mojo_path)
Example #16
0
 def _import_parse(path, destination_frame, header, sep, column_names, column_types, na_strings):
   """
   Lazily import ``path`` and hand the raw key(s) to ``H2OFrame._parse``.

   All remaining arguments are forwarded to ``H2OFrame._parse`` unchanged.
   :return: whatever ``H2OFrame._parse`` produces.
   """
   raw_key = h2o.lazy_import(path)
   parsed = H2OFrame._parse(raw_key, destination_frame, header, sep, column_names, column_types, na_strings)
   return parsed
Example #17
0
def h2olazy_import():
    """
    Python API test: h2o.lazy_import(path)

    Checks that the value handed back for the prostate CSV is a list.
    """
    csv_path = pyunit_utils.locate("smalldata/prostate/prostate_cat.csv")
    imported = h2o.lazy_import(csv_path)
    assert_is_type(imported, list)