def test_load_glrm():
  print("Importing iris_wheader.csv data...")
  irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
  irisH2O.describe()

  g_model = H2OGeneralizedLowRankEstimator(k=3)
  g_model.train(x=irisH2O.names, training_frame=irisH2O)
  yarch_old = g_model.archetypes()
  x_old = h2o.get_frame(g_model._model_json["output"]["representation_name"])
  predOld = g_model.predict(irisH2O)
  TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../..", "results"))

  try:
    TMPDIR = pyunit_utils.locate("results")    # find directory path to results folder
  except Exception:
    os.makedirs(TMPDIR)
  h2o.save_model(g_model, path=TMPDIR, force=True)       # save model
  full_path_filename = os.path.join(TMPDIR, g_model._id)

  h2o.remove(g_model)
  model_reloaded = h2o.load_model(full_path_filename)
  pred = model_reloaded.predict(irisH2O)
  yarch = model_reloaded.archetypes()
  x = h2o.get_frame(model_reloaded._model_json["output"]["representation_name"])

  # assert differences between old and new are small; archetypes should be the same
  pyunit_utils.compare_frames_local(x, x_old, tol=1e-6)
  pyunit_utils.compare_frames_local(pred[0], predOld[0], tol=1)
  for k in range(3):
    pyunit_utils.equal_two_arrays(yarch_old[k], yarch[k], eps = 1e-4, tolerance=1e-10)

  print("glrm model successfully loaded...")
def run_comparison_tests(auto_encoder, act_fun, missing_values_handling, set_all_factor, train, test, x):
    # set deeplearning model parameters
    params = set_params(act_fun, missing_values_handling, set_all_factor, auto_encoder) 
    
    if auto_encoder:
        try:
            # build and save mojo model
            deeplearning_model = build_save_model(params, x, train) 
        except Exception as err:
            if not("Trying to predict with an unstable model" in err.args[0]):
                raise Exception('Deeplearning autoencoder model failed to build.  Fix it.')
            return
    else:
        # build and save mojo model
        deeplearning_model = build_save_model(params, x, train) 

    # save test file, h2o predict/mojo use same file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearning_model, TMPDIR, MOJONAME)  
    pred_pojo = pyunit_utils.pojo_predict(deeplearning_model, TMPDIR, MOJONAME)
    # save model for debugging
    h2o.save_model(deeplearning_model, path=TMPDIR, force=True)  
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o, pred_mojo, prob=1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo, pred_pojo, prob=1, tol=1e-10)
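Note: `set_params`, `build_save_model`, and the module-level `TMPDIR`/`MOJONAME` come from elsewhere in the test module. A minimal sketch of what they are assumed to look like (names, defaults, and the response column are illustrative, not the actual pyunit code):

import os
import tempfile
import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

TMPDIR = tempfile.mkdtemp()   # scratch dir shared by the h2o/mojo/pojo predict steps
MOJONAME = "dl_mojo"          # assumed base name for the downloaded mojo

def set_params(act_fun, missing_values_handling, set_all_factor, auto_encoder):
    # Collect the deeplearning parameters exercised by the comparison test.
    return dict(activation=act_fun,
                missing_values_handling=missing_values_handling,
                use_all_factor_levels=set_all_factor,
                autoencoder=auto_encoder,
                hidden=[10, 10], epochs=1, seed=1234)

def build_save_model(params, x, train):
    # Train the model, then download its mojo into TMPDIR for later scoring.
    model = H2ODeepLearningEstimator(**params)
    if params.get("autoencoder"):
        model.train(x=x, training_frame=train)   # autoencoders take no response
    else:
        model.train(x=x, y="response", training_frame=train)
    model.download_mojo(path=TMPDIR)
    return model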
def deeplearning_mojo_pojo():
    h2o.remove_all()

    params = set_params()   # set deeplearning model parameters
    df = random_dataset(PROBLEM)       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    try:
        deeplearningModel = build_save_model(params, x, train) # build and save mojo model
        h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearningModel, TMPDIR, MOJONAME)  # load model and perform predict
        pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
        h2o.save_model(deeplearningModel, path=TMPDIR, force=True)  # save model for debugging
        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_h2o, pred_mojo, 0.1, tol=1e-10)    # compare h2o and mojo predictions
        print("Comparing pojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_mojo, pred_pojo, 0.1, tol=1e-10)
    except Exception as ex:
        print("***************  ERROR and type is ")
        print(str(type(ex)))
        print(ex)
        if "AssertionError" in str(type(ex)):   # only care if there is an AssertionError, ignore the others
            sys.exit(1)
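Here `random_dataset`, the zero-argument `set_params`, and `build_save_model` are again module-level helpers defined outside the snippet; `random_dataset` is presumably a thin wrapper over the pyunit utility of the same name (a sketch, with `PROBLEM` and `NTESTROWS` as placeholder values):

PROBLEM = "binomial"   # placeholder problem type for random_dataset()
NTESTROWS = 200        # rows held out as the mojo/pojo comparison set

def random_dataset(problem_type):
    # Assumed to delegate to the pyunit utility used elsewhere in these tests.
    return pyunit_utils.random_dataset(problem_type, seed=1234)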
def impute_data(method = "mean", 
                to_impute = to_impute,
                predictors = predictors):
  if method == "mean":
    print "Mean imputing missing data for predictors:", to_impute
    # find mean for each time period in data for each predictor, save them in a matrix with a col for the mean values of each predictor
    # then on holdout use this table to fill in all missing values based on the time period (row) and the variable (col) of this matrix
    
    #if using python module h2o-3.1.0.3131: grouped = data.group_by(["time_period"])
    #                         gm = [grouped.mean(predictor, na="rm").get_frame() for predictor in to_impute]
    gm = d["time_period"].unique()
    print "Finding means..."
    for predictor in to_impute:
      gm = gm.cbind(d.group_by(["time_period"], {predictor:["mean", d.names().index(predictor), "rm"]}, order_by = 0))
    gm.show()
    print "Saving the imputation means to disk..."
    h2o.download_csv(gm, filename = saving_means_fp)
    # df_py = h2o.as_list(gm)
    # Now that's stored for the holdout data, do this a faster way in java for the training data:
    for predictor in to_impute:
      d.impute(predictor, method='mean', by = ['time_period'], inplace = True)
      print "Done imputing", predictor
    print "Saving the final mean imputed data to disk..."
    h2o.export_file(frame = d, path =saving_meanImputed_fp, force=True)
  
  if method == "model":
    # sequentially impute 'newdata', not 'data', so the order of the predictor variables in the loop does not matter
    # otherwise, you would be using increasingly imputed data to make predictions as the loop progresses.
    newdata = d
    # With training data, build a model for each col and predict missing data, save the models, use them on the holdout data to predict all missing data.
    for predictor in to_impute:
      print "Building model for imputing " + predictor
      print "Subsetting the data into missing values for predictor and no missing values for predictor"
      na_ind = d[predictor].isna()
      not_na_ind = na_ind != 1.0
      to_train = d[not_na_ind]
      to_predict = d[na_ind]
      these_var = [var for var in predictors if var != predictor]
      trained = h2o.gbm(x = to_train[these_var],
                        y = to_train[[predictor]],
                        ntrees=300,
                        max_depth=6,
                        learn_rate=0.2)
      print "Saving the imputation tree model for " + predictor
      h2o.save_model(trained, dir = saving_models_fp, name = "dl_imputation_model_" + predictor)
      print "Imputing the missing " +  predictor + " data by predicting with the model..."
      predicted = trained.predict(to_predict[these_var])
      tofillin = newdata[predictor]
      assert len(predicted) == len(tofillin[na_ind])
      tofillin[na_ind] = predicted # mutate the column in place
      newdata[predictor] = tofillin
    
    print "Saving the final model-imputed data to disk..."
    h2o.export_file(frame = d, path =saving_modelImputed_fp, force=True)
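The comments above promise that the saved means table is later used to fill in the holdout data by time period. A rough sketch of that holdout step, assuming a `holdout` frame with the same `time_period` column and a `mean_<predictor>` column layout in the saved CSV (both assumptions, not code from the original script):

import h2o

means = h2o.import_file(saving_means_fp)      # one row of means per time_period
holdout = h2o.import_file(holdout_fp)         # hypothetical holdout frame path
merged = holdout.merge(means, by_x=["time_period"], by_y=["time_period"])
for predictor in to_impute:
    # Replace only the missing cells with the per-time-period mean.
    merged[predictor] = merged[predictor].isna().ifelse(
        merged["mean_" + predictor], merged[predictor])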
Example #5
def test_hadoop():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")

    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=list(range(1, 10)), y=0, training_frame=h2o_data) # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    h2o.save_model(h2o_glm, "hdfs://" + hdfs_model_path)

    new_model = h2o.load_model("hdfs://" + hdfs_model_path)
Example #6
def test_hadoop():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")

    print("Spliting data")
    train,test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:,0:2], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert((test[:,1] - new_test[:,1]).sum() == 0)

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=list(range(1, 10)), y=0, training_frame=train) # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
    print("Running predictions")
    preds = new_model.predict(test)
Example #7
    def _save_internal(self, **kwargs):
        loc = kwargs.pop('location')
        model_loc = kwargs.pop('model_location')

        # first, save the estimator... if it's there
        ends_in_h2o = isinstance(self._final_estimator, H2OEstimator)
        if ends_in_h2o:
            force = kwargs.pop('force', False)
            self.model_loc_ = h2o.save_model(model=self._final_estimator, path=model_loc, force=force)

            # set the _final_estimator to None just for pickling
            self.est_name_ = self.steps[-1][0]

            # let's keep a pointer to the last step, so
            # after the pickling we can reassign it to retain state
            last_step_ = self.steps[-1]
            self.steps[-1] = None

        # now save the rest of things...
        with open(loc, 'wb') as output:
            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)

        # after pickle, we can add the last_step_ back in.
        # this allows re-use/re-predict after saving to disk
        if ends_in_h2o:
            self.steps[-1] = last_step_
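For context, a plausible load counterpart (a sketch, not the library's actual loader): unpickle the pipeline, then reattach the H2O step from `model_loc_` under the saved `est_name_`:

import pickle
import h2o

def load_pipeline(location):
    # Hypothetical inverse of _save_internal above.
    with open(location, 'rb') as f:
        pipe = pickle.load(f)
    # The H2O step was pickled as None; restore it from the saved binary.
    if getattr(pipe, 'model_loc_', None) is not None:
        pipe.steps[-1] = (pipe.est_name_, h2o.load_model(pipe.model_loc_))
    return pipe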
Example #8
def deepwater_checkpoint():
  if not H2ODeepWaterEstimator.available(): return

  ## build a model
  #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
  frame.drop(0)
  frame[1] = frame[1].asfactor()
  print(frame.head(5))
  model = H2ODeepWaterEstimator(epochs=50, learning_rate=1e-5, stopping_rounds=0, score_duty_cycle=1, train_samples_per_iteration=-1, score_interval=0)
  model.train(y=1, training_frame=frame)

  ## save the model
  model_path = h2o.save_model(model)

  ## delete everything - simulate cluster shutdown and restart
  h2o.remove_all()

  ## reimport the model and the frame
  model = h2o.load_model(model_path)
  #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
  frame.drop(0)
  frame[1] = frame[1].asfactor()
  
  ## delete the checkpoint file
  os.remove(model_path)

  ## continue training
  model2 = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-5, stopping_rounds=0,score_duty_cycle=1, train_samples_per_iteration=-1, score_interval=0, checkpoint=model.model_id)
  model2.train(y=1, training_frame=frame)
  model2.show()
Example #9
def milsong_checkpoint(ip,port):

    milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50,100),1)[0]
    max_depth1 = random.sample(range(2,6),1)[0]
    min_rows1 = random.sample(range(10,16),1)[0]
    print "ntrees model 1: {0}".format(ntrees1)
    print "max_depth model 1: {0}".format(max_depth1)
    print "min_rows model 1: {0}".format(min_rows1)
    model1 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees1,max_depth=max_depth1, min_rows=min_rows1,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0])

    # save the model, then load the model
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print "ntrees model 2: {0}".format(ntrees2)
    print "max_depth model 2: {0}".format(max_depth2)
    print "min_rows model 2: {0}".format(min_rows2)
    model2 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0],
                     checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    model3 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0])
def test_hdfs_io():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/airlines/airlines_all.05p.csv")

    print("Spliting data")
    for c in ["Month","DayofMonth","IsArrDelayed"]:
        h2o_data[c] = h2o_data[c].asfactor()
    myX = ["Month","DayofMonth","Distance"]
    train,test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:,["Year","DayOfWeek"]], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert((test[:,"DayOfWeek"] - new_test[:,"DayOfWeek"]).sum() == 0)

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=myX, y="IsArrDelayed", training_frame=train) # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
    print("Running predictions")
    preds = new_model.predict(test)
Example #11
def milsong_checkpoint():

  milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
  milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
  distribution = "gaussian"

  # build first model
  ntrees1 = random.sample(range(50,100),1)[0]
  max_depth1 = random.sample(range(2,6),1)[0]
  min_rows1 = random.sample(range(10,16),1)[0]
  print "ntrees model 1: {0}".format(ntrees1)
  print "max_depth model 1: {0}".format(max_depth1)
  print "min_rows model 1: {0}".format(min_rows1)


  model1 = H2OGradientBoostingEstimator(ntrees=ntrees1,
                                        max_depth=max_depth1,
                                        min_rows=min_rows1,
                                        distribution=distribution)
  model1.train(x=list(range(1,milsong_train.ncol)),
               y=0,
               training_frame=milsong_train,
               validation_frame=milsong_valid)

  # save the model, then load the model
  path = pyunit_utils.locate("results")

  assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
  model_path = h2o.save_model(model1, path=path, force=True)

  assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
  restored_model = h2o.load_model(model_path)

  # continue building the model
  ntrees2 = ntrees1 + 50
  max_depth2 = max_depth1
  min_rows2 = min_rows1
  print "ntrees model 2: {0}".format(ntrees2)
  print "max_depth model 2: {0}".format(max_depth2)
  print "min_rows model 2: {0}".format(min_rows2)
  model2 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                        max_depth=max_depth2,
                                        min_rows=min_rows2,
                                        distribution=distribution,
                                        checkpoint=restored_model.model_id)
  model2.train(x=list(range(1,milsong_train.ncol)),
               y=0,
               training_frame=milsong_train,
               validation_frame=milsong_valid)

  model3 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                        max_depth=max_depth2,
                                        min_rows=min_rows2,
                                        distribution=distribution)

  model3.train(x=list(range(1,milsong_train.ncol)),
               y=0,
               training_frame=milsong_train,
               validation_frame=milsong_valid)
Example #12
    def _save_internal(self, **kwargs):
        check_is_fitted(self, 'best_estimator_')
        best_estimator = self.best_estimator_
        estimator = self.estimator

        # where we'll save things
        loc = kwargs.pop('location')
        model_loc = kwargs.pop('model_location')

        # need to save the h2o est before anything else. Note that since
        # we verify pre-fit that the _final_estimator is of type H2OEstimator,
        # we can assume nothing has changed internally...
        is_pipe = False
        if isinstance(best_estimator, H2OPipeline):
            self.est_name_ = best_estimator.steps[-1][0]  # don't need to duplicate--can use for base

            the_h2o_est = best_estimator._final_estimator
            the_base_est = estimator._final_estimator

            is_pipe = True
        else:
            # otherwise it's the H2OEstimator
            the_h2o_est = best_estimator
            the_base_est = estimator

        # get the key that will map to the new H2OEstimator
        self.est_type_ = _get_estimator_string(the_base_est)

        # first, save the best estimator's H2O piece...
        force = kwargs.pop('force', False)
        self.model_loc_ = h2o.save_model(model=the_h2o_est, path=model_loc, force=force)

        # set to none for pickling, and then restore state for scoring
        if is_pipe:
            last_step_ = best_estimator.steps[-1]
            best_estimator.steps[-1] = None

            base_last_step_ = estimator.steps[-1]
            estimator.steps[-1] = None
            self.base_estimator_parms_ = base_last_step_[1]._parms  # it's a tuple...
        else:
            last_step_ = self.best_estimator_
            base_last_step_ = self.estimator
            self.best_estimator_ = None
            self.estimator = None
            self.base_estimator_parms_ = base_last_step_._parms

        # now save the rest of things...
        with open(loc, 'wb') as output:
            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)

        # restore state for re-use
        if is_pipe:
            best_estimator.steps[-1] = last_step_
            estimator.steps[-1] = base_last_step_
        else:
            self.best_estimator_ = last_step_
            self.estimator = base_last_step_
def train_grid_classifier():
  global train_dataset
  train_dataset = get_train_dataset_path()
  training_data = h2o.import_file(train_dataset)
  test_data = h2o.import_file(train_dataset.replace('train', 'test'))
  for mtries,sample_rate in shuffled(list(itertools.product([1, 2, 3, 5, 6, 7], [0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 1.0]))):
    features_string = '_'.join(map(str, ['mtries', mtries, 'sample_rate', sample_rate]))
    model_path = sdir_path(get_classifier_name() + '_v7_' + features_string + '.h2o')
    if path.exists(model_path):
      continue
    print(features_string)
    try:
      classifier = h2o.estimators.H2ORandomForestEstimator(build_tree_one_node=True, mtries=mtries, sample_rate=sample_rate)
      classifier.train(x=training_data.columns[1:], y=training_data.columns[0], training_frame=training_data, validation_frame=test_data)
      h2o.save_model(classifier, model_path)
      print(classifier)
    except Exception:
      print(traceback.format_exc())
      continue
Example #14
def save_model(model_id, dest_dir='.', mformat='json'):
    model = h2o.get_model(model_id)
    if mformat == 'mojo':
        return model.save_mojo(path=dest_dir)
        # model.download_mojo(path=dest_dir, get_genmodel_jar=True)
    elif mformat == 'binary':
        return h2o.save_model(model, path=dest_dir)
        # return h2o.download_model(model, path=dest_dir)
    else:
        return model.save_model_details(path=dest_dir)
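Assuming a running H2O cluster with a trained model registered under the given id, usage looks like this (the model id and paths are placeholders):

mojo_path = save_model('gbm_model_1', dest_dir='/tmp/models', mformat='mojo')
binary_path = save_model('gbm_model_1', dest_dir='/tmp/models', mformat='binary')
json_path = save_model('gbm_model_1', dest_dir='/tmp/models')   # defaults to model details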
Example #15
def trainmodel():
	h2o.init()
	from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glme
	trainingdf = h2o.import_file(path = abspath('./trainingset.csv'))
	trainingdf["city"] = trainingdf["city"].asfactor()
	trainingdf["country"] = trainingdf["city"].asfactor()
	glm_classifier = glme(family = "gaussian")
	glm_classifier.train(x = ['amount','cost','ratio','duration','city','country','ontime','notontime','history','posvote','negvote','fees','feeratio','pastscore'],y = 'score', training_frame = trainingdf)
	savedir = h2o.save_model(glm_classifier, path = curdir, force = True)
	rename(basename(savedir),"model")
Example #16
def h2osave_model():
    """
    Python API test: h2o.save_model(model, path=u'', force=False)
    """
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    Y = 3
    X = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10]

    model = H2OGeneralizedLinearEstimator(family="binomial", alpha=0, Lambda=1e-5)
    model.train(x=X, y=Y, training_frame=training_data)
    try:
        results_dir = pyunit_utils.locate("results")    # find directory path to results folder
        h2o.save_model(model, path=results_dir, force=True)       # save model
        assert os.path.isfile(os.path.join(results_dir, model._id)), "h2o.save_model() command is not working."
    except Exception as e:
        if 'File not found' in e.args[0]:
            print("Directory is not writable.  h2o.save_model() command is not tested.")
        else:
            assert False, "h2o.save_model() command is not working."
def milsong_checkpoint():

    milsong_train = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print "ntrees model 1: {0}".format(ntrees1)
    print "max_depth model 1: {0}".format(max_depth1)
    print "min_rows model 1: {0}".format(min_rows1)

    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    model1 = H2OGradientBoostingEstimator(ntrees=ntrees1,
                                          max_depth=max_depth1,
                                          min_rows=min_rows1,
                                          distribution=distribution)
    model1.train(x=list(range(1, milsong_train.ncol)),
                 y=0,
                 training_frame=milsong_train,
                 validation_frame=milsong_valid)

    # save the model, then load the model
    path = pyunit_utils.locate("results")

    assert os.path.isdir(
        path), "Expected save directory {0} to exist, but it does not.".format(
            path)
    model_path = h2o.save_model(model1, path=path, force=True)

    assert os.path.isdir(
        model_path
    ), "Expected load directory {0} to exist, but it does not.".format(
        model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print "ntrees model 2: {0}".format(ntrees2)
    print "max_depth model 2: {0}".format(max_depth2)
    print "min_rows model 2: {0}".format(min_rows2)
    model2 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                          max_depth=max_depth2,
                                          min_rows=min_rows2,
                                          distribution=distribution,
                                          checkpoint=restored_model.model_id)
    model2.train(x=list(range(1, milsong_train.ncol)),
                 y=0,
                 training_frame=milsong_train,
                 validation_frame=milsong_valid)
Example #18
def milsong_checkpoint(ip, port):

    milsong_train = h2o.upload_file(
        h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(
        h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print "ntrees model 1: {0}".format(ntrees1)
    print "max_depth model 1: {0}".format(max_depth1)
    print "min_rows model 1: {0}".format(min_rows1)
    model1 = h2o.gbm(x=milsong_train[1:],
                     y=milsong_train[0],
                     ntrees=ntrees1,
                     max_depth=max_depth1,
                     min_rows=min_rows1,
                     distribution=distribution,
                     validation_x=milsong_valid[1:],
                     validation_y=milsong_valid[0])

    # save the model, then load the model
    model_path = h2o.save_model(model1, force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print "ntrees model 2: {0}".format(ntrees2)
    print "max_depth model 2: {0}".format(max_depth2)
    print "min_rows model 2: {0}".format(min_rows2)
    model2 = h2o.gbm(x=milsong_train[1:],
                     y=milsong_train[0],
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     distribution=distribution,
                     validation_x=milsong_valid[1:],
                     validation_y=milsong_valid[0],
                     checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    model3 = h2o.gbm(x=milsong_train[1:],
                     y=milsong_train[0],
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     distribution=distribution,
                     validation_x=milsong_valid[1:],
                     validation_y=milsong_valid[0])
Example #19
def save_model(model, path, model_type=None):
    """
    save model
    :param model: a model to be saved
    :param path: dir to save model
    :return: saveed path ( dir + saved filename)
    """
    if model_type == 'mojo':
        return model.save_mojo(path=path, force=True)
    else:
        return h2o.save_model(model, path)
Example #20
 def save(self):
     print("Saving artifacts")
     if "metadata" in self.artifact_config:
         print("SAVING METADATA")
         save_artifact(self.metadata, self.artifact_config["metadata_path"])
     if "model_path" in self.artifact_config:
         model_artifact_name = h2o.save_model(self.automl_pipeline.leader,
                                              path=".")
         with open(model_artifact_name, "rb") as fname:
             serialized_model = fname.read()
             write(serialized_model, self.artifact_config["model_path"])
Example #21
    def save(self, path):
        """
        Save the rulefit model.
        :param path: The path to the directory where the models should be saved.
        :examples:
        >>> rulefit = H2ORuleFit()
        >>> training_data = h2o.import_file("smalldata/gbm_test/titanic.csv", 
        ...                                  col_types = {'pclass': "enum", 'survived': "enum"})
        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
        >>> rulefit.train(x=x,y="survived",training_frame=training_data)
        >>> rulefit.save(path="/home/user/my_rulefit/")
        """
        # save random forest models
        for rf_model in self.rf_models.values():
            h2o.save_model(rf_model, path=path)

        # save glm model
        h2o.save_model(self.glm, path=path)

        return path
Example #22
def RandomForest(estimators=10):
    model = H2ORandomForestEstimator(model_id="rf_steel_plates" +
                                     str(random.randint(1, 10000)),
                                     ntrees=200,
                                     stopping_rounds=2,
                                     score_each_iteration=True,
                                     seed=1000000)
    model.train(X_train, Y_train, training_frame=df)
    path = h2o.save_model(model, path=os.getcwd())
    result = model.predict(test[:-8])
    print(model.model_performance(test))
Example #23
    def save(self, dst):
        try:
            import h2o
        except ImportError:
            raise MissingDependencyException(
                "h2o package is required to use H2oModelArtifact"
            )

        h2o_saved_path = h2o.save_model(model=self._model, path=dst, force=True)
        shutil.move(h2o_saved_path, self._model_file_path(dst))
        return
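A matching `load` for this artifact would presumably reverse the move (a sketch; `_model_file_path` is the artifact's own helper, and `pack` is assumed from the surrounding artifact class):

    def load(self, path):
        try:
            import h2o
        except ImportError:
            raise MissingDependencyException(
                "h2o package is required to use H2oModelArtifact"
            )
        # Hypothetical counterpart: read the binary model back and wrap it.
        return self.pack(h2o.load_model(self._model_file_path(path)))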
Example #24
def save_load_model(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    prostate = h2o.import_frame(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE","RACE","PSA","DCAPS"]], family = "binomial",
                           alpha = [0.5])
    model_path = h2o.save_model(prostate_glm, name="delete_model", force=True)
    the_model = h2o.load_model(model_path)

    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
Example #25
def train_model(config, pt_bin, yaml_file, prefix):
    train_f = definitions.PROCESSING_FOLDER + config + '/ml-dataset/ml_sample_train_' + str(
        pt_bin) + '.parquet'
    train = h2o.import_file(train_f)

    d_cuts = configyaml.ConfigYaml(yaml_file)

    # Configuration of the GRID Search
    features = d_cuts.values['model_building']['features']
    target = d_cuts.values['model_building']['target']
    parameters = d_cuts.values['model_building']['model_parameters']
    train[target] = train[target] > -1

    train[target] = train[target].asfactor()

    model = H2OXGBoostEstimator(**parameters)

    model.train(features, target, training_frame=train)

    place_to_save = definitions.PROCESSING_FOLDER + config + '/ml-dataset/'
    file_list_saved = list()

    # Save Main model
    path_main = h2o.save_model(model, place_to_save, force=True)
    path_main_rename = ''.join([
        x + '/' for x in path_main.split('/')[:-1]
    ]) + prefix + 'model_pt' + str(pt_bin) + '_main'
    os.rename(path_main, path_main_rename)
    file_list_saved.append(path_main_rename)

    model_list = model.cross_validation_models()
    for model_cv, i in zip(model_list, range(len(model_list))):
        path = h2o.save_model(model_cv, place_to_save, force=True)
        path_new = ''.join([
            x + '/' for x in path.split('/')[:-1]
        ]) + prefix + 'model_pt' + str(pt_bin) + '_cv' + str(i)
        os.rename(path, path_new)
        file_list_saved.append(path_new)

    return model, model_list, file_list_saved
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200    # number of test dataset rows
    df = pyunit_utils.random_dataset("regression", seed=1234)       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types)-1)]

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows==train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel) # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True) # save mojo predict

    h2o.save_model(glrmModel, TMPDIR)   # save GLRM model
    glrmModel2 = h2o.load_model(os.path.join(TMPDIR,MOJONAME))
    predict_model = glrmModel2.predict(test)
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
            predict_model[col] = predict_model[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    print("Comparing mojo predict and h2o predict from saved model...")
    pyunit_utils.compare_frames_local(pred_mojo, predict_model, 1, tol=1e-10)
    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False) # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_"+frameID)   # store the x Factor for new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
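`save_GLRM_mojo` is another helper defined outside this snippet; it presumably downloads the mojo into the same results layout the test reads back (a sketch under that assumption):

def save_GLRM_mojo(model):
    # Hypothetical helper: put the mojo where pyunit_utils.mojo_predict()
    # will look for it, keyed by the mojo name derived from the model id.
    mojo_name = pyunit_utils.getMojoName(model._id)
    tmpdir = os.path.normpath(os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "..", "results", mojo_name))
    if not os.path.isdir(tmpdir):
        os.makedirs(tmpdir)
    model.download_mojo(path=tmpdir)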
Example #27
def create_model(params):
    """Creates model based on parameters.

    Args:
        params (dict): Model parameters.

    Returns:
        None

    """
    # Create and train model
    model = H2ODeepLearningEstimator(**params)
    model.train(x='x', y='y', training_frame=train, validation_frame=val)

    # Run model prediction
    pred_train_val = model.predict(train_val).as_data_frame()
    pred_test = model.predict(test).as_data_frame()

    # Plot real data
    plt.plot(df_train_val['x'], df_train_val['y'], color='orange')
    plt.plot(df_test['x'][:len(df_test)//2], df_test['y'][:len(df_test)//2], color='orange')
    plt.plot(df_test['x'][len(df_test)//2:], df_test['y'][len(df_test)//2:], color='orange')

    # Plot model predictions
    plt.plot(df_train_val['x'], pred_train_val, color='blue')
    plt.plot(df_test['x'][:len(df_test)//2], pred_test[:len(pred_test)//2], color='blue')
    plt.plot(df_test['x'][len(df_test)//2:], pred_test[len(pred_test)//2:], color='blue')

    # Get model metrics
    test_rmse = '{:.1f}'.format(model.model_performance(test).rmse())
    train_val_rmse = '{:.1f}'.format(model.model_performance(train_val).rmse())
    samples_per_iteration = int(round(model.train_samples_per_iteration*1.258/len(df_train_val)))

    # Save model and plot
    name = '{0}_{1}_{2}_{3}'.format(test_rmse, train_val_rmse, model.activation, samples_per_iteration)
    h2o.save_model(model, os.path.join(save_dir, name))
    plt.savefig('{}.svg'.format(os.path.join(save_dir, name)))

    # Close plot
    plt.close()
Example #28
def test_load_glrm():
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(
        pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()

    g_model = H2OGeneralizedLowRankEstimator(k=3)
    g_model.train(x=irisH2O.names, training_frame=irisH2O)
    yarch_old = g_model.archetypes()
    x_old = h2o.get_frame(g_model._model_json["output"]["representation_name"])
    predOld = g_model.predict(irisH2O)
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "../..",
                     "results"))

    try:
        TMPDIR = pyunit_utils.locate(
            "results")  # find directory path to results folder
    except Exception:
        os.makedirs(TMPDIR)
    h2o.save_model(g_model, path=TMPDIR, force=True)  # save model
    full_path_filename = os.path.join(TMPDIR, g_model._id)

    h2o.remove(g_model)
    model_reloaded = h2o.load_model(full_path_filename)
    pred = model_reloaded.predict(irisH2O)
    yarch = model_reloaded.archetypes()
    x = h2o.get_frame(
        model_reloaded._model_json["output"]["representation_name"])

    # assert differences between old and new are small; archetypes should be the same
    pyunit_utils.compare_frames_local(x, x_old, tol=1e-6)
    pyunit_utils.compare_frames_local(pred[0], predOld[0], tol=1)
    for k in range(3):
        pyunit_utils.equal_two_arrays(yarch_old[k],
                                      yarch[k],
                                      eps=1e-4,
                                      tolerance=1e-10)

    print("glrm model successfully loaded...")
Example #29
def save_load_model():
    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE","RACE","PSA","DCAPS"]], family = "binomial",
                           alpha = [0.5])
    model_path = h2o.save_model(prostate_glm,force=True)
    the_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)

    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
Example #30
def save_load_model(ip,port):
    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE","RACE","PSA","DCAPS"]], family = "binomial",
                           alpha = [0.5])
    model_path = h2o.save_model(prostate_glm,force=True)
    the_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)

    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
Example #31
def save_load_model():
    prostate = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    prostate_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0.5])
    prostate_glm.train(x=["AGE","RACE","PSA","DCAPS"], y="CAPSULE", training_frame=prostate)
    path = pyunit_utils.locate("results")

    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(prostate_glm, path=path, force=True)

    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    the_model = h2o.load_model(model_path)

    assert isinstance(the_model, H2OEstimator), "Expected an H2OEstimator, but got {0}".format(the_model)
def explore_classification(label_var='bogus_online',create_plots=False):
    """
    see also file shekhar_bogus_ml.py for the source of some of this
    """
    feature_sets = FEATURE_SETS

    rf_models = [H2ORandomForestEstimator(
            model_id="rf_v{}".format(i),
            ntrees=200,
            stopping_rounds=2,
            score_each_iteration=True,
            seed=1000000) \
        for i in range(1,8)]

    input('train all models - takes an hour! Are you sure? [Ctrl-C to abort]')
    for i in range(len(feature_sets)):
        print("Building Model {}".format(i+1))
        rf_models[i].train(feature_sets[i], label_var, training_frame=TrainData, validation_frame=ValidData)

    if create_plots:
        legends=["Return features","Profile features","Network features","1 + 2","1 + 3","2 + 3","1 + 2 + 3"]
        #legends=["return_features","ds_features","return_features+ds_features"]

        plot=compare_models(rf_models,legends, of=r'D:\shekhar_code_github\BogusFirmCatching\Graphs\{label}_comparison_plot_AllCombinations_minusq12_numericmerge_withds.html'.format(label=label_var),\
                            title='Comparing All Models, {label}'.format(label=label_var))
        show(plot)

        for i in range(len(rf_models)):
            h2o.save_model(rf_models[i],path=r'D:\shekhar_code_github\BogusFirmCatching\Models\diff_feature_sets\20170523')

        for i in range(7):
            show(analyze_model(rf_models[i],of=r"D:\shekhar_code_github\BogusFirmCatching\Graphs\{}_model{}_v2_numericmerge_withds.html".format(label_var,i+1),n_rows=30))

    file_name = r'Z:\Predictions_{label}_v2_numericmerge_withDS.csv'.format(label=label_var)
    generate_predictions(rf_models,ValidData,file_name,'{label}_Model'.format(label=label_var))
    predictions = pd.read_csv(file_name)
def runComparisonTests(autoEncoder, problemType):
    params = set_params(autoEncoder)  # set deeplearning model parameters
    df = random_dataset(problemType)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    if autoEncoder:
        try:
            deeplearningModel = build_save_model(
                params, x, train)  # build and save mojo model
        except Exception as err:
            if not ("Trying to predict with an unstable model" in err.args[0]):
                raise Exception(
                    'Deeplearning autoencoder model failed to build.  Fix it.')
            return
    else:
        deeplearningModel = build_save_model(
            params, x, train)  # build and save mojo model
    h2o.download_csv(test[x], os.path.join(
        TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        deeplearningModel, TMPDIR, MOJONAME)  # load model and perform predict
    pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
    h2o.save_model(deeplearningModel, path=TMPDIR,
                   force=True)  # save model for debugging
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o,
                                                   pred_mojo,
                                                   prob=1,
                                                   tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo,
                                                   pred_pojo,
                                                   prob=1,
                                                   tol=1e-10)
 def best_case_model(self, path: str = None) -> str:
     """ The best model that can be produced for future predictions
     Args:
         path (str): Optional location of saved model
     Return:
         str: path to the saved best-case model
     """
     if self._best_case_model is None:
         self.model_shell.calibrate_thresholds(self.data)
         if path is None:
             path = os.path.join(self.dir_path, 'models/')
         self._best_case_model = h2o.save_model(self.model_shell.model,
                                                path=path,
                                                force=True)
         print(type(self._best_case_model))
     return self._best_case_model
Example #35
def milsong_checkpoint():

    milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print "ntrees model 1: {0}".format(ntrees1)
    print "max_depth model 1: {0}".format(max_depth1)
    print "min_rows model 1: {0}".format(min_rows1)
    model1 = H2ORandomForestEstimator(ntrees=ntrees1, max_depth=max_depth1, min_rows=min_rows1, seed=1234)

    model1.train(x=list(range(1, milsong_train.ncol)), y=0, training_frame=milsong_train, validation_frame=milsong_valid)

    # save the model, then load the model
    path = pyunit_utils.locate("results")

    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)

    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print "ntrees model 2: {0}".format(ntrees2)
    print "max_depth model 2: {0}".format(max_depth2)
    print "min_rows model 2: {0}".format(min_rows2)

    model2 = H2ORandomForestEstimator(
        ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, checkpoint=restored_model._id, seed=1234
    )
    model2.train(x=list(range(1, milsong_train.ncol)), y=0, training_frame=milsong_train, validation_frame=milsong_valid)

    # build the equivalent of model 2 in one shot
    model3 = H2ORandomForestEstimator(ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, seed=1234)
    model3.train(x=list(range(1, milsong_train.ncol)), y=0, training_frame=milsong_train, validation_frame=milsong_valid)

    assert isinstance(model2, type(model3))
    assert model2.mse(valid=True) == model3.mse(
        valid=True
    ), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(
        model2.mse(valid=True), model3.mse(valid=True)
    )
Example #36
def trainmodel():
    h2o.init()
    from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glme
    trainingdf = h2o.import_file(path=abspath('./trainingset.csv'))
    trainingdf["city"] = trainingdf["city"].asfactor()
    trainingdf["country"] = trainingdf["city"].asfactor()
    glm_classifier = glme(family="gaussian")
    glm_classifier.train(x=[
        'amount', 'cost', 'ratio', 'duration', 'city', 'country', 'ontime',
        'notontime', 'history', 'posvote', 'negvote', 'fees', 'feeratio',
        'pastscore'
    ],
                         y='score',
                         training_frame=trainingdf)
    savedir = h2o.save_model(glm_classifier, path=curdir, force=True)
    rename(basename(savedir), "model")
Example #37
def save_load_model():

    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE","RACE","PSA","DCAPS"]], family = "binomial",
                           alpha = [0.5])

    path = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","results"))

    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(prostate_glm, path=path, force=True)

    assert os.path.isdir(model_path), "Expected load directory {0} to exist, but it does not.".format(model_path)
    the_model = h2o.load_model(model_path)

    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
Example #38
def save_model_filename():
    fr = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))

    model = H2OGradientBoostingEstimator(ntrees=10, seed=1234)
    model.train(x=list(range(2, fr.ncol)), y=1, training_frame=fr)

    # Default filename is model_id
    model_path = h2o.save_model(model)
    # It should be saved in server working directory
    assert model_path.endswith(
        model.model_id), "Not expected path: {0}".format(model_path)
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Default filename is model_id
    tmpdir = tempfile.mkdtemp()
    model_path = h2o.save_model(model, tmpdir)
    assert_equals(os.path.join(tmpdir, model.model_id), model_path,
                  "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with custom path
    model_path = h2o.save_model(model, tmpdir, filename="gbm_prostate")
    assert_equals(os.path.join(tmpdir, "gbm_prostate"), model_path,
                  "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with custom path
    model_path = h2o.save_model(model, tmpdir, filename="gbm_prostate.model")
    assert_equals(os.path.join(tmpdir, "gbm_prostate.model"), model_path,
                  "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with custom path
    model_path = h2o.save_model(model,
                                tmpdir,
                                filename=os.path.join("not-existing-folder",
                                                      "gbm_prostate.model"))
    assert_equals(
        os.path.join(tmpdir, "not-existing-folder", "gbm_prostate.model"),
        model_path, "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with default path
    model_path = h2o.save_model(model, filename="gbm_prostate_saved.model")
    assert model_path.endswith("gbm_prostate_saved.model"
                               ), "Not expected path: {0}".format(model_path)
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)
Example #39
def deepwater_checkpoint():
    if not H2ODeepWaterEstimator.available(): return

    ## build a model
    #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
    frame = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    frame.drop(0)
    frame[1] = frame[1].asfactor()
    print(frame.head(5))
    model = H2ODeepWaterEstimator(epochs=50,
                                  learning_rate=1e-5,
                                  stopping_rounds=0,
                                  score_duty_cycle=1,
                                  train_samples_per_iteration=-1,
                                  score_interval=0)
    model.train(y=1, training_frame=frame)

    ## save the model
    model_path = h2o.save_model(model)

    ## delete everything - simulate cluster shutdown and restart
    h2o.remove_all()

    ## reimport the model and the frame
    model = h2o.load_model(model_path)
    #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
    frame = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    frame.drop(0)
    frame[1] = frame[1].asfactor()

    ## delete the checkpoint file
    os.remove(model_path)

    ## continue training
    model2 = H2ODeepWaterEstimator(epochs=100,
                                   learning_rate=1e-5,
                                   stopping_rounds=0,
                                   score_duty_cycle=1,
                                   train_samples_per_iteration=-1,
                                   score_interval=0,
                                   checkpoint=model.model_id)
    model2.train(y=1, training_frame=frame)
    model2.show()
Example #40
def isolation_forest_save_and_load():
    print("Isolation Forest Smoke Test")

    train = h2o.import_file(pyunit_utils.locate("smalldata/anomaly/ecg_discord_train.csv"))

    if_model = H2OIsolationForestEstimator(ntrees=7, seed=12, sample_size=5)
    if_model.train(training_frame=train)

    path = pyunit_utils.locate("results")

    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(if_model, path=path, force=True)

    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    reloaded = h2o.load_model(model_path)

    assert isinstance(reloaded, H2OIsolationForestEstimator), "Expected an H2OIsolationForestEstimator, but got {0}"\
        .format(reloaded)
Example #41
def test_hadoop():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")
    print("Spliting data")
    train,test = h2o_data.split_frame(ratios=[0.1])
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    print("Training")
    h2o_glm.train(x=list(range(1, 10)), y=0, training_frame=train) # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
def extended_isolation_forest_save_and_load():
    print("Extended Isolation Forest Save Load Test")

    train = h2o.import_file(
        pyunit_utils.locate("smalldata/anomaly/single_blob.csv"))

    eif_model = H2OExtendedIsolationForestEstimator(ntrees=7,
                                                    seed=12,
                                                    sample_size=5)
    eif_model.train(training_frame=train)
    anomaly_score = eif_model.predict(train)
    anomaly = anomaly_score['anomaly_score'].as_data_frame(
        use_pandas=True)["anomaly_score"]

    path = pyunit_utils.locate("results")

    assert os.path.isdir(
        path), "Expected save directory {0} to exist, but it does not.".format(
            path)
    model_path = h2o.save_model(eif_model, path=path, force=True)

    assert os.path.isfile(
        model_path
    ), "Expected load file {0} to exist, but it does not.".format(model_path)
    reloaded = h2o.load_model(model_path)
    anomaly_score_reloaded = reloaded.predict(train)
    anomaly_reloaded = anomaly_score_reloaded['anomaly_score'].as_data_frame(
        use_pandas=True)["anomaly_score"]

    assert isinstance(reloaded,
                      H2OExtendedIsolationForestEstimator), \
        "Expected and H2OExtendedIsolationForestEstimator, but got {0}"\
        .format(reloaded)

    assert (anomaly[0] == anomaly_reloaded[0]
            ), "Output is not the same after reload"
    assert anomaly[5] == anomaly_reloaded[
        5], "Output is not the same after reload"
    assert anomaly[33] == anomaly_reloaded[
        33], "Output is not the same after reload"
    assert anomaly[256] == anomaly_reloaded[
        256], "Output is not the same after reload"
    assert anomaly[499] == anomaly_reloaded[
        499], "Output is not the same after reload"
Example #43
def main():
    h2o.init()

    #df = h2o.import_file(path="smalldata/logreg/prostate.csv")
    prostate = h2o.load_dataset("prostate")
    prostate.describe()

    train, test = prostate.split_frame(ratios=[0.70])
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    # Train model
    from h2o.estimators import H2OGeneralizedLinearEstimator
    prostate_glm = H2OGeneralizedLinearEstimator(family="binomial",
                                                 alpha=[0.5])
    prostate_glm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                       y="CAPSULE",
                       training_frame=train)
    prostate_glm.show()

    predictions = prostate_glm.predict(test)
    predictions.show()

    performance = prostate_glm.model_performance(test)
    performance.show()

    # Export model
    model_path = h2o.save_model(prostate_glm, path="./h2o_model", force=True)
    print(model_path)

    model = prostate_glm
    predictions = model.predict(test)
    predictions.show()

    performance = model.model_performance(test)
    performance.show()

    # Export test data
    df = test.as_data_frame()
    with open("data.json", "w") as f:
        #json.dump(df.to_json(orient='records'), f)
        #json.dump(df.to_json(orient='columns'), f)
        json.dump(df.to_json(orient='index'), f)
Example #44
def train_h2o(X: pd.DataFrame, y: pd.Series, config: Config):
    h2o.init()

    X["target"] = y
    train = h2o.H2OFrame(X)
    train_x = train.columns
    train_y = "target"
    train_x.remove(train_y)

    if config["mode"] == "classification":
        train[train_y] = train[train_y].asfactor()

    aml = H2OAutoML(max_runtime_secs=60)
    aml.train(x=train_x, y=train_y, training_frame=train)

    config["model_h2o"] = h2o.save_model(model=aml.leader, path=config.model_dir + "/h2o.model", force=True)
    print(aml.leaderboard)

    X.drop("target", axis=1, inplace=True)
def save_model(h2o_model,
               path,
               conda_env=None,
               mlflow_model=Model(),
               settings=None):
    """
    Save a H2O model to a path on the local file system.

    :param h2o_model: H2O model to be saved.
    :param path: Local path where the model is to be saved.
    :param mlflow_model: MLflow model config this flavor is being added to.
    """
    import h2o

    path = os.path.abspath(path)
    if os.path.exists(path):
        raise Exception("Path '{}' already exists".format(path))
    model_dir = os.path.join(path, "model.h2o")
    os.makedirs(model_dir)

    # Save h2o-model
    h2o_save_location = h2o.save_model(model=h2o_model,
                                       path=model_dir,
                                       force=True)
    model_file = os.path.basename(h2o_save_location)

    # Save h2o-settings
    if settings is None:
        settings = {}
    settings['full_file'] = h2o_save_location
    settings['model_file'] = model_file
    settings['model_dir'] = model_dir
    with open(os.path.join(model_dir, "h2o.yaml"), 'w') as settings_file:
        yaml.safe_dump(settings, stream=settings_file)

    pyfunc.add_to_model(mlflow_model,
                        loader_module="mlflow.h2o",
                        data="model.h2o",
                        env=conda_env)
    mlflow_model.add_flavor("h2o",
                            saved_model=model_file,
                            h2o_version=h2o.__version__)
    mlflow_model.save(os.path.join(path, "MLmodel"))
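# Hedged counterpart sketch (not the verbatim mlflow loader): restoring the
# model from the h2o.yaml settings written above. The function name is an
# assumption.
def _load_model_sketch(path):
    import h2o
    model_dir = os.path.join(os.path.abspath(path), "model.h2o")
    with open(os.path.join(model_dir, "h2o.yaml")) as settings_file:
        settings = yaml.safe_load(settings_file)
    return h2o.load_model(os.path.join(model_dir, settings["model_file"]))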
def save_model(h2o_model, path, conda_env=None, mlflow_model=Model(), settings=None):
    """
    Save an H2O model to a path on the local file system.

    :param h2o_model: H2O model to be saved.
    :param path: Local path where the model is to be saved.
    :param conda_env: Path to a Conda environment file. If provided, this describes the environment
                      this model should be run in. At minimum, it should specify the dependencies
                      contained in ``mlflow.h2o.DEFAULT_CONDA_ENV``. If `None`, the default
                      ``mlflow.h2o.DEFAULT_CONDA_ENV`` environment will be added to the model.
    :param mlflow_model: :py:mod:`mlflow.models.Model` this flavor is being added to.
    """
    path = os.path.abspath(path)
    if os.path.exists(path):
        raise Exception("Path '{}' already exists".format(path))
    model_data_subpath = "model.h2o"
    model_data_path = os.path.join(path, model_data_subpath)
    os.makedirs(model_data_path)

    # Save h2o-model
    h2o_save_location = h2o.save_model(model=h2o_model, path=model_data_path, force=True)
    model_file = os.path.basename(h2o_save_location)

    # Save h2o-settings
    if settings is None:
        settings = {}
    settings['full_file'] = h2o_save_location
    settings['model_file'] = model_file
    settings['model_dir'] = model_data_path
    with open(os.path.join(model_data_path, "h2o.yaml"), 'w') as settings_file:
        yaml.safe_dump(settings, stream=settings_file)

    conda_env_subpath = "conda.yaml"
    if conda_env:
        shutil.copyfile(conda_env, os.path.join(path, conda_env_subpath))
    else:
        with open(os.path.join(path, conda_env_subpath), "w") as f:
            yaml.safe_dump(DEFAULT_CONDA_ENV, stream=f, default_flow_style=False)

    pyfunc.add_to_model(mlflow_model, loader_module="mlflow.h2o",
                        data=model_data_subpath, env=conda_env_subpath)
    mlflow_model.add_flavor(FLAVOR_NAME, h2o_version=h2o.__version__, data=model_data_subpath)
    mlflow_model.save(os.path.join(path, "MLmodel"))
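# Hedged usage sketch: writing the flavor to a fresh directory; the model
# variable and the output path are placeholders.
# save_model(prostate_glm, path="./mlflow_h2o_model")
# assert os.path.isfile("./mlflow_h2o_model/MLmodel")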
def log_artifacts(grid_model, script, coeff=False):
    # If the genmodel jar already exists, delete it; it is already backed up in the mlruns folder.
    if os.path.isfile(os.getcwd() + "/h2o-genmodel.jar"):
        os.remove(os.getcwd() + "/h2o-genmodel.jar")

    if os.path.isfile(os.getcwd() + "/var_imp.png"):
        os.remove(os.getcwd() + "/var_imp.png")

    if os.path.isfile(os.getcwd() + "/var_imp.csv"):
        os.remove(os.getcwd() + "/var_imp.csv")

    model_python = h2o.save_model(model=grid_model, path=os.getcwd(), force=True)
    mlflow.log_artifact(model_python)

    mojo_path = grid_model.download_mojo(path=os.getcwd(),
                                         get_genmodel_jar=True)

    grid_model.varimp_plot(server=True)
    plt.savefig('var_imp.png', bbox_inches='tight')

    var_imps = pd.DataFrame(grid_model.varimp(),
                            columns=['name', 'relative_importance', 'scaled_importance', 'percentage'])

    var_imps.to_csv("var_imp.csv", index=False)

    # Log mojo file and jar file
    mlflow.log_artifact(mojo_path)

    mlflow.log_artifact(os.getcwd() + "/h2o-genmodel.jar")
    mlflow.log_artifact(os.getcwd() + "/var_imp.png")
    mlflow.log_artifact(os.getcwd() + "/var_imp.csv")
    mlflow.log_artifact(os.getcwd() + "/" + script)

    if coeff:
        if os.path.isfile(os.getcwd() + "/coefficients-{}.csv".format(datetime.utcnow().date())):
            os.remove(os.getcwd() + "/coefficients-{}.csv".format(datetime.utcnow().date()))
        grid_model._model_json['output']['coefficients_table'].as_data_frame().to_csv("coefficients-{}.csv"
                                                                                      .format(datetime.utcnow().date()))
        mlflow.log_artifact(os.getcwd() + "/coefficients-{}.csv".format(datetime.utcnow().date()))

    return "Finished logging artifacts"
def run_auto_ml(_df_train,
                _max_runtime_secs=3600,
                _nfolds=10,
                _stopping_metric='mse',
                _sort_metric='mae',
                _exclude_algos=['DeepLearning']):
    """

    :param _df_train: train DataFrames with all features
    :param _max_runtime_secs: Int, number of seconds that our auto ml will be learn
    :param _nfolds: Int, number of fold
    :param _stopping_metric: Stop metrics for algo
    :param _sort_metric: Sort metrics for algo
    :param _exclude_algos: Excluded algo, in default DeepLearning
    """

    print(
        '>>>>>>>>>>>>>> Preparing data and model, runtime budget: --{0}min--'.format(
            _max_runtime_secs / 60))
    hf_train = h2o.H2OFrame(_df_train)

    aml = H2OAutoML(max_runtime_secs=_max_runtime_secs,
                    nfolds=_nfolds,
                    exclude_algos=_exclude_algos,
                    stopping_metric=_stopping_metric,
                    sort_metric=_sort_metric)

    list_train_columns = list(_df_train.columns)
    list_train_columns.remove(features.PRICE)

    print('>>>>>>>>>>>>>> All set, starting training')
    aml.train(x=list_train_columns, y=features.PRICE, training_frame=hf_train)

    print('>>>>>>>>>>>>>> Finished training')
    saved_path = h2o.save_model(model=aml.leader,
                                path=paths.DIR_MODELS,
                                force=True)

    print('>>>>>>>>>>>>>> Model saved to path: {}'.format(saved_path))
    print('\n\n')
    print(aml.leaderboard.head())
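# Hedged usage sketch (assumed CSV path): the training frame needs the feature
# columns plus the features.PRICE target used above.
# df_train = pd.read_csv('data/train.csv')
# run_auto_ml(df_train, _max_runtime_secs=1800, _nfolds=5)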
    def save(self):
        import h2o
        model_folder_path = os.path.join(SAVED_MODELS_PATH, self.model_id)
        metadata_path = os.path.join(model_folder_path, 'metadata.json')

        # create the model folder if it does not already exist
        os.makedirs(model_folder_path, exist_ok=True)

        model_path = h2o.save_model(self.model, path=model_folder_path, force=True)
        with open(metadata_path, 'w') as metadata_file:
            json.dump({
                'system': self.system,
                'model_id': self.model_id,
                'search_id': self.search_id,
                'model_filename': os.path.basename(model_path),
                'predictors': self.predictors,
                'targets': self.targets,
                'train_specification': self.train_specification,
                'task': self.task
            }, metadata_file)
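    # Hedged counterpart sketch: restoring a model from the metadata written by
    # save(); SAVED_MODELS_PATH mirrors the constant used above, and the method
    # name is an assumption.
    @staticmethod
    def load(model_id):
        import h2o
        model_folder_path = os.path.join(SAVED_MODELS_PATH, model_id)
        with open(os.path.join(model_folder_path, 'metadata.json')) as metadata_file:
            metadata = json.load(metadata_file)
        return h2o.load_model(os.path.join(model_folder_path, metadata['model_filename']))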
def milsong_checkpoint():

    milsong_train = h2o.upload_file(tests.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(tests.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(range(50,100),1)[0]
    max_depth1 = random.sample(range(2,6),1)[0]
    min_rows1 = random.sample(range(10,16),1)[0]
    print "ntrees model 1: {0}".format(ntrees1)
    print "max_depth model 1: {0}".format(max_depth1)
    print "min_rows model 1: {0}".format(min_rows1)
    model1 = h2o.random_forest(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees1,max_depth=max_depth1, min_rows=min_rows1,
                               validation_x=milsong_valid[1:],validation_y=milsong_valid[0],seed=1234)

    # save the model, then load the model
    path = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..","results"))

    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)

    assert os.path.isdir(model_path), "Expected load directory {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print "ntrees model 2: {0}".format(ntrees2)
    print "max_depth model 2: {0}".format(max_depth2)
    print "min_rows model 2: {0}".format(min_rows2)
    model2 = h2o.random_forest(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:],validation_y=milsong_valid[0],
                               checkpoint=restored_model._id,seed=1234)

    # build the equivalent of model 2 in one shot
    model3 = h2o.random_forest(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:],validation_y=milsong_valid[0],seed=1234)

    assert isinstance(model2,type(model3))
    assert model2.mse(valid=True)==model3.mse(valid=True), "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(model2.mse(valid=True), model3.mse(valid=True))
 def fit_transform(self, X: dt.Frame, y: np.array = None):
     h2o.init()
     model = H2OAutoEncoderEstimator(activation='tanh',
                                     epochs=1,
                                     hidden=[50, 50],
                                     reproducible=True,
                                     seed=1234)
     frame = h2o.H2OFrame(X.to_pandas())
     model_path = None
     try:
         model.train(x=list(range(X.ncols)),
                     training_frame=frame,
                     model_id=self.id)
         model_path = h2o.save_model(model=model)
         with open(model_path, "rb") as f:
             self.raw_model_bytes = f.read()
         return model.anomaly(frame).as_data_frame(header=False)
     finally:
         if model_path is not None:
             os.remove(model_path)
         h2o.remove(self.id)
def test_hdfs_io():
    '''
    Test H2O read and write to HDFS.
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("maprfs://" + hdfs_name_node +
                               "/datasets/airlines/airlines_all.05p.csv")

    print("Spliting data")
    for c in ["Month", "DayofMonth", "IsArrDelayed"]:
        h2o_data[c] = h2o_data[c].asfactor()
    myX = ["Month", "DayofMonth", "Distance"]
    train, test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:, ["Year", "DayOfWeek"]],
                    "maprfs://" + hdfs_name_node + "/datasets/exported.csv")

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("maprfs://" + hdfs_name_node +
                               "/datasets/exported.csv")
    assert ((test[:, "DayOfWeek"] - new_test[:, "DayOfWeek"]).sum() == 0)

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial",
                                            alpha=0.5,
                                            Lambda=0.01)
    h2o_glm.train(x=myX, y="IsArrDelayed",
                  training_frame=train)  # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    # save_model does not appear to understand maprfs://, so use hdfs:// instead
    new_model_path = h2o.save_model(
        h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
    print("Running predictions")
    preds = new_model.predict(test)
 def fit_transform(self, X: dt.Frame, y: np.array = None):
     h2o.init(port=config.h2o_recipes_port)
     model = H2OAutoEncoderEstimator(activation='tanh',
                                     epochs=1,
                                     hidden=[50, 50],
                                     reproducible=True,
                                     seed=1234)
     frame = h2o.H2OFrame(X.to_pandas())
     model_path = None
     try:
         model.train(x=list(range(X.ncols)), training_frame=frame)
         self.id = model.model_id
         model_path = os.path.join(temporary_files_path,
                                   "h2o_model." + str(uuid.uuid4()))
         model_path = h2o.save_model(model=model, path=model_path)
         with open(model_path, "rb") as f:
             self.raw_model_bytes = f.read()
         return model.anomaly(frame).as_data_frame(header=False)
     finally:
         if model_path is not None:
             remove(model_path)
         h2o.remove(model)
def milsong_checkpoint():

    milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(list(range(50,100)),1)[0]
    max_depth1 = random.sample(list(range(2,6)),1)[0]
    min_rows1 = random.sample(list(range(10,16)),1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees1,max_depth=max_depth1, min_rows=min_rows1,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0])

    # save the model, then load the model
    path = pyunit_utils.locate("results")

    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)

    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0],
                     checkpoint=restored_model.model_id)

    # build the equivalent of model 2 in one shot
    model3 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0])
    def test_saved_binary_model_produces_same_predictions_as_original():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)
        se_model = train_stacked_ensemble(ds, base_models)
        
        # Predict with the ensemble in the Py client
        preds_py = se_model.predict(ds.test)
        
        tmp_dir = tempfile.mkdtemp()
        try:
            bin_file = h2o.save_model(se_model, tmp_dir)
            # Load the binary model and predict
            bin_model = h2o.load_model(pu.locate(bin_file))
            preds_bin = bin_model.predict(ds.test)
        finally:
            shutil.rmtree(tmp_dir)

        # Predictions from the in-memory model and the reloaded binary model should match
        pred_diff = preds_bin - preds_py
        assert pred_diff["p0"].max() < 1e-11
        assert pred_diff["p1"].max() < 1e-11
        assert pred_diff["p0"].min() > -1e-11
        assert pred_diff["p1"].min() > -1e-11
def milsong_checkpoint(ip,port):

    milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(range(50,100),1)[0]
    max_depth1 = random.sample(range(2,6),1)[0]
    min_rows1 = random.sample(range(10,16),1)[0]
    print "ntrees model 1: {0}".format(ntrees1)
    print "max_depth model 1: {0}".format(max_depth1)
    print "min_rows model 1: {0}".format(min_rows1)
    model1 = h2o.random_forest(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees1,max_depth=max_depth1, min_rows=min_rows1,
                               validation_x=milsong_valid[1:],validation_y=milsong_valid[0],seed=1234)

    # save the model, then load the model
    model_path = h2o.save_model(model1,force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print "ntrees model 2: {0}".format(ntrees2)
    print "max_depth model 2: {0}".format(max_depth2)
    print "min_rows model 2: {0}".format(min_rows2)
    model2 = h2o.random_forest(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:],validation_y=milsong_valid[0],
                               checkpoint=restored_model._id,seed=1234)

    # build the equivalent of model 2 in one shot
    model3 = h2o.random_forest(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:],validation_y=milsong_valid[0],seed=1234)

    assert isinstance(model2,type(model3))
    assert model2.mse(valid=True)==model3.mse(valid=True), "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(model2.mse(valid=True), model3.mse(valid=True))
def cars_checkpoint():

    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars.runif()
    train = cars[s > .2]
    valid = cars[s <= .2]

    print("\n*** Description (chunk distribution, etc) of training frame:")
    train.describe()
    print("\n*** Description (chunk distribution, etc) of validation frame:")
    valid.describe()

    # choose the type of model-building exercise. 0: regression, 1: binomial
    # classification, 2: multinomial classification
    problem = random.sample(list(range(3)),1)[0]

    # pick the predictors and response column, along with the correct distribution
    predictors = ["displacement","power","weight","acceleration","year"]
    if problem == 1:
        response_col = "economy_20mpg"
        distribution = "bernoulli"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        distribution = "multinomial"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    else:
        response_col = "economy"
        distribution = "gaussian"

    print("\n*** Distribution: {0}".format(distribution))
    print("\n*** Response column: {0}".format(response_col))

    # build first model
    ntrees1 = 5
    max_depth1 = random.sample(list(range(2,6)),1)[0]
    min_rows1 = random.sample(list(range(10,16)),1)[0]
    print("\n*** Building model 1 with the following parameters:")
    print("*** ntrees model 1: {0}".format(ntrees1))
    print("*** max_depth model 1: {0}".format(max_depth1))
    print("*** min_rows model 1: {0}".format(min_rows1))

    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    model1 = H2OGradientBoostingEstimator(ntrees=ntrees1,
                                          max_depth=max_depth1,
                                          min_rows=min_rows1,
                                          score_each_iteration=True,
                                          distribution=distribution)
    model1.train(x=predictors, y=response_col,training_frame=train,validation_frame=valid)

    # model1 = h2o.gbm(x=train[predictors],
    #                  y=train[response_col],
    #                  ntrees=ntrees1,
    #                  max_depth=max_depth1,
    #                  min_rows=min_rows1,
    #                  score_each_iteration=True,
    #                  distribution=distribution,
    #                  validation_x=valid[predictors],
    #                  validation_y=valid[response_col])

    # save the model, then load the model
    model_path = h2o.save_model(model1, path="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # continue building the model
    ntrees2 = ntrees1 + 5
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("\n*** Continuing to build model 1 (now called model 2) with the following parameters:")
    print("*** ntrees model 2: {0}".format(ntrees2))
    print("*** max_depth model 2: {0}".format(max_depth2))
    print("*** min_rows model 2: {0}".format(min_rows2))

    model2 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                          max_depth=max_depth2,
                                          min_rows=min_rows2,
                                          distribution=distribution,
                                          score_each_iteration=True,
                                          checkpoint=restored_model._id)
    model2.train(x=predictors,y=response_col,training_frame=train,validation_frame=valid)

    # model2 = h2o.gbm(x=train[predictors],
    #                  y=train[response_col],
    #                  ntrees=ntrees2,
    #                  max_depth=max_depth2,
    #                  min_rows=min_rows2,
    #                  distribution=distribution,
    #                  score_each_iteration=True,
    #                  validation_x=valid[predictors],
    #                  validation_y=valid[response_col],
    #                  checkpoint=restored_model._id)

    # continue building the model, but with different number of trees
    ntrees3 = ntrees2 + 50
    max_depth3 = max_depth1
    min_rows3 = min_rows1
    print("\n*** Continuing to build model 1 (now called model 3) with the following parameters:")
    print("*** ntrees model 3: {0}".format(ntrees3))
    print("*** max_depth model 3: {0}".format(max_depth3))
    print("*** min_rows model 3: {0}".format(min_rows3))

    model3 = H2OGradientBoostingEstimator(ntrees=ntrees3,
                                          max_depth=max_depth3,
                                          min_rows=min_rows3,
                                          distribution=distribution,
                                          score_each_iteration=True,
                                          checkpoint=restored_model._id)
    model3.train(x=predictors,y=response_col,training_frame=train,validation_frame=valid)

    # model3 = h2o.gbm(x=train[predictors],
    #                  y=train[response_col],
    #                  ntrees=ntrees3,
    #                  max_depth=max_depth3,
    #                  min_rows=min_rows3,
    #                  distribution=distribution,
    #                  score_each_iteration=True,
    #                  validation_x=valid[predictors],
    #                  validation_y=valid[response_col],
    #                  checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    print("\n*** Building the equivalent of model 2 (called model 4) in one shot:")

    model4 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                          max_depth=max_depth2,
                                          min_rows=min_rows2,
                                          distribution=distribution,
                                          score_each_iteration=True)
    model4.train(x=predictors,y=response_col,training_frame=train,validation_frame=valid)

    # model4 = h2o.gbm(x=train[predictors],
    #                  y=train[response_col],
    #                  ntrees=ntrees2,
    #                  max_depth=max_depth2,
    #                  min_rows=min_rows2,
    #                  distribution=distribution,
    #                  score_each_iteration=True,
    #                  validation_x=valid[predictors],
    #                  validation_y=valid[response_col])

    print("\n*** Model Summary for model 2:")
    print(model2.summary())
    print("\n*** Model Summary for model 3:")
    print(model3.summary())
    print("\n*** Model Summary for model 4:")
    print(model4.summary())

    print("\n*** Score History for model 2:")
    print(model2.scoring_history())
    print("\n*** Score History for model 3:")
    print(model3.scoring_history())
    print("\n*** Score History for model 4:")
    print(model4.scoring_history())

    # checks
    if problem == 0:
        assert isinstance(model2,type(model4))
        assert model2.mse(valid=True)==model4.mse(valid=True), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(model2.mse(valid=True), model4.mse(valid=True))
        #assert model3.mse(valid=True)!=model4.mse(valid=True), "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(model3.mse(valid=True), model4.mse(valid=True))

    elif problem == 1:
        assert isinstance(model2,type(model4))
        assert model2.auc(valid=True)==model4.auc(valid=True), "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(model2.auc(valid=True), model4.auc(valid=True))
        #assert model3.auc(valid=True)!=model4.auc(valid=True), "Expected Model 3 AUC: {0} to be different from Model 4 AUC: {1}".format(model3.auc(valid=True), model4.auc(valid=True))

        assert model2.logloss(valid=True)==model4.logloss(valid=True), "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(model2.logloss(valid=True), model4.logloss(valid=True))
        #assert model3.logloss(valid=True)!=model4.logloss(valid=True), "Expected Model 3 Log Loss: {0} to be different from Model 4 Log Loss: {1}".format(model2.logloss(valid=True), model4.logloss(valid=True))

        assert model2.giniCoef(valid=True)==model4.giniCoef(valid=True), "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}".format(model2.giniCoef(valid=True), model4.giniCoef(valid=True))
        #assert model3.giniCoef(valid=True)!=model4.giniCoef(valid=True), "Expected Model 3 Gini Coef: {0} to be different from Model 4 Gini Coef: {1}".format(model2.giniCoef(valid=True), model4.giniCoef(valid=True))

    else:
        assert isinstance(model2,type(model4))
        assert model2.mse(valid=True)==model4.mse(valid=True), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(model2.mse(valid=True), model4.mse(valid=True))
        #assert model3.mse(valid=True)!=model4.mse(valid=True), "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(model3.mse(valid=True), model4.mse(valid=True))

        assert model2.r2(valid=True)==model4.r2(valid=True), "Expected Model 2 R2: {0} to be the same as Model 4 R2: {1}".format(model2.r2(valid=True), model4.r2(valid=True))
test_y = test[y].as_data_frame()[0][1:]
print "Testing error is %f" %(sum(map(lambda t: t[0] != t[1],zip(test_pred,test_y)))*1.0/len(test_y))

# Take a look at the first 10 predictions
pred.head()

# Show predictions and the original test Y values as lists
# pred_list = pred.as_data_frame()[0]
# test_list = test.as_data_frame()[-1]

# The model object cannot be pickle-dumped to a file because it is an
# instance method.

# Show top 20 variable importance
model.varimp()[:20]
model_path = h2o.save_model(model, './mnist_dp_model/')  # can save to local disk, S3, or HDFS
h2o.load_model(model_path)  # the model is saved in the folder specified by model_path
#############


##############################################################################
# Perform 5-fold cross-validation on training_frame
model_cv = H2ODeepLearningEstimator(distribution="multinomial",
                activation="RectifierWithDropout",
                hidden=[32,32,32],
                input_dropout_ratio=0.2,
                sparse=True,
                l1=1e-5,
                epochs=10,
                nfolds=5)
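# Hedged continuation (frame and column names assumed from the surrounding
# MNIST example): the 5-fold CV model defined above still needs to be trained.
# model_cv.train(x=list(range(784)), y=784, training_frame=train)
# print(model_cv.mse(xval=True))  # cross-validated MSE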
features = list(range(0,784))
target = 784

train[target] = train[target].asfactor()
valid[target] = valid[target].asfactor()

# Build model
model = H2ODeepWaterEstimator(epochs=20, activation="Rectifier", hidden=[200,200], ignore_const_cols=False, mini_batch_size=256, input_dropout_ratio=0.1, hidden_dropout_ratios=[0.5,0.5], stopping_rounds=3, stopping_tolerance=0.05, stopping_metric="misclassification", score_interval=2, score_duty_cycle=0.5, score_training_samples=1000, score_validation_samples=1000, gpu=True, seed=1234)

model.train(x=features, y=target, training_frame=train, validation_frame=valid)

# Evaluate model
model.show()
print(model.scoring_history())

# Checkpoint model
model_path = h2o.save_model(model=model, force=True)

# Load model
model_ckpt = h2o.load_model(model_path)

# Start training from checkpoint
model_warm = H2ODeepWaterEstimator(checkpoint=model_ckpt.model_id, epochs=100, activation="Rectifier", hidden=[200,200], ignore_const_cols=False, mini_batch_size=256, input_dropout_ratio=0.1, hidden_dropout_ratios=[0.5,0.5], stopping_rounds=3, stopping_tolerance=0.05, stopping_metric="misclassification", score_interval=2, score_duty_cycle=0.5, score_training_samples=1000, score_validation_samples=1000, gpu=True, seed=1234)

model_warm.train(x=features, y=target, training_frame=train, validation_frame=valid)

# Evaluate checkpointed model
model_warm.show()
print(model_warm.scoring_history())
import random

import h2o
import numpy as np

h2o.init()

data = h2o.import_file(path='featurized_data')
y_labels = np.load('labels')
y_labels = (y_labels > 5) + 0  # binarize the labels: 1 if > 5, else 0
y_labels = np.reshape(y_labels, (len(y_labels), 1))

data = data.cbind(h2o.H2OFrame(y_labels))

r = data.runif()
train = data[r < 0.6]
valid = data[(r >= 0.6) & (r < 0.9)]
test = data[r >= 0.9]

y = data.col_names()[-1]
x = data.col_names()[:-1]

gbm = h2o.gbm(x = x, y = y, training_frame = train, validation_frame = valid, max_depth = 5, ntrees=500, learn_rate=0.2, distribution="bernoulli")

print(gbm)

row = int(random.random() * len(y_labels))
sample = data[row, :]
prediction = h2o.predict(gbm, sample)
print(prediction)

h2o.save_model(gbm, "model")