def test_hdfs_io():
    '''
    Test H2O read and write to HDFS
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/airlines/airlines_all.05p.csv")

    print("Spliting data")
    for c in ["Month","DayofMonth","IsArrDelayed"]:
        h2o_data[c] = h2o_data[c].asfactor()
    myX = ["Month","DayofMonth","Distance"]
    train,test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:,["Year","DayOfWeek"]], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert((test[:,"DayOfWeek"] - new_test[:,"DayOfWeek"]).sum() == 0)

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=myX, y="IsArrDelayed", training_frame=train) # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
    print("Running predictions")
    preds = new_model.predict(test)
def test_load_glrm():
  print("Importing iris_wheader.csv data...")
  irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
  irisH2O.describe()

  g_model = H2OGeneralizedLowRankEstimator(k=3)
  g_model.train(x=irisH2O.names, training_frame=irisH2O)
  yarch_old = g_model.archetypes()
  x_old = h2o.get_frame(g_model._model_json["output"]["representation_name"])
  predOld = g_model.predict(irisH2O)
  TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "../..", "results"))

  try:
    TMPDIR = pyunit_utils.locate("results")    # find directory path to results folder
  except Exception:
    os.makedirs(TMPDIR)
  h2o.save_model(g_model, path=TMPDIR, force=True)       # save model
  full_path_filename = os.path.join(TMPDIR, g_model._id)

  h2o.remove(g_model)
  model_reloaded = h2o.load_model(full_path_filename)
  pred = model_reloaded.predict(irisH2O)
  yarch = model_reloaded.archetypes()
  x = h2o.get_frame(model_reloaded._model_json["output"]["representation_name"])

  # assert that differences between old and new are small; archetypes should be the same
  pyunit_utils.compare_frames_local(x, x_old, tol=1e-6)
  pyunit_utils.compare_frames_local(pred[0], predOld[0], tol=1)
  for k in range(3):
    pyunit_utils.equal_two_arrays(yarch_old[k], yarch[k], eps = 1e-4, tolerance=1e-10)

  print("glrm model successfully loaded...")
def test_hadoop():
    '''
    Test H2O read and write to HDFS
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")

    print("Spliting data")
    train,test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:,0:2], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert((test[:,1] - new_test[:,1]).sum() == 0)

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=range(1, 10), y=0, training_frame=train) # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
    print("Running predictions")
    preds = new_model.predict(test)
def milsong_checkpoint(ip,port):

    milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50,100),1)[0]
    max_depth1 = random.sample(range(2,6),1)[0]
    min_rows1 = random.sample(range(10,16),1)[0]
    print "ntrees model 1: {0}".format(ntrees1)
    print "max_depth model 1: {0}".format(max_depth1)
    print "min_rows model 1: {0}".format(min_rows1)
    model1 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees1,max_depth=max_depth1, min_rows=min_rows1,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0])

    # save the model, then load the model
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print "ntrees model 2: {0}".format(ntrees2)
    print "max_depth model 2: {0}".format(max_depth2)
    print "min_rows model 2: {0}".format(min_rows2)
    model2 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0],
                     checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    model3 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0])
  def __init__(self, model_name, model_base_path, verbose=False):
    """
    Initialize the service.
        
    Args:
      model_name: The name of the model.
      model_base_path: The file path of the model.
      verbose: Whether to enable verbose logging.
    Return:
      None
    """
    super(H2oInferenceService, self).__init__()

    self.model_name = model_name
    self.model_base_path = model_base_path
    self.model_version_list = [1]
    self.model_graph_signature = ""
    self.platform = "H2o"
    self.verbose = verbose

    import h2o

    logging.info("Try to initialize and connect the h2o server")
    h2o.init()

    logging.info("Try to load the h2o model")
    model = h2o.load_model(model_base_path)

    self.model = model
    # TODO: Update the signature with readable string
    self.model_graph_signature = "{}".format(self.model.full_parameters)
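
# A minimal usage sketch for the service defined above; the model name and
# base path below are hypothetical:
#
#   service = H2oInferenceService("churn_gbm", "/models/churn_gbm", verbose=True)
#   print(service.model_graph_signature)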
def deepwater_checkpoint():
  if not H2ODeepWaterEstimator.available(): return

  ## build a model
  #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
  frame.drop(0)
  frame[1] = frame[1].asfactor()
  print(frame.head(5))
  model = H2ODeepWaterEstimator(epochs=50, learning_rate=1e-5, stopping_rounds=0, score_duty_cycle=1, train_samples_per_iteration=-1, score_interval=0)
  model.train(y=1, training_frame=frame)

  ## save the model
  model_path = h2o.save_model(model)

  ## delete everything - simulate cluster shutdown and restart
  h2o.remove_all()

  ## reimport the model and the frame
  model = h2o.load_model(model_path)
  #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
  frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
  frame.drop(0)
  frame[1] = frame[1].asfactor()
  
  ## delete the checkpoint file
  os.remove(model_path)

  ## continue training
  model2 = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-5, stopping_rounds=0,score_duty_cycle=1, train_samples_per_iteration=-1, score_interval=0, checkpoint=model.model_id)
  model2.train(y=1, training_frame=frame)
  model2.show()
def stackedensemble_binary_test():
    # Import a sample binary outcome train/test set into H2O
    train = h2o.import_file(pyunit_utils.locate("smalldata/higgs/higgs_train_10k.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"))

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)

    # For binary classification, response should be a factor
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Number of CV folds (to generate level-one data for stacking)
    nfolds = 5

    # 1. Generate a 2-model ensemble (GBM + RF)

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli",
                                          ntrees=10,
                                          max_depth=3,
                                          min_rows=2,
                                          learn_rate=0.2,
                                          nfolds=nfolds,
                                          fold_assignment="Modulo",
                                          keep_cross_validation_predictions=True,
                                          seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)


    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=50,
                                     nfolds=nfolds,
                                     fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True,
                                     seed=1)
    my_rf.train(x=x, y=y, training_frame=train)


    # Train a stacked ensemble using the GBM and DRF above
    ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble_binomial",
                                           base_models=[my_gbm.model_id, my_rf.model_id])
    ensemble.train(x=x, y=y, training_frame=train)

    # Predict with the ensemble in the Python client
    preds_py = ensemble.predict(test)

    # Load the binary model and predict
    bin_model = h2o.load_model(pyunit_utils.locate("smalldata/binarymodels/stackedensemble/ensemble_higgs"))
    preds_bin = bin_model.predict(test)

    # Predictions from the Python model and the binary model should be the same
    pred_diff = preds_bin - preds_py
    assert pred_diff["p0"].max() < 1e-11
    assert pred_diff["p1"].max() < 1e-11
    assert pred_diff["p0"].min() > -1e-11
    assert pred_diff["p1"].min() > -1e-11
def milsong_checkpoint():

  milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
  milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
  distribution = "gaussian"

  # build first model
  ntrees1 = random.sample(range(50,100),1)[0]
  max_depth1 = random.sample(range(2,6),1)[0]
  min_rows1 = random.sample(range(10,16),1)[0]
  print "ntrees model 1: {0}".format(ntrees1)
  print "max_depth model 1: {0}".format(max_depth1)
  print "min_rows model 1: {0}".format(min_rows1)


  model1 = H2OGradientBoostingEstimator(ntrees=ntrees1,
                                        max_depth=max_depth1,
                                        min_rows=min_rows1,
                                        distribution=distribution)
  model1.train(x=range(1,milsong_train.ncol),
               y=0,
               training_frame=milsong_train,
               validation_frame=milsong_valid)

  # save the model, then load the model
  path = pyunit_utils.locate("results")

  assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
  model_path = h2o.save_model(model1, path=path, force=True)

  assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
  restored_model = h2o.load_model(model_path)

  # continue building the model
  ntrees2 = ntrees1 + 50
  max_depth2 = max_depth1
  min_rows2 = min_rows1
  print "ntrees model 2: {0}".format(ntrees2)
  print "max_depth model 2: {0}".format(max_depth2)
  print "min_rows model 2: {0}".format(min_rows2)
  model2 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                        max_depth=max_depth2,
                                        min_rows=min_rows2,
                                        distribution=distribution,
                                        checkpoint=restored_model.model_id)
  model2.train(x=range(1,milsong_train.ncol),
               y=0,
               training_frame=milsong_train,
               validation_frame=milsong_valid)

  model3 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                        max_depth=max_depth2,
                                        min_rows=min_rows2,
                                        distribution=distribution)

  model3.train(x=range(1,milsong_train.ncol),
               y=0,
               training_frame=milsong_train,
               validation_frame=milsong_valid)
def _load_model(path, init=False):
    import h2o
    path = os.path.abspath(path)
    with open(os.path.join(path, "h2o.yaml")) as f:
        params = yaml.safe_load(f.read())
    if init:
        h2o.init(**(params["init"] if "init" in params else {}))
        h2o.no_progress()
    return h2o.load_model(os.path.join(path, params['model_file']))
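
# A minimal sketch of the on-disk layout _load_model above expects. The key
# names "init" and "model_file" come from the function itself; the directory
# and model id below are hypothetical:
#
#   /path/to/artifact/
#       h2o.yaml            # e.g. init: {nthreads: -1}  /  model_file: GBM_model_python_1
#       GBM_model_python_1  # binary model saved earlier with h2o.save_model
#
#   model = _load_model("/path/to/artifact", init=True)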
    def post(self, request):
        try:
            # Receive the data sent with the request
            model_id = request.POST.get('model_id')
            csv_prever = request.FILES['csv_prever']

            if model_id:
                # Look up the model by model_id using the Django ORM
                modelo_processado = ModeloMachineLearningProcessado.objects.get(
                    model_id=model_id)

                # Look up the processing run linked to the processed model using the Django ORM
                processamento = modelo_processado.processamentos.first()
            else:
                # if no model_id was given, use the best model from the most recent processing run
                processamento = ProcessamentoModeloMachineLearning.objects.all(
                ).first()  # the most recent processing run
                modelo_processado = processamento.modelos_processados.all(
                ).first()
                # the first model is always the best

            # read the prediction file using the pandas library
            teste = pd.read_csv(csv_prever, sep=";")
            colunas_enviadas = ','.join(teste.columns.tolist())
            if processamento.variaveis_independentes != colunas_enviadas:
                raise Exception(
                    'Error in the prediction file layout: this model requires the following '
                    'columns: "{variaveis_independentes}", but you sent the columns: "{colunas_enviadas}"'
                    .format(variaveis_independentes=processamento.
                            variaveis_independentes,
                            colunas_enviadas=colunas_enviadas))

            # Initialize the connection to h2o
            h2o.init()
            teste = h2o.H2OFrame(teste)

            # Load the model binary
            modelo_automl = h2o.load_model(
                modelo_processado.binario_modelo.name)
            prever = modelo_automl.predict(teste)

            data_frame = prever.as_data_frame()
            """
                Formatar os dados de uma forma mais simples, para percorrer depois no JavaScript
            """
            previsoes = list()

            for i in range(len(data_frame['predict'])):
                previsoes.append({
                    'predict': data_frame['predict'][i],
                    'p0': data_frame['p0'][i],
                    'p1': data_frame['p1'][i]
                })

            return Response(status=201, data={'previsoes': previsoes})
        except Exception as e:
            return Response(status=401, data={'Erro': str(e)})
    def __init__(self):

        print('Starting Java virtual machine')
        h2o.init(nthreads=-1, max_mem_size=8)
        print('Machine started!')

        print('Loading model from %s...' % MODEL_PATH)
        self.model = h2o.load_model(MODEL_PATH)
        print('Model Loaded')
def run_h2o(train_file_path: str,
            test_file_path: str,
            task: MachineLearningTasksEnum,
            case_name='h2o_default'):
    config_data = get_models_hyperparameters()['H2O']
    max_models = config_data['MAX_MODELS']
    max_runtime_secs = config_data['MAX_RUNTIME_SECS']

    result_filename = f'{case_name}_m{max_models}_rs{max_runtime_secs}_{task.name}'
    exported_model_path = os.path.join(CURRENT_PATH, result_filename)

    # TODO Regression
    if result_filename not in os.listdir(CURRENT_PATH):
        train_data = InputData.from_csv(train_file_path)
        best_model = fit_h2o(train_data)
        temp_exported_model_path = h2o.save_model(model=best_model,
                                                  path=CURRENT_PATH)

        os.renames(temp_exported_model_path, exported_model_path)

    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    imported_model = h2o.load_model(exported_model_path)

    test_frame = InputData.from_csv(test_file_path)
    true_target = test_frame.target

    predictions = predict_h2o(imported_model, test_frame)

    if task is MachineLearningTasksEnum.classification:
        train_roc_auc_value = round(imported_model.auc(train=True), 3)
        valid_roc_auc_value = round(imported_model.auc(valid=True), 3)
        test_roc_auc_value = round(roc_auc_score(true_target, predictions), 3)

        metrics = {
            'H2O_ROC_AUC_train': train_roc_auc_value,
            'H2O_ROC_AUC_valid': valid_roc_auc_value,
            'H2O_ROC_AUC_test': test_roc_auc_value
        }

        print(f"H2O_ROC_AUC_train: {metrics['H2O_ROC_AUC_train']}")
        print(f"H2O_ROC_AUC_valid: {metrics['H2O_ROC_AUC_valid']}")
        print(f"H2O_ROC_AUC_test: {metrics['H2O_ROC_AUC_test']}")
    else:
        mse_train = imported_model.mse()
        rmse_train = imported_model.rmse()

        metrics = {'H2O_MSE_train': mse_train, 'H2O_RMSE_train': rmse_train}

        print(f"H2O_MSE_train: {metrics['H2O_MSE_train']}")
        print(f"H2O_RMSE_train: {metrics['H2O_RMSE_train']}")

    h2o.shutdown(prompt=False)

    return metrics
def test_modelselection_serialization():
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    allsubsets_model = modelSelection(seed=12345,
                                      max_predictor_number=7,
                                      mode="allsubsets")
    allsubsets_model.train(training_frame=d, x=my_x, y=my_y)
    tmpdir = tempfile.mkdtemp()
    model_path_allsubsets = allsubsets_model.download_model(tmpdir)
    maxr_model = modelSelection(seed=12345,
                                max_predictor_number=7,
                                mode="maxr")
    maxr_model.train(training_frame=d, x=my_x, y=my_y)
    model_path_maxr = maxr_model.download_model(tmpdir)

    h2o.remove_all()
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    loaded_allsubsets_model = h2o.load_model(model_path_allsubsets)
    result_frame_allsubsets = loaded_allsubsets_model.result()
    numRows = result_frame_allsubsets.nrows
    modelIDs_allsubsets = loaded_allsubsets_model._model_json["output"][
        "best_model_ids"]
    loaded_maxr_model = h2o.load_model(model_path_maxr)
    modelIDs_maxr = loaded_allsubsets_model._model_json["output"][
        "best_model_ids"]
    for ind in list(range(numRows)):
        model_from_frame_allsubsets = h2o.get_model(
            result_frame_allsubsets["model_id"][ind, 0])
        pred_frame_allsubsets = model_from_frame_allsubsets.predict(d)
        model_from_id_allsubsets = h2o.get_model(
            modelIDs_allsubsets[ind]['name'])
        pred_id_allsubsets = model_from_id_allsubsets.predict(d)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets,
                                          pred_id_allsubsets,
                                          prob=1)
        model_from_id_maxr = h2o.get_model(modelIDs_maxr[ind]['name'])
        pred_id_maxr = model_from_id_maxr.predict(d)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets,
                                          pred_id_maxr,
                                          prob=1)
def h2o_pred(test):
    h2oTest = h2o.H2OFrame(test)
    saved_model = h2o.load_model(os.getcwd()+"/XGBoost_1_AutoML_20181105_211213")
    preds = saved_model.predict(h2oTest)
    preds = preds.as_data_frame()
    print(saved_model.model_performance(h2oTest))
    print("========================================================")
    print("Saving prediction into csv file")
    preds.to_csv("test_predict.csv")
    return preds
    def load(self, path):
        try:
            import h2o
        except ImportError:
            raise MissingDependencyException(
                "h2o package is required to use H2oModelArtifact")

        h2o.init()
        model = h2o.load_model(self._model_file_path(path))
        return self.pack(model)
 def _read(self):
     # need to init h2o context so as to use h2o.load_model
     utils.getH2oContext()
     # Load the first file under _file_path which should be the model file.
     # We can do this because if we change the model file, the hash of hash would change
     # the _file_path and then we would save the model under different path.
     h2o_model_file = os.listdir(self._file_path)[0]
     h2o_model_path = os.path.join(self._file_path, h2o_model_file)
     import h2o
     return h2o.load_model(h2o_model_path)
def milsong_checkpoint():

    milsong_train = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(
        pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print "ntrees model 1: {0}".format(ntrees1)
    print "max_depth model 1: {0}".format(max_depth1)
    print "min_rows model 1: {0}".format(min_rows1)

    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    model1 = H2OGradientBoostingEstimator(ntrees=ntrees1,
                                          max_depth=max_depth1,
                                          min_rows=min_rows1,
                                          distribution=distribution)
    model1.train(x=range(1, milsong_train.ncol),
                 y=0,
                 training_frame=milsong_train,
                 validation_frame=milsong_valid)

    # save the model, then load the model
    path = pyunit_utils.locate("results")

    assert os.path.isdir(
        path), "Expected save directory {0} to exist, but it does not.".format(
            path)
    model_path = h2o.save_model(model1, path=path, force=True)

    assert os.path.isdir(
        model_path
    ), "Expected load directory {0} to exist, but it does not.".format(
        model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print "ntrees model 2: {0}".format(ntrees2)
    print "max_depth model 2: {0}".format(max_depth2)
    print "min_rows model 2: {0}".format(min_rows2)
    model2 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                          max_depth=max_depth2,
                                          min_rows=min_rows2,
                                          distribution=distribution,
                                          checkpoint=restored_model.model_id)
    model2.train(x=range(1, milsong_train.ncol),
                 y=0,
                 training_frame=milsong_train,
                 validation_frame=milsong_valid)
def milsong_checkpoint(ip, port):

    milsong_train = h2o.upload_file(
        h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(
        h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print "ntrees model 1: {0}".format(ntrees1)
    print "max_depth model 1: {0}".format(max_depth1)
    print "min_rows model 1: {0}".format(min_rows1)
    model1 = h2o.gbm(x=milsong_train[1:],
                     y=milsong_train[0],
                     ntrees=ntrees1,
                     max_depth=max_depth1,
                     min_rows=min_rows1,
                     distribution=distribution,
                     validation_x=milsong_valid[1:],
                     validation_y=milsong_valid[0])

    # save the model, then load the model
    model_path = h2o.save_model(model1, force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print "ntrees model 2: {0}".format(ntrees2)
    print "max_depth model 2: {0}".format(max_depth2)
    print "min_rows model 2: {0}".format(min_rows2)
    model2 = h2o.gbm(x=milsong_train[1:],
                     y=milsong_train[0],
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     distribution=distribution,
                     validation_x=milsong_valid[1:],
                     validation_y=milsong_valid[0],
                     checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    model3 = h2o.gbm(x=milsong_train[1:],
                     y=milsong_train[0],
                     ntrees=ntrees2,
                     max_depth=max_depth2,
                     min_rows=min_rows2,
                     distribution=distribution,
                     validation_x=milsong_valid[1:],
                     validation_y=milsong_valid[0])
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    df = pyunit_utils.random_dataset("regression",
                                     seed=1234)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3,
                                               transform=transformN,
                                               max_iterations=10,
                                               seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(
        glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows==train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..",
                     "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(
        TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True)  # save mojo predict

    h2o.save_model(glrmModel, TMPDIR)  # save GLRM model
    glrmModel2 = h2o.load_model(os.path.join(TMPDIR, MOJONAME))
    predict_model = glrmModel2.predict(test)
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
            predict_model[col] = predict_model[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    print("Comparing mojo predict and h2o predict from saved model...")
    pyunit_utils.compare_frames_local(pred_mojo, predict_model, 1, tol=1e-10)
    frameID, mojoXFactor = pyunit_utils.mojo_predict(
        glrmModel, TMPDIR, MOJONAME,
        glrmReconstruct=False)  # save mojo XFactor
    glrmTestFactor = h2o.get_frame(
        "GLRMLoading_" + frameID)  # store the x Factor for new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor,
                                      mojoXFactor,
                                      1,
                                      tol=1e-10)
 def transform(self, X: dt.Frame):
     h2o.init()
     model_path = os.path.join(temporary_files_path, self.id)
     with open(model_path, "wb") as f:
         f.write(self.raw_model_bytes)
     model = h2o.load_model(model_path)
     os.remove(model_path)
     frame = h2o.H2OFrame(X.to_pandas())
     try:
         return model.anomaly(frame).as_data_frame(header=False)
     finally:
         h2o.remove(self.id)
def load_model_predict(col_to_predict, test_data):
    if col_to_predict == 'animal':
        model_path = "mymodel_animal"
    elif col_to_predict == "item":
        model_path = "mymodel_item"
    elif col_to_predict == "destino":
        model_path = "mymodel_dest"

    h2o_test = h2o.H2OFrame(test_data)
    model = h2o.load_model(model_path)
    predictions = model.predict(h2o_test)
    return predictions.columns
def load_h2o_model(local_dir, filename, extension=""):
    """
    Loads a saved H2O Model

    :param string local_dir: Local directory where the model is saved
    :param string filename: Filename with which the model is saved
    :param string extension: Extension to the filename with which the model is saved
    :return:
    """

    from h2o import load_model
    return load_model(local_dir + "/" + filename + extension)
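
# A hypothetical call to load_h2o_model above, assuming a model was saved
# earlier with h2o.save_model under ./models with the id "GBM_model_python_1":
#
#   model = load_h2o_model("./models", "GBM_model_python_1")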
def save_load_model(ip,port):
    # Connect to h2o
    h2o.init(ip,port)

    prostate = h2o.import_frame(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE","RACE","PSA","DCAPS"]], family = "binomial",
                           alpha = [0.5])
    model_path = h2o.save_model(prostate_glm, name="delete_model", force=True)
    the_model = h2o.load_model(model_path)

    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def predict_activities(data_dict):
    """
    food_hours,	food_calories,	active_hours,	active_rating,	sleep_hours,	sleep_duration,	comfort_rating

    """
    h2o.init()
    h2o_model = h2o.load_model(os.path.join(BASE_DIR, os.path.join('h2o_models', 'xgboost-activities')))
    data_row = h2o.H2OFrame(data_dict,
                            column_names=['food_hours', 'food_calories', 'active_hours', 'active_rating', 'sleep_hours',
                                          'sleep_duration', 'comfort_rating'])
    data_prediction = h2o_model.predict(data_row)
    return data_prediction
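
# A hedged usage sketch for predict_activities above; the feature values are
# made up and follow the column order listed in the docstring:
#
#   pred = predict_activities([[3.0, 2100.0, 1.5, 4, 8.0, 7.5, 3]])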
def save_load_model(ip,port):

    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE","RACE","PSA","DCAPS"]], family = "binomial",
                           alpha = [0.5])
    model_path = h2o.save_model(prostate_glm,force=True)
    the_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)

    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def save_load_model():

    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE","RACE","PSA","DCAPS"]], family = "binomial",
                           alpha = [0.5])
    model_path = h2o.save_model(prostate_glm,force=True)
    the_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)

    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def test_hadoop():
    '''
    Test H2O read and write to HDFS
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")

    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=range(1, 10), y=0, training_frame=h2o_data) # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    h2o.save_model(h2o_glm, "hdfs://" + hdfs_model_path)

    new_model = h2o.load_model("hdfs://" + hdfs_model_path)
def plot():
    model = h.load_model('D:/AI/AI_Hub/Head Unit Data/rf_covType_v1')
    var_im = (model.varimp(1))
    var_im = var_im[:5]
    sns_plot = sns.barplot(x=var_im.variable,
                           y=var_im.percentage,
                           palette="Blues_d")
    sns_plot.set_xticklabels(sns_plot.get_xticklabels(),
                             rotation=15,
                             fontsize=8)
    sns_plot.set(xlabel='Features', ylabel='Variable Importance')
    fig = sns_plot.get_figure()
    fig.savefig("D:/AI/static/plot.jpg")
    return render_template('plot.html')
 def get_model(cls):
     """Get the model object for this instance,
     loading it if it's not already loaded."""
     if cls.model is None:
         for file in os.listdir(model_path):
             # Assumes that 'GBM' is somewhere in the filename of a
             # model that's been generated. We just load the first model
             # that satisfies this constraint, so caveat emptor if you've
             # run the 'train' script multiple times - this may still load
             # the first model. An obvious to-do is to improve this :-)
             if 'GBM' in file:
                 cls.model = h2o.load_model(os.path.join(model_path, file))
                 break
     return cls.model
 def predict(self):
     # Reuse the saved Random Forest model
     model = h2o.load_model('Misc/models/RF_Insurance_model/DRF_model_python_1591600273347_1')
     # Convert it to a Spark DataFrame
     customer = self.standardize(self.customer)
     org_df = self.sc.parallelize([customer]).toDF()
     df = self.transform_data(org_df)
     # Obtain a result using the saved model
     pred = model.predict(self.hc.asH2OFrame(df))
     prediction_df = self.hc.asSparkFrame(pred)
     prediction_df = prediction_df.withColumn("predict", functions.round("predict", 0))
     prediction_df = prediction_df.withColumn("predict", prediction_df["predict"].cast(types.IntegerType()))
     result = prediction_df.collect()[0].predict
     return result
def test_auto_ml(_model, _df_test):
    print('>>>>>>>>>>>>>> Import model and test set')

    model_path = join(paths.DIR_MODELS, _model)
    model = h2o.load_model(model_path)
    hf_test = h2o.H2OFrame(_df_test)

    print('>>>>>>>>>>>>>> Predict results for test set')
    df_pred = model.predict(hf_test).as_data_frame()

    print('>>>>>>>>>>>>>> Calculate mean absolute error')
    m_a_e = mean_absolute_error(df_pred.values,
                                _df_test[features.PRICE].values)
    print('Mean absolute error:  {}'.format(m_a_e))
def save_load_model():
    prostate = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    prostate_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0.5])
    prostate_glm.train(x=["AGE","RACE","PSA","DCAPS"], y="CAPSULE", training_frame=prostate)
    path = pyunit_utils.locate("results")

    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(prostate_glm, path=path, force=True)

    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    the_model = h2o.load_model(model_path)

    assert isinstance(the_model, H2OEstimator), "Expected an H2OEstimator, but got {0}".format(the_model)
def get_prediction(model_path, iters, data, batch_size):
    h2o.init(nthreads=-1)
    model = h2o.load_model(model_path)
    prediction = []
    warnings.filterwarnings("ignore")
    t1= datetime.datetime.now()
    for i in range(0, iters+1):
        _data = data.iloc[(i*batch_size):((i+1)*batch_size)]
        _data_h20 = h2o.H2OFrame(_data)
        pred = model.predict(_data_h20)
        prediction += list(pred.as_data_frame(use_pandas=True)['predict'])
    t2 = datetime.datetime.now()
    print("total run time :", round((t2 - t1).total_seconds() / 60, 2))
    h2o.shutdown(prompt=False)
    return prediction
def milsong_checkpoint():

    milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print "ntrees model 1: {0}".format(ntrees1)
    print "max_depth model 1: {0}".format(max_depth1)
    print "min_rows model 1: {0}".format(min_rows1)
    model1 = H2ORandomForestEstimator(ntrees=ntrees1, max_depth=max_depth1, min_rows=min_rows1, seed=1234)

    model1.train(x=range(1, milsong_train.ncol), y=0, training_frame=milsong_train, validation_frame=milsong_valid)

    # save the model, then load the model
    path = pyunit_utils.locate("results")

    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)

    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print "ntrees model 2: {0}".format(ntrees2)
    print "max_depth model 2: {0}".format(max_depth2)
    print "min_rows model 2: {0}".format(min_rows2)

    model2 = H2ORandomForestEstimator(
        ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, checkpoint=restored_model._id, seed=1234
    )
    model2.train(x=range(1, milsong_train.ncol), y=0, training_frame=milsong_train, validation_frame=milsong_valid)

    # build the equivalent of model 2 in one shot
    model3 = H2ORandomForestEstimator(ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, seed=1234)
    model3.train(x=range(1, milsong_train.ncol), y=0, training_frame=milsong_train, validation_frame=milsong_valid)

    assert isinstance(model2, type(model3))
    assert model2.mse(valid=True) == model3.mse(
        valid=True
    ), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(
        model2.mse(valid=True), model3.mse(valid=True)
    )
def clv_clustering_scoring(new_customer_details):

    # Reading data from the sample csv file
    print("Processing Step 1 --> Reading in the sample data")
    all_data = pd.read_csv('..\\..\\99_sample_data\\custclv.csv')

    #h2o.shutdown()
    h2o.init(ip="127.0.0.1", https=True, insecure=True)  # initializing h2o
    predictors = [
        'rfm_score', 'Gender', 'Age', 'HouseType', 'ContactAvailability',
        'HomeCountry', 'CreditScore', 'CLV', 'MonthlyValue', 'ActiveMonths'
    ]

    # Data Cleaning
    all_data['Gender'] = pd.Categorical(all_data.Gender).codes
    all_data['ContactAvailability'] = pd.Categorical(
        all_data.ContactAvailability).codes
    all_data['HouseType'] = pd.Categorical(all_data.HouseType).codes
    all_data['HomeCountry'] = pd.Categorical(all_data.HomeCountry).codes
    all_data_h20 = h2o.H2OFrame(all_data)

    train, test = all_data_h20.split_frame([0.8], seed=123)
    train = train[:, 1:11]
    test = test[:, 1:11]

    # Loading the H2O model
    print("Processing Step 2 --> Loading the H2O model into the solution")
    estimator = h2o.load_model(
        "..\\..\\02_models\\KMeans_model_python_1537328280878_1")

    trained = estimator.predict(all_data_h20)
    all_data_h20['cluster'] = trained["predict"].asfactor()
    all_data_h20 = all_data_h20.as_data_frame()

    print("Processing Step 3 --> Scoring for a sample customer")
    sample_customer = test[2, :]
    print(sample_customer)
    predicted = estimator.predict(sample_customer)
    print("Predicted Cluster : ", predicted["predict"].asfactor())

    # Calculate average CLV of that cluster
    req_value = int(predicted["predict"].asfactor())
    req_data = all_data_h20[all_data_h20['cluster'] == req_value]
    avg_clv = req_data['CLV'].mean()
    print('CLV for new customer = ', str(avg_clv))
    print('Process Complete')

    return avg_clv
def get_newest_model():
    # Find newest model
    model_paths = [path for path in os.listdir('models/') if 'automl_model' in path]
    model_dates = [model_path.split('automl_model_')[1] for model_path in model_paths]
    
    # Check that there is a model in the path
    assert len(model_paths) > 0
    
    # Get most recent model date and the path to the most recent model
    newest_model_date = max(model_dates)
    newest_model_idx = model_dates.index(newest_model_date)
    model_path_to_load = 'models/' + model_paths[newest_model_idx]
    
    # Load newest model
    aml = h2o.load_model(model_path_to_load)
    return aml
def inference(model, myCSV, threshold=0.920060452296198):
    model_name = f"{config.MODEL_PATH}{model}"
    model = h2o.load_model(model_name)

    h2odf = h2o.H2OFrame(pd.read_csv(myCSV), destination_frame="testData.hex")
    df = h2odf.as_data_frame()
    predictions = model.predict(h2odf)
    # df['alert_h2o'] = predictions.as_data_frame().predict
    df['Probability_COVID19'] = predictions.as_data_frame().iloc[:, 2]
    df['COVID19_Status'] = df['Probability_COVID19'].map(
        lambda x: 1 if x <= threshold else 0)
    df['Probability_COVID19'] = 1 - df['Probability_COVID19']
    cols = df.columns.tolist()
    df = df[cols[-2:] + cols[:-2]]

    return df
def save_load_model():

    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE","RACE","PSA","DCAPS"]], family = "binomial",
                           alpha = [0.5])

    path = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","results"))

    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(prostate_glm, path=path, force=True)

    assert os.path.isdir(model_path), "Expected load directory {0} to exist, but it does not.".format(model_path)
    the_model = h2o.load_model(model_path)

    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def download_model_filename():
    fr = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))

    model = H2OGradientBoostingEstimator(ntrees=10, seed=1234)
    model.train(x=list(range(2, fr.ncol)), y=1, training_frame=fr)

    # Default filename is model_id
    model_path = model.download_model()
    # It should be saved in server working directory
    assert model_path.endswith(
        model.model_id), "Not expected path: {0}".format(model_path)
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Default filename is model_id
    tmpdir = tempfile.mkdtemp()
    model_path = model.download_model(tmpdir)
    assert_equals(os.path.join(tmpdir, model.model_id), model_path,
                  "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with custom path
    model_path = model.download_model(tmpdir, filename="gbm_prostate")
    assert_equals(os.path.join(tmpdir, "gbm_prostate"), model_path,
                  "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with custom path
    model_path = model.download_model(tmpdir, filename="gbm_prostate.model")
    assert_equals(os.path.join(tmpdir, "gbm_prostate.model"), model_path,
                  "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with custom path
    model_path = model.download_model(tmpdir,
                                      filename=os.path.join(
                                          "not-existing-folder",
                                          "gbm_prostate.model"))
    assert_equals(
        os.path.join(tmpdir, "not-existing-folder", "gbm_prostate.model"),
        model_path, "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with default path
    model_path = model.download_model(filename="gbm_prostate2.model")
    assert model_path.endswith(
        "gbm_prostate2.model"), "Not expected path: {0}".format(model_path)
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)
def index():

    if request.method == 'GET':
        return render_template('index.html')

    elif request.method == 'POST':
        loan_amnt = float(request.form.get('loan_amnt', 5000))
        term = request.form.get('term', '36 months')
        emp_length = request.form.get('emp_length', '1 year')
        home_ownership = request.form.get('home_ownership', 'RENT')
        purpose = request.form.get('purpose', 'credit card')
        addr_state = request.form.get('addr_state', 'AL')
        annual_inc = float(request.form.get('annual_inc', 0))
        inq_last_6mths = float(request.form.get('inq_last_6mths', 0))

        col_names = [
            'loan_amnt', 'term', 'emp_length', 'home_ownership', 'purpose',
            'addr_state', 'annual_inc', 'inq_last_6mths'
        ]

        df = h2o.H2OFrame.from_python(
            [(loan_amnt, term, emp_length, home_ownership, purpose, addr_state,
              annual_inc, inq_last_6mths)],
            column_names=col_names)
        # Convert string variables into factors
        string_vars = [i[0] for i in df.types.items() if i[1] == 'string']
        for var in string_vars:
            df[var] = df[var].asfactor()

        saved_model = h2o.load_model(
            '/assets/flask_deployment_demo/GBM_model_python_1515678740025_16')
        predicted = saved_model.predict(df)
        predicted_df = predicted.as_data_frame()
        default_prob = round(predicted_df['default'][0], 3)

        # Append "predicted scores" to original DF.
        df_predictions = df.cbind(predicted)

        if default_prob < 0.5:
            result = 'Likely to not default'
        else:
            result = 'Likely to default'

        return render_template('index.html', result=result, default_prob=default_prob, \
                loan_amnt=loan_amnt, term=term, emp_length=emp_length, home_ownership=home_ownership, \
                purpose=purpose, addr_state=addr_state, annual_inc=annual_inc, inq_last_6mths=inq_last_6mths)
    def transform(self, X: dt.Frame):
        h2o.init(port=config.h2o_recipes_port)
        model_path = os.path.join(temporary_files_path, self.id)
        with open(model_path, "wb") as f:
            f.write(self.raw_model_bytes)
        model = h2o.load_model(os.path.abspath(model_path))
        remove(model_path)
        frame = h2o.H2OFrame(X.to_pandas())
        anomaly_frame = None

        try:
            anomaly_frame = model.anomaly(frame)
            anomaly_frame_df = anomaly_frame.as_data_frame(header=False)
            return anomaly_frame_df
        finally:
            h2o.remove(self.id)
            h2o.remove(anomaly_frame)
def predict_churn(State, AccountLength, AreaCode, Phone, IntlPlan, VMailPlan,
                  VMailMessage, DayMins, DayCalls, DayCharge, EveMins,
                  EveCalls, EveCharge, NightMins, NightCalls, NightCharge,
                  IntlMins, IntlCalls, IntlCharge, CustServCalls):
    # connect to the model scoring service
    h2o.connect()

    # open the downloaded model
    ChurnPredictor = h2o.load_model(path='AutoML-leader')

    # define a feature vector to evaluate with the model
    newData = pd.DataFrame(
        {
            'State': State,
            'Account Length': AccountLength,
            'Area Code': AreaCode,
            'Phone': Phone,
            'Int\'l Plan': IntlPlan,
            'VMail Plan': VMailPlan,
            'VMail Message': VMailMessage,
            'Day Mins': DayMins,
            'Day Calls': DayCalls,
            'Day Charge': DayCharge,
            'Eve Mins': EveMins,
            'Eve Calls': EveCalls,
            'Eve Charge': EveCharge,
            'Night Mins': NightMins,
            'Night Calls': NightCalls,
            'Night Charge': NightCharge,
            'Intl Mins': IntlMins,
            'Intl Calls': IntlCalls,
            'Intl Charge': IntlCharge,
            'CustServ Calls': CustServCalls
        },
        index=[0])

    # evaluate the feature vector using the model
    predictions = ChurnPredictor.predict(h2o.H2OFrame(newData))
    predictionsOut = h2o.as_list(predictions, use_pandas=False)
    prediction = predictionsOut[1][0]
    probabilityChurn = predictionsOut[1][1]
    probabilityRetain = predictionsOut[1][2]
    return "Prediction: " + str(prediction) + " |Probability to Churn: " + str(
        probabilityChurn) + " |Probability to Retain: " + str(
            probabilityRetain)
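
# A hypothetical invocation of predict_churn above (all feature values below
# are illustrative only):
#
#   print(predict_churn('KS', 128, 415, '382-4657', 'no', 'yes', 25,
#                       265.1, 110, 45.07, 197.4, 99, 16.78,
#                       244.7, 91, 11.01, 10.0, 3, 2.7, 1))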
def deepwater_checkpoint():
    if not H2ODeepWaterEstimator.available(): return

    ## build a model
    #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
    frame = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    frame.drop(0)
    frame[1] = frame[1].asfactor()
    print(frame.head(5))
    model = H2ODeepWaterEstimator(epochs=50,
                                  learning_rate=1e-5,
                                  stopping_rounds=0,
                                  score_duty_cycle=1,
                                  train_samples_per_iteration=-1,
                                  score_interval=0)
    model.train(y=1, training_frame=frame)

    ## save the model
    model_path = h2o.save_model(model)

    ## delete everything - simulate cluster shutdown and restart
    h2o.remove_all()

    ## reimport the model and the frame
    model = h2o.load_model(model_path)
    #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
    frame = h2o.import_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    frame.drop(0)
    frame[1] = frame[1].asfactor()

    ## delete the checkpoint file
    os.remove(model_path)

    ## continue training
    model2 = H2ODeepWaterEstimator(epochs=100,
                                   learning_rate=1e-5,
                                   stopping_rounds=0,
                                   score_duty_cycle=1,
                                   train_samples_per_iteration=-1,
                                   score_interval=0,
                                   checkpoint=model.model_id)
    model2.train(y=1, training_frame=frame)
    model2.show()
def isolation_forest_save_and_load():
    print("Isolation Forest Smoke Test")

    train = h2o.import_file(pyunit_utils.locate("smalldata/anomaly/ecg_discord_train.csv"))

    if_model = H2OIsolationForestEstimator(ntrees=7, seed=12, sample_size=5)
    if_model.train(training_frame=train)

    path = pyunit_utils.locate("results")

    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(if_model, path=path, force=True)

    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    reloaded = h2o.load_model(model_path)

    assert isinstance(reloaded, H2OIsolationForestEstimator), "Expected an H2OIsolationForestEstimator, but got {0}"\
        .format(reloaded)
def download_model():
    prostate = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=10, max_depth=8,
                                                min_rows=10, learn_rate=0.2)
    prostate_gbm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"],
                       y="CAPSULE", training_frame=prostate)
    
    path = pyunit_utils.locate("results")

    downloaded_model_path = prostate_gbm.download_model(path=path)
    assert os.path.isfile(downloaded_model_path), \
        "Expected load file {0} to exist, but it does not.".format(downloaded_model_path)
    
    loaded_model = h2o.load_model(downloaded_model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator), \
        "Expected an H2OGradientBoostingEstimator, but got {0}".format(loaded_model)
def test_hadoop():
    '''
    Test H2O read and write to HDFS
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")
    print("Spliting data")
    train,test = h2o_data.split_frame(ratios=[0.1])
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    print("Training")
    h2o_glm.train(x=range(1, 10), y=0, training_frame=train) # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
  def __init__(self, model_name, model_base_path):
    """
    Initialize the service.
        
    Args:
      model_name: The name of the model.
      model_base_path: The file path of the model.
    Return:
      None
    """
    super(H2oInferenceService, self).__init__()

    # Start the h2o server
    if os.path.isfile("/tmp/h2o.jar"):
      logging.info("Run to run command 'java -jar /tmp/h2o.jar'")
      subprocess.Popen(["java", "-jar", "/tmp/h2o.jar"])

      logging.info("Sleep 10s to wait for h2o server")
      time.sleep(10)

    local_model_base_path = filesystem_util.download_hdfs_moels(
        model_base_path)

    self.model_name = model_name
    self.model_base_path = local_model_base_path
    self.model_version_list = [1]
    self.model_graph_signature = ""
    self.platform = "H2o"

    self.preprocess_function, self.postprocess_function = preprocess_util.get_preprocess_postprocess_function_from_model_path(
        self.model_base_path)

    import h2o

    logger.info("Try to initialize and connect the h2o server")
    h2o.init()

    logger.info("Try to load the h2o model")
    model = h2o.load_model(self.model_base_path)

    self.model = model
    # TODO: Update the signature with readable string
    self.model_graph_signature = "{}".format(self.model.full_parameters)
def milsong_checkpoint():

    milsong_train = h2o.upload_file(tests.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(tests.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(range(50,100),1)[0]
    max_depth1 = random.sample(range(2,6),1)[0]
    min_rows1 = random.sample(range(10,16),1)[0]
    print "ntrees model 1: {0}".format(ntrees1)
    print "max_depth model 1: {0}".format(max_depth1)
    print "min_rows model 1: {0}".format(min_rows1)
    model1 = h2o.random_forest(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees1,max_depth=max_depth1, min_rows=min_rows1,
                               validation_x=milsong_valid[1:],validation_y=milsong_valid[0],seed=1234)

    # save the model, then load the model
    path = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..","results"))

    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)

    assert os.path.isdir(model_path), "Expected load directory {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print "ntrees model 2: {0}".format(ntrees2)
    print "max_depth model 2: {0}".format(max_depth2)
    print "min_rows model 2: {0}".format(min_rows2)
    model2 = h2o.random_forest(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:],validation_y=milsong_valid[0],
                               checkpoint=restored_model._id,seed=1234)

    # build the equivalent of model 2 in one shot
    model3 = h2o.random_forest(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:],validation_y=milsong_valid[0],seed=1234)

    assert isinstance(model2, type(model3))
    assert model2.mse(valid=True)==model3.mse(valid=True), "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(model2.mse(valid=True), model3.mse(valid=True))
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200    # number of test dataset rows
    df = pyunit_utils.random_dataset("regression", seed=1234)       # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types)-1)]

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows==train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True) # save mojo predict

    h2o.save_model(glrmModel, TMPDIR)   # save GLRM model
    glrmModel2 = h2o.load_model(os.path.join(TMPDIR,MOJONAME))
    predict_model = glrmModel2.predict(test)
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
            predict_model[col] = predict_model[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    print("Comparing mojo predict and h2o predict from saved model...")
    pyunit_utils.compare_frames_local(pred_mojo, predict_model, 1, tol=1e-10)
    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False) # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_"+frameID)   # store the x Factor for new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
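
# save_GLRM_mojo() above is defined elsewhere in the test suite. A minimal
# sketch of what such a helper could look like (an assumption, not the original
# implementation):
def save_GLRM_mojo(model):
    mojo_name = pyunit_utils.getMojoName(model._id)
    tmpdir = os.path.normpath(os.path.join(
        os.path.dirname(os.path.realpath('__file__')), "..", "results", mojo_name))
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)               # make sure the MOJO target directory exists
    model.download_mojo(path=tmpdir)      # write the MOJO zip used by mojo_predict above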
def milsong_checkpoint():

    milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(list(range(50,100)),1)[0]
    max_depth1 = random.sample(list(range(2,6)),1)[0]
    min_rows1 = random.sample(list(range(10,16)),1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees1,max_depth=max_depth1, min_rows=min_rows1,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0])

    # save the model, then load the model
    path = pyunit_utils.locate("results")

    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)

    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0],
                     checkpoint=restored_model.model_id)

    # build the equivalent of model 2 in one shot
    model3 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                     distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0])
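
    # The source snippet ends here; by analogy with the random-forest variant
    # above, the closing checks would presumably be:
    assert isinstance(model2, type(model3))
    assert model2.mse(valid=True) == model3.mse(valid=True), \
        "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(
            model2.mse(valid=True), model3.mse(valid=True))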
    def test_saved_binary_model_produces_same_predictions_as_original():
        ds = prepare_data(blending)
        base_models = train_base_models(ds)
        se_model = train_stacked_ensemble(ds, base_models)
        
        # Predict with the ensemble in the Py client
        preds_py = se_model.predict(ds.test)
        
        tmp_dir = tempfile.mkdtemp()
        try:
            bin_file = h2o.save_model(se_model, tmp_dir)
            # Load the binary model and predict
            bin_model = h2o.load_model(pu.locate(bin_file))
            preds_bin = bin_model.predict(ds.test)
        finally:
            shutil.rmtree(tmp_dir)

        # Predictions from the in-memory model and the reloaded binary model should be the same
        pred_diff = preds_bin - preds_py
        assert pred_diff["p0"].max() < 1e-11
        assert pred_diff["p1"].max() < 1e-11
        assert pred_diff["p0"].min() > -1e-11
        assert pred_diff["p1"].min() > -1e-11
def milsong_checkpoint(ip,port):

    milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(range(50,100),1)[0]
    max_depth1 = random.sample(range(2,6),1)[0]
    min_rows1 = random.sample(range(10,16),1)[0]
    print "ntrees model 1: {0}".format(ntrees1)
    print "max_depth model 1: {0}".format(max_depth1)
    print "min_rows model 1: {0}".format(min_rows1)
    model1 = h2o.random_forest(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees1,max_depth=max_depth1, min_rows=min_rows1,
                               validation_x=milsong_valid[1:],validation_y=milsong_valid[0],seed=1234)

    # save the model, then load the model
    model_path = h2o.save_model(model1,force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print "ntrees model 2: {0}".format(ntrees2)
    print "max_depth model 2: {0}".format(max_depth2)
    print "min_rows model 2: {0}".format(min_rows2)
    model2 = h2o.random_forest(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:],validation_y=milsong_valid[0],
                               checkpoint=restored_model._id,seed=1234)

    # build the equivalent of model 2 in one shot
    model3 = h2o.random_forest(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:],validation_y=milsong_valid[0],seed=1234)

    assert isinstance(model2, type(model3))
    assert model2.mse(valid=True)==model3.mse(valid=True), "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(model2.mse(valid=True), model3.mse(valid=True))
Example 53
def h2oapi():
    """
    Python API test: h2o.load_model(path)
    """
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    Y = 3
    X = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10]

    model = H2OGeneralizedLinearEstimator(family="binomial", alpha=0, Lambda=1e-5)
    model.train(x=X, y=Y, training_frame=training_data)
    try:
        results_dir = pyunit_utils.locate("results")    # find directory path to results folder
        h2o.save_model(model, path=results_dir, force=True)       # save model
        full_path_filename = os.path.join(results_dir, model._id)
        assert os.path.isfile(full_path_filename), "h2o.save_model() command is not working."
        model_reloaded = h2o.load_model(full_path_filename)
        assert_is_type(model, H2OGeneralizedLinearEstimator)
        assert_is_type(model_reloaded, H2OGeneralizedLinearEstimator)

    except Exception as e:
        if 'File not found' in e.args[0]:
            print("Directory is not writable.  h2o.load_model() command is not tested.")
        else:
            assert False, "h2o.load_model() command is not working."
Example 54
features = list(range(0,784))
target = 784

train[target] = train[target].asfactor()
valid[target] = valid[target].asfactor()

# Build model
model = H2ODeepWaterEstimator(epochs=20, activation="Rectifier", hidden=[200,200], ignore_const_cols=False, mini_batch_size=256, input_dropout_ratio=0.1, hidden_dropout_ratios=[0.5,0.5], stopping_rounds=3, stopping_tolerance=0.05, stopping_metric="misclassification", score_interval=2, score_duty_cycle=0.5, score_training_samples=1000, score_validation_samples=1000, gpu=True, seed=1234)

model.train(x=features, y=target, training_frame=train, validation_frame=valid)

# Evaluate model
model.show()
print(model.scoring_history())

# Checkpoint model
model_path = h2o.save_model(model=model, force=True)

# Load model
model_ckpt = h2o.load_model(model_path)

# Start training from checkpoint
model_warm = H2ODeepWaterEstimator(checkpoint=model_ckpt.model_id, epochs=100, activation="Rectifier", hidden=[200,200], ignore_const_cols=False, mini_batch_size=256, input_dropout_ratio=0.1, hidden_dropout_ratios=[0.5,0.5], stopping_rounds=3, stopping_tolerance=0.05, stopping_metric="misclassification", score_interval=2, score_duty_cycle=0.5, score_training_samples=1000, score_validation_samples=1000, gpu=True, seed=1234)

model_warm.train(x=features, y=target, training_frame=train, validation_frame=valid)

# Evaluate checkpointed model
model_warm.show()
print(model_warm.scoring_history())
Example 55
def cars_checkpoint(ip,port):

    cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars.runif()
    train = cars[s > .2]
    valid = cars[s <= .2]

    # choose the type model-building exercise (multinomial classification or regression). 0:regression, 1:binomial,
    # 2:multinomial
    problem = random.sample(range(3),1)[0]

    # pick the predictors and response column
    predictors = ["displacement","power","weight","acceleration","year"]
    if problem == 1:
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"

    print("Response column: {0}".format(response_col))

    # build first model
    ntrees1 = random.sample(range(5,21),1)[0]
    max_depth1 = random.sample(range(2,6),1)[0]
    min_rows1 = random.sample(range(10,16),1)[0]
    print "ntrees model 1: {0}".format(ntrees1)
    print "max_depth model 1: {0}".format(max_depth1)
    print "min_rows model 1: {0}".format(min_rows1)
    model1 = h2o.random_forest(x=train[predictors],y=train[response_col],ntrees=ntrees1,max_depth=max_depth1,
                               min_rows=min_rows1, validation_x=valid[predictors],validation_y=valid[response_col],
                               seed=2345)

    # save the model, remove all keys, then load the model
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)

    # continue building the model with the same max_depth and min_rows
    ntrees2 = ntrees1 + random.sample(range(5,21),1)[0]
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print "ntrees model 2: {0}".format(ntrees2)
    print "max_depth model 2: {0}".format(max_depth2)
    print "min_rows model 2: {0}".format(min_rows2)
    model2 = h2o.random_forest(x=train[predictors],y=train[response_col],ntrees=ntrees2,max_depth=max_depth2,
                               min_rows=min_rows2,checkpoint=restored_model._id,validation_x=valid[predictors],
                               validation_y=valid[response_col],seed=2345)

    # continue building the model, but with different max_depth and min_rows (ensemble)
    ntrees3 = ntrees2
    max_depth3 = max_depth2 + random.sample(range(3,6),1)[0]
    min_rows3 = min_rows2 + random.sample(range(5,10),1)[0]
    print "ntrees model 3: {0}".format(ntrees3)
    print "max_depth model 3: {0}".format(max_depth3)
    print "min_rows model 3: {0}".format(min_rows3)
    model3 = h2o.random_forest(x=train[predictors],y=train[response_col],ntrees=ntrees3,max_depth=max_depth3,
                               min_rows=min_rows3,checkpoint=restored_model._id,validation_x=valid[predictors],
                               validation_y=valid[response_col],seed=2345)

    # build the equivalent of model 2 in one shot
    model4 = h2o.random_forest(x=train[predictors],y=train[response_col],ntrees=ntrees2,max_depth=max_depth2,
                               min_rows=min_rows2,validation_x=valid[predictors],
                               validation_y=valid[response_col],seed=2345)

    if problem == 0:
        assert isinstance(model2,type(model4))
        assert model2.mse(valid=True)==model4.mse(valid=True)
        assert model3.mse(valid=True)!=model4.mse(valid=True)
    elif problem == 1:
        assert isinstance(model2,type(model4))
        assert model2.auc(valid=True)==model4.auc(valid=True)
        assert model3.auc(valid=True)!=model4.auc(valid=True)
        assert model2.logloss(valid=True)==model4.logloss(valid=True)
        assert model3.logloss(valid=True)!=model4.logloss(valid=True)
        assert model2.giniCoef(valid=True)==model4.giniCoef(valid=True)
        assert model3.giniCoef(valid=True)!=model4.giniCoef(valid=True)
    else:
        assert isinstance(model2,type(model4))
        assert model2.mse(valid=True)==model4.mse(valid=True)
        assert model3.mse(valid=True)!=model4.mse(valid=True)
        assert model2.r2(valid=True)==model4.r2(valid=True)
        assert model3.r2(valid=True)!=model4.r2(valid=True)
Example 56
def evalmodel(df):
    glm_classifier = h2o.load_model('./model')
    result = h2o.as_list(glm_classifier.predict(df), use_pandas=False)
    result.pop(0)  # get rid of the column header
    result = [float(r[0]) for r in result]  # each value comes back as a 1-element list; unwrap it
    return result
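
# Hypothetical usage of evalmodel() (a sketch; assumes an H2O cluster is
# reachable, './model' holds a saved binomial GLM, and the CSV path is a
# placeholder):
import h2o
h2o.init()
frame = h2o.import_file("scoring_data.csv")
scores = evalmodel(frame)
print(scores[:5])  # first few predicted values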
Example 57
    def load(location):
        """Loads a persisted state of an instance of H2OPipeline
        from disk. This method will handle loading H2OEstimator models separately 
        and outside of the constraints of the pickle package. 

        Note that this is a static method and should be called accordingly:

            >>> def load_pipe():
            ...     return H2OPipeline.load('path/to/h2o/pipeline.pkl') # GOOD!
            >>>
            >>> pipe = load_pipe() # doctest: +SKIP

        Also note that since H2OPipeline can contain an H2OEstimator, its
        ``load`` functionality differs from that of its superclass, BaseH2OFunctionWrapper,
        and will not function properly if called at the highest level of abstraction:

            >>> def load_pipe():
            ...     return BaseH2OFunctionWrapper.load('path/to/h2o/pipeline.pkl') # BAD!
            >>>
            >>> pipe = load_pipe() # doctest: +SKIP

        Furthermore, trying to load a different type of BaseH2OFunctionWrapper from
        this method will raise a TypeError:

            >>> def load_pipe():
            ...     return H2OPipeline.load('path/to/some/other/transformer.pkl') # BAD!
            >>>
            >>> pipe = load_pipe() # doctest: +SKIP

        Parameters
        ----------

        location : str
            The location where the persisted H2OPipeline model resides.

        Returns
        -------

        model : H2OPipeline
            The unpickled instance of the H2OPipeline model
        """
        with open(location, 'rb') as f:
            model = pickle.load(f)

        if not isinstance(model, H2OPipeline):
            raise TypeError('expected H2OPipeline, got %s' % type(model))

        # if the pipe didn't end in an h2o estimator, we don't need to
        # do the following IO segment...
        ends_in_h2o = hasattr(model, 'model_loc_')
        if ends_in_h2o:
            # read the model portion, delete the model path
            ex = None
            the_h2o_model = None
            for pth in [model.model_loc_, 'hdfs://%s' % model.model_loc_]:
                try:
                    the_h2o_model = h2o.load_model(pth)
                except Exception as e:
                    if ex is None:
                        ex = e
                    else:
                        # only throws if fails twice
                        raise ex

                # stop at the first path that loads successfully
                if the_h2o_model is not None:
                    break

            model.steps[-1] = (model.est_name_, the_h2o_model)

        return model
Example 58
    def load(location):
        """Loads a persisted state of an instance of BaseH2OSearchCV
        from disk. This method will handle loading H2OEstimator models separately 
        and outside of the constraints of the pickle package. 

        Note that this is a static method and should be called accordingly:

            >>> def load_search():
            ...     return BaseH2OSearchCV.load('path/to/h2o/search.pkl') # GOOD!
            >>>
            >>> search = load_search() # doctest: +SKIP

        Also note that since BaseH2OSearchCV will contain an H2OEstimator, its
        ``load`` functionality differs from that of its superclass, BaseH2OFunctionWrapper,
        and will not function properly if called at the highest level of abstraction:

            >>> def load_search():
            ...     return BaseH2OFunctionWrapper.load('path/to/h2o/search.pkl') # BAD!
            >>>
            >>> search = load_search() # doctest: +SKIP

        Furthermore, trying to load a different type of BaseH2OFunctionWrapper from
        this method will raise a TypeError:

            >>> def load_search():
            ...     return BaseH2OSearchCV.load('path/to/some/other/transformer.pkl') # BAD!
            >>>
            >>> search = load_search() # doctest: +SKIP

        Parameters
        ----------

        location : str
            The location where the persisted BaseH2OSearchCV model resides.

        Returns
        -------

        model : BaseH2OSearchCV
            The unpickled instance of the BaseH2OSearchCV model
        """
        with open(location, 'rb') as f:
            model = pickle.load(f)

        if not isinstance(model, BaseH2OSearchCV):
            raise TypeError('expected BaseH2OSearchCV, got %s' % type(model))

        # read the model portion, delete the model path
        ex = None
        the_h2o_est = None
        for pth in [model.model_loc_, 'hdfs://%s' % model.model_loc_]:
            try:
                the_h2o_est = h2o.load_model(pth)
            except Exception as e:
                if ex is None:
                    ex = e
                else:
                    # only throws if fails twice
                    raise ex

            # break if successfully loaded
            if the_h2o_est is not None:
                break

        # if self.estimator is None, then it's simply the H2OEstimator,
        # otherwise it's going to be the H2OPipeline
        if model.best_estimator_ is None:
            model.best_estimator_ = the_h2o_est
            model.estimator = _new_base_estimator(model.est_type_, model.base_estimator_parms_)
        else:
            model.best_estimator_.steps[-1] = (model.est_name_, the_h2o_est)
            model.estimator.steps[-1] = (
                model.est_name_, _new_base_estimator(model.est_type_, model.base_estimator_parms_))

        return model
def cars_checkpoint():

    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars.runif()
    train = cars[s > .2]
    valid = cars[s <= .2]

    print("\n*** Description (chunk distribution, etc) of training frame:")
    train.describe()
    print("\n*** Description (chunk distribution, etc) of validation frame:")
    valid.describe()

    # choose the type model-building exercise (multinomial classification or regression). 0:regression, 1:binomial,
    # 2:multinomial
    problem = random.sample(list(range(3)),1)[0]

    # pick the predictors and response column, along with the correct distribution
    predictors = ["displacement","power","weight","acceleration","year"]
    if problem == 1:
        response_col = "economy_20mpg"
        distribution = "bernoulli"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        distribution = "multinomial"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    else:
        response_col = "economy"
        distribution = "gaussian"

    print("\n*** Distribution: {0}".format(distribution))
    print("\n*** Response column: {0}".format(response_col))

    # build first model
    ntrees1 = 5
    max_depth1 = random.sample(list(range(2,6)),1)[0]
    min_rows1 = random.sample(list(range(10,16)),1)[0]
    print("\n*** Building model 1 with the following parameters:")
    print("*** ntrees model 1: {0}".format(ntrees1))
    print("*** max_depth model 1: {0}".format(max_depth1))
    print("*** min_rows model 1: {0}".format(min_rows1))

    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    model1 = H2OGradientBoostingEstimator(ntrees=ntrees1,
                                          max_depth=max_depth1,
                                          min_rows=min_rows1,
                                          score_each_iteration=True,
                                          distribution=distribution)
    model1.train(x=predictors, y=response_col,training_frame=train,validation_frame=valid)

    # model1 = h2o.gbm(x=train[predictors],
    #                  y=train[response_col],
    #                  ntrees=ntrees1,
    #                  max_depth=max_depth1,
    #                  min_rows=min_rows1,
    #                  score_each_iteration=True,
    #                  distribution=distribution,
    #                  validation_x=valid[predictors],
    #                  validation_y=valid[response_col])

    # save the model, then load the model
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # continue building the model
    ntrees2 = ntrees1 + 5
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("\n*** Continuing to build model 1 (now called model 2) with the following parameters:")
    print("*** ntrees model 2: {0}".format(ntrees2))
    print("*** max_depth model 2: {0}".format(max_depth2))
    print("*** min_rows model 2: {0}".format(min_rows2))

    model2 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                          max_depth=max_depth2,
                                          min_rows=min_rows2,
                                          distribution=distribution,
                                          score_each_iteration=True,
                                          checkpoint=restored_model._id)
    model2.train(x=predictors,y=response_col,training_frame=train,validation_frame=valid)

    # model2 = h2o.gbm(x=train[predictors],
    #                  y=train[response_col],
    #                  ntrees=ntrees2,
    #                  max_depth=max_depth2,
    #                  min_rows=min_rows2,
    #                  distribution=distribution,
    #                  score_each_iteration=True,
    #                  validation_x=valid[predictors],
    #                  validation_y=valid[response_col],
    #                  checkpoint=restored_model._id)

    # continue building the model, but with different number of trees
    ntrees3 = ntrees2 + 50
    max_depth3 = max_depth1
    min_rows3 = min_rows1
    print("\n*** Continuing to build model 1 (now called model 3) with the following parameters:")
    print("*** ntrees model 3: {0}".format(ntrees3))
    print("*** max_depth model 3: {0}".format(max_depth3))
    print("*** min_rows model 3: {0}".format(min_rows3))

    model3 = H2OGradientBoostingEstimator(ntrees=ntrees3,
                                          max_depth=max_depth3,
                                          min_rows=min_rows3,
                                          distribution=distribution,
                                          score_each_iteration=True,
                                          checkpoint=restored_model._id)
    model3.train(x=predictors,y=response_col,training_frame=train,validation_frame=valid)

    # model3 = h2o.gbm(x=train[predictors],
    #                  y=train[response_col],
    #                  ntrees=ntrees3,
    #                  max_depth=max_depth3,
    #                  min_rows=min_rows3,
    #                  distribution=distribution,
    #                  score_each_iteration=True,
    #                  validation_x=valid[predictors],
    #                  validation_y=valid[response_col],
    #                  checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    print("\n*** Building the equivalent of model 2 (called model 4) in one shot:")

    model4 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                          max_depth=max_depth2,
                                          min_rows=min_rows2,
                                          distribution=distribution,
                                          score_each_iteration=True)
    model4.train(x=predictors,y=response_col,training_frame=train,validation_frame=valid)

    # model4 = h2o.gbm(x=train[predictors],
    #                  y=train[response_col],
    #                  ntrees=ntrees2,
    #                  max_depth=max_depth2,
    #                  min_rows=min_rows2,
    #                  distribution=distribution,
    #                  score_each_iteration=True,
    #                  validation_x=valid[predictors],
    #                  validation_y=valid[response_col])

    print("\n*** Model Summary for model 2:")
    print(model2.summary())
    print("\n*** Model Summary for model 3:")
    print(model3.summary())
    print("\n*** Model Summary for model 4:")
    print(model4.summary())

    print("\n*** Score History for model 2:")
    print(model2.scoring_history())
    print("\n*** Score History for model 3:")
    print(model3.scoring_history())
    print("\n*** Score History for model 4:")
    print(model4.scoring_history())

    # checks
    if problem == 0:
        assert isinstance(model2,type(model4))
        assert model2.mse(valid=True)==model4.mse(valid=True), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(model2.mse(valid=True), model4.mse(valid=True))
        #assert model3.mse(valid=True)!=model4.mse(valid=True), "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(model3.mse(valid=True), model4.mse(valid=True))

    elif problem == 1:
        assert isinstance(model2,type(model4))
        assert model2.auc(valid=True)==model4.auc(valid=True), "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(model2.auc(valid=True), model4.auc(valid=True))
        #assert model3.auc(valid=True)!=model4.auc(valid=True), "Expected Model 3 AUC: {0} to be different from Model 4 AUC: {1}".format(model3.auc(valid=True), model4.auc(valid=True))

        assert model2.logloss(valid=True)==model4.logloss(valid=True), "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(model2.logloss(valid=True), model4.logloss(valid=True))
        #assert model3.logloss(valid=True)!=model4.logloss(valid=True), "Expected Model 3 Log Loss: {0} to be different from Model 4 Log Loss: {1}".format(model2.logloss(valid=True), model4.logloss(valid=True))

        assert model2.giniCoef(valid=True)==model4.giniCoef(valid=True), "Expected Model 2 Gini Coef: {0} to be the same as Model 4 Gini Coef: {1}".format(model2.giniCoef(valid=True), model4.giniCoef(valid=True))
        #assert model3.giniCoef(valid=True)!=model4.giniCoef(valid=True), "Expected Model 3 Gini Coef: {0} to be different from Model 4 Gini Coef: {1}".format(model2.giniCoef(valid=True), model4.giniCoef(valid=True))

    else:
        assert isinstance(model2,type(model4))
        assert model2.mse(valid=True)==model4.mse(valid=True), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(model2.mse(valid=True), model4.mse(valid=True))
        #assert model3.mse(valid=True)!=model4.mse(valid=True), "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(model3.mse(valid=True), model4.mse(valid=True))

        assert model2.r2(valid=True)==model4.r2(valid=True), "Expected Model 2 R2: {0} to be the same as Model 4 R2: {1}".format(model2.r2(valid=True), model4.r2(valid=True))
Example 60
import h2o
h2o.init()

md = h2o.load_model("1-train+model/test_airline_GBM_100k")

dx_test = h2o.import_file("https://s3.amazonaws.com/benchm-ml--main/test.csv")
dx_test1 = dx_test[0,0:8]
dx_test1.as_data_frame()


%time md.predict(dx_test1).as_data_frame()    ## h2o frame
## Wall time: 120 ms


dpy_test1 = dx_test1.as_data_frame()   ## py object (pandas)

%time md.predict(h2o.H2OFrame.from_python(dpy_test1, column_names = ["Month","DayofMonth","DayOfWeek","DepTime","UniqueCarrier","Origin","Dest","Distance"])).as_data_frame()
## Wall time: 255 ms


# %time dx_test1_2 = h2o.H2OFrame.from_python(dpy_test1, column_names = ["Month","DayofMonth","DayOfWeek","DepTime","UniqueCarrier","Origin","Dest","Distance"])
# ## Wall time: 134 ms
# dx_test1_2.as_data_frame()
# %time md.predict(dx_test1_2).as_data_frame()
# ## Wall time: 121 ms
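
# The same measurement outside IPython (a sketch; wall times vary by machine):
import time
t0 = time.time()
md.predict(dx_test1).as_data_frame()
print("predict on H2OFrame: %.0f ms" % ((time.time() - t0) * 1000))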