Example 1
def mojo_predict_pandas_test(sandbox_dir):
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    input_csv = "%s/in.csv" % sandbox_dir
    pdf = data[1, 2:]
    h2o.export_file(pdf, input_csv)

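    # convert the response (column 1, CAPSULE) to a factor so the GBM trains as a binomial classifier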
    data[1] = data[1].asfactor()
    model = H2OGradientBoostingEstimator(distribution="bernoulli")
    model.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=data)

    h2o_prediction = model.predict(pdf)

    # download mojo
    model_zip_path = os.path.join(sandbox_dir, 'model.zip')
    genmodel_path = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(model, model_zip_path)
    assert os.path.isfile(model_zip_path)
    assert os.path.isfile(genmodel_path)

    pandas_frame = pandas.read_csv(input_csv)
    mojo_prediction = h2o.mojo_predict_pandas(dataframe=pandas_frame, mojo_zip_path=model_zip_path, genmodel_jar_path=genmodel_path)
    print("Binomial Prediction (Binary) - p0: %f" % h2o_prediction[0,1])
    print("Binomial Prediction (Binary) - p1: %f" % h2o_prediction[0,2])
    print("Binomial Prediction (MOJO) - p0: %f" % mojo_prediction['0'].iloc[0])
    print("Binomial Prediction (MOJO) - p1: %f" % mojo_prediction['1'].iloc[0])
    assert h2o_prediction[0,1] == mojo_prediction['0'].iloc[0], "expected predictions to be the same for binary and MOJO model - p0"
    assert h2o_prediction[0,2] == mojo_prediction['1'].iloc[0], "expected predictions to be the same for binary and MOJO model - p1"
Example 2
def mojo_predict_pandas_test(sandbox_dir):
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    input_csv = "%s/in.csv" % sandbox_dir
    pdf = data[1, 2:]
    h2o.export_file(pdf, input_csv)

    data[1] = data[1].asfactor()
    model = H2OGradientBoostingEstimator(distribution="bernoulli")
    model.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=data)

    h2o_prediction = model.predict(pdf)

    # download mojo
    model_zip_path = os.path.join(sandbox_dir, 'model.zip')
    genmodel_path = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(model, model_zip_path)
    assert os.path.isfile(model_zip_path)
    assert os.path.isfile(genmodel_path)

    pandas_frame = pandas.read_csv(input_csv)
    mojo_prediction = h2o_utils.mojo_predict_pandas(dataframe=pandas_frame, mojo_zip_path=model_zip_path, genmodel_jar_path=genmodel_path)
    print("Binomial Prediction (Binary) - p0: %f" % h2o_prediction[0,1])
    print("Binomial Prediction (Binary) - p1: %f" % h2o_prediction[0,2])
    print("Binomial Prediction (MOJO) - p0: %f" % mojo_prediction['0'].iloc[0])
    print("Binomial Prediction (MOJO) - p1: %f" % mojo_prediction['1'].iloc[0])
    assert h2o_prediction[0,1] == mojo_prediction['0'].iloc[0], "expected predictions to be the same for binary and MOJO model - p0"
    assert h2o_prediction[0,2] == mojo_prediction['1'].iloc[0], "expected predictions to be the same for binary and MOJO model - p1"
Example 3
def test_hdfs_io():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/airlines/airlines_all.05p.csv")

    print("Spliting data")
    for c in ["Month","DayofMonth","IsArrDelayed"]:
        h2o_data[c] = h2o_data[c].asfactor()
    myX = ["Month","DayofMonth","Distance"]
    train,test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:,["Year","DayOfWeek"]], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert((test[:,"DayOfWeek"] - new_test[:,"DayOfWeek"]).sum() == 0)

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=myX, y="IsArrDelayed", training_frame=train)  # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
    print("Running predictions")
    preds = new_model.predict(test)
Example 4
def export_file():
    pros_hex = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    pros_hex[1] = pros_hex[1].asfactor()
    pros_hex[3] = pros_hex[3].asfactor()
    pros_hex[4] = pros_hex[4].asfactor()
    pros_hex[5] = pros_hex[5].asfactor()
    pros_hex[8] = pros_hex[8].asfactor()

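    # runif assigns each row a uniform random number, used here for a rough 80/20 train/test split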
    p_sid = pros_hex.runif()
    pros_train = pros_hex[p_sid > 0.2, :]
    pros_test = pros_hex[p_sid <= 0.2, :]

    glm = H2OGeneralizedLinearEstimator(family="binomial")
    glm.train(x=list(range(2, pros_hex.ncol)), y=1, training_frame=pros_train)
    mypred = glm.predict(pros_test)

    def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
        return "".join(random.choice(chars) for _ in range(size))

    fname = id_generator() + "_prediction.csv"

    path = pyunit_utils.locate("results")
    dname = path + "/" + fname

    h2o.export_file(mypred, dname)

    py_pred = pd.read_csv(dname)
    print(py_pred.head())
    h_pred = mypred.as_data_frame(True)
    print(h_pred.head())

    # Test to check if py_pred & h_pred are identical
    assert_frame_equal(py_pred, h_pred)
Example 5
def h2oexport_file():
    """
    Python API test: h2o.export_file(frame, path, force=False, parts=1).  Note that force=True is only honored if
    parts=1.  Otherwise, an error will be thrown.
    """
    training_data = h2o.import_file(
        pyunit_utils.locate("smalldata/logreg/benign.csv"))
    try:
        results_dir = pyunit_utils.locate(
            "results")  # find directory path to results folder
        final_path = os.path.join(results_dir, 'frameData')
        h2o.export_file(training_data, final_path, force=True,
                        parts=1)  # save data
        assert os.path.isfile(
            final_path), "h2o.export_file() command is not working."
        final_dir_path = os.path.join(results_dir, 'multiFrame')
        h2o.export_file(training_data, final_dir_path, force=True, parts=-1)
        assert len(os.listdir(
            final_dir_path)) > 0, "h2o.export_file() command is not working."
    except Exception as e:
        if e.__class__.__name__ == 'ValueError' and 'File not found' in e.args[
                0]:
            print(
                "Directory is not writable.  h2o.export_file() command is not tested."
            )
        else:
            assert e.__class__.__name__=='H2OResponseError' and \
                   'exportFrame: Cannot use path' in e.args[0]._props['dev_msg'], \
                "h2o.export_file() command is not working."
            print(
                "Directory: {0} is not empty.  Delete or empy it before re-run.  h2o.export_file() "
                "is not tested with multi-part export.".format(final_dir_path))
def export_file_multipart():
    pros_hex = h2o.upload_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    pros_hex[1] = pros_hex[1].asfactor()
    pros_hex[3] = pros_hex[3].asfactor()
    pros_hex[4] = pros_hex[4].asfactor()
    pros_hex[5] = pros_hex[5].asfactor()
    pros_hex[8] = pros_hex[8].asfactor()

    p_sid = pros_hex.runif()
    pros_train = pros_hex[p_sid > .2, :]
    pros_test = pros_hex[p_sid <= .2, :]

    glm = H2OGeneralizedLinearEstimator(family="binomial")
    glm.train(x=list(range(2, pros_hex.ncol)), y=1, training_frame=pros_train)
    mypred = glm.predict(pros_test)

    def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
        return ''.join(random.choice(chars) for _ in range(size))

    path = pyunit_utils.locate("results")
    dname = os.path.join(path, id_generator() + "_prediction")

    h2o.export_file(mypred, dname, parts=-1)

    assert os.path.isdir(dname)

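    # a multi-part export (parts=-1) writes Hadoop-style part files into the target directory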
    part_files = glob.glob(os.path.join(dname, "part-m-?????"))
    print(part_files)
    py_pred = pd.concat((pd.read_csv(f) for f in part_files))
    print(py_pred.head())
    h_pred = mypred.as_data_frame(True)
    print(h_pred.head())

    #Test to check if py_pred & h_pred are identical
    assert_frame_equal(py_pred,h_pred)
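Example 7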
def s3_import_export():
    local_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    for scheme in ["s3a"]:  # s3n is deprecated since HDP3/CDH6
        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S.%f")
        unique_suffix = str(uuid.uuid4())
        s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                  timestamp + "." + unique_suffix + ".csv.zip"
        h2o.export_file(local_frame, s3_path)

        s3 = boto3.resource('s3')
        client = boto3.client('s3')
        # S3 might have a delay in indexing the file (usually milliseconds or hundreds of milliseconds)
        # Wait for the file to be available; if not available in the beginning, retry every 2 seconds, up to 10 times
        client.get_waiter('object_exists').wait(Bucket='test.0xdata.com',
                                                Key="h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                                                    timestamp + "." + unique_suffix + ".csv.zip",
                                                WaiterConfig={
                                                    'Delay': 2,
                                                    'MaxAttempts': 10
                                                })
        s3_frame = h2o.import_file(s3_path)
        assert_frame_equal(local_frame.as_data_frame(),
                           s3_frame.as_data_frame())

        s3.Object(bucket_name='test.0xdata.com', key="h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                                                     timestamp + "." + unique_suffix + ".csv.zip").delete()
Example 8
def local_and_hdfs_frame_equality():
    local_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    hdfs_path = 'hdfs:///user/jenkins/tests/prostate_export'
    h2o.export_file(local_frame, hdfs_path, force=True)
    hdfs_frame = h2o.import_file(hdfs_path)
    assert_frame_equal(local_frame.as_data_frame(), hdfs_frame.as_data_frame())
Example 9
    def save_all_frames(self, path, overwrite=False):
        """Save all models to a directory.

        :param path: String path, where to save your models.
        :param overwrite: boolean, overwrite the frame
        """
        models = []
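        # walk every key in the H2O cluster, skipping model-metrics entries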
        for f in h2o.ls()['key']:
            if 'modelmetrics' not in f:
                try:
                    fh = h2o.get_frame(f)
                except (h2o.exceptions.H2OResponseError,
                        h2o.exceptions.H2OServerError):
                    pass
                else:
                    try:  # quick and dirty solution for NoneType
                        fh.frame_id
                    except:
                        pass
                    else:
                        print(fh.frame_id)
                        print("Save frame " + fh.frame_id + " to " + path +
                              "/" + fh.frame_id)
                        h2o.export_file(fh,
                                        path=path + os.sep + fh.frame_id,
                                        force=overwrite)
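Example 10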
def mojo_predict_pandas_test(sandbox_dir):
    data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/coxph_test/heart.csv"))

    input_csv = "%s/in.csv" % sandbox_dir
    output_csv = "%s/prediction.csv" % sandbox_dir
    h2o.export_file(data, input_csv)

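    # stratification columns must be categorical, so convert 'transplant' to a factor first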
    data['transplant'] = data['transplant'].asfactor()
    model = H2OCoxProportionalHazardsEstimator(stratify_by=["transplant"],
                                               start_column="start",
                                               stop_column="stop")
    model.train(x=["age", "surgery", "transplant"],
                y="event",
                training_frame=data)

    h2o_prediction = model.predict(data)

    # download mojo
    model_zip_path = os.path.join(sandbox_dir, 'model.zip')
    genmodel_path = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(model, model_zip_path)
    assert os.path.isfile(model_zip_path)
    assert os.path.isfile(genmodel_path)

    pandas_frame = pandas.read_csv(input_csv)
    mojo_prediction = h2o.mojo_predict_pandas(dataframe=pandas_frame,
                                              mojo_zip_path=model_zip_path,
                                              genmodel_jar_path=genmodel_path)

    assert len(mojo_prediction) == h2o_prediction.nrow
    assert_frame_equal(h2o_prediction.as_data_frame(use_pandas=True),
                       mojo_prediction,
                       check_dtype=False)
Example 11
def test_hadoop():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")

    print("Spliting data")
    train,test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:,0:2], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert((test[:,1] - new_test[:,1]).sum() == 0)

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=list(range(1, 10)), y=0, training_frame=train)  # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
    print("Running predictions")
    preds = new_model.predict(test)
Example 12
def s3_import_export():
    local_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    for scheme in ["s3n", "s3a"]:
        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S")
        s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + timestamp + ".csv.zip"
        h2o.export_file(local_frame, s3_path)
        s3_frame = h2o.import_file(s3_path)
        assert_frame_equal(local_frame.as_data_frame(), s3_frame.as_data_frame())
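Example 13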
def test_stacked_ensemble_is_able_to_use_imported_base_models():
    import tempfile, shutil, glob
    train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"))
    test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"))
    x = train.columns
    y = "species"
    x.remove(y)

    nfolds = 2
    gbm = H2OGradientBoostingEstimator(nfolds=nfolds,
                                       fold_assignment="Modulo",
                                       keep_cross_validation_predictions=True)
    gbm.train(x=x, y=y, training_frame=train)
    drf = H2ORandomForestEstimator(nfolds=nfolds,
                                   fold_assignment="Modulo",
                                   keep_cross_validation_predictions=True)
    drf.train(x=x, y=y, training_frame=train)

    se = H2OStackedEnsembleEstimator(training_frame=train,
                                     validation_frame=test,
                                     base_models=[gbm.model_id, drf.model_id])
    se.train(x=x, y=y, training_frame=train)

    assert len(se.base_models) == 2

    TMP_DIR = tempfile.mkdtemp()
    try:
        h2o.save_model(gbm, TMP_DIR + "/gbm.model")
        h2o.save_model(drf, TMP_DIR + "/drf.model")

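        # the ensemble looks up base-model CV predictions by frame id, so remember the ids
        # and re-import the exported holdout predictions under those same ids later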
        gbm_holdout_id = gbm.cross_validation_holdout_predictions().frame_id
        drf_holdout_id = drf.cross_validation_holdout_predictions().frame_id
        h2o.export_file(gbm.cross_validation_holdout_predictions(), TMP_DIR + "/gbm.holdout")
        h2o.export_file(drf.cross_validation_holdout_predictions(), TMP_DIR + "/drf.holdout")

        h2o.remove_all()

        h2o.import_file(TMP_DIR + "/gbm.holdout", gbm_holdout_id)
        h2o.import_file(TMP_DIR + "/drf.holdout", drf_holdout_id)

        gbm = h2o.upload_model(glob.glob(TMP_DIR + "/gbm.model/*")[0])
        drf = h2o.upload_model(glob.glob(TMP_DIR + "/drf.model/*")[0])

        train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"), "some_other_name_of_training_frame")
        test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"), "some_other_name_of_test_frame")
        x = train.columns
        y = "species"
        x.remove(y)

        se_loaded = H2OStackedEnsembleEstimator(training_frame=train,
                                                validation_frame=test,
                                                base_models=[gbm.model_id, drf.model_id])
        se_loaded.train(x=x, y=y, training_frame=train)

        assert len(se_loaded.base_models) == 2
    finally:
        shutil.rmtree(TMP_DIR)
Example 14
def s3_import_export(scheme):
    local_frame = h2o.import_file("/home/0xdiag/smalldata/logreg/prostate.csv")
    timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S")
    unique_suffix = str(uuid.uuid4())
    s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
              timestamp + "." + unique_suffix + ".csv.zip"
    h2o.export_file(local_frame, s3_path)
    s3_frame = h2o.import_file(s3_path)
    assert_frame_equal(local_frame.as_data_frame(), s3_frame.as_data_frame())
Example 15
    def s3_import_export(self, scheme):
        local_frame = h2o.import_file("/home/0xdiag/smalldata/logreg/prostate.csv")
        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S")
        unique_suffix = str(uuid.uuid4())
        s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                  timestamp + "." + unique_suffix + ".csv.zip"
        h2o.export_file(local_frame, s3_path)
        s3_frame = h2o.import_file(s3_path)
        assert_frame_equal(local_frame.as_data_frame(), s3_frame.as_data_frame())
Example 16
    def predict(self):

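        # rank all rows by predicted probability p1 and keep the msisdn values of the top 1,000,000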
        pr_big = self.model2.predict(self.data)
        pr_big = pr_big.sort('p1', ascending=False)
        df_short_ = pr_big[:1000000, :]
        df_short_ = df_short_['msisdn']
        h2o.export_file(
            df_short_,
            'hdfs://T2-HDFS-HA-PROD/user/andrey.lukyanenko/exported.csv')
Example 17
def mojo_predict_api_test(sandbox_dir):
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    input_csv = "%s/in.csv" % sandbox_dir
    output_csv = "%s/prediction.csv" % sandbox_dir
    h2o.export_file(data[1, 2:], input_csv)

    data[1] = data[1].asfactor()
    model = H2OGradientBoostingEstimator(distribution="bernoulli")
    model.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=data)

    # download mojo
    model_zip_path = os.path.join(sandbox_dir, 'model.zip')
    genmodel_path = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(model, model_zip_path)
    assert os.path.isfile(model_zip_path)
    assert os.path.isfile(genmodel_path)

    # test that we can predict using default paths
    h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path, verbose=True)
    h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path, genmodel_jar_path=genmodel_path,
                         verbose=True)
    assert os.path.isfile(output_csv)
    os.remove(model_zip_path)
    os.remove(genmodel_path)
    os.remove(output_csv)

    # test that we can predict using custom genmodel path
    other_sandbox_dir = tempfile.mkdtemp()
    try:
        genmodel_path = os.path.join(other_sandbox_dir, 'h2o-genmodel-custom.jar')
        download_mojo(model, model_zip_path, genmodel_path)
        assert os.path.isfile(model_zip_path)
        assert os.path.isfile(genmodel_path)
        try:
            h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path, verbose=True)
            assert False, "There should be no h2o-genmodel.jar at %s" % sandbox_dir
        except RuntimeError:
            pass
        assert not os.path.isfile(output_csv)
        h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path,
                             genmodel_jar_path=genmodel_path, verbose=True)
        assert os.path.isfile(output_csv)
        os.remove(output_csv)

        output_csv = "%s/out.prediction" % other_sandbox_dir

        # test that we can predict using a custom output path
        h2o.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path,
                             genmodel_jar_path=genmodel_path, verbose=True, output_csv_path=output_csv)
        assert os.path.isfile(output_csv)
        os.remove(model_zip_path)
        os.remove(genmodel_path)
        os.remove(output_csv)
    finally:
        shutil.rmtree(other_sandbox_dir)
Example 18
def mojo_predict_api_test(sandbox_dir):
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

    input_csv = "%s/in.csv" % sandbox_dir
    output_csv = "%s/prediction.csv" % sandbox_dir
    h2o.export_file(data[1, 2:], input_csv)

    data[1] = data[1].asfactor()
    model = H2OGradientBoostingEstimator(distribution="bernoulli")
    model.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=data)

    # download mojo
    model_zip_path = os.path.join(sandbox_dir, 'model.zip')
    genmodel_path = os.path.join(sandbox_dir, 'h2o-genmodel.jar')
    download_mojo(model, model_zip_path)
    assert os.path.isfile(model_zip_path)
    assert os.path.isfile(genmodel_path)

    # test that we can predict using default paths
    h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path, verbose=True)
    h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path, genmodel_jar_path=genmodel_path,
                               verbose=True)
    assert os.path.isfile(output_csv)
    os.remove(model_zip_path)
    os.remove(genmodel_path)
    os.remove(output_csv)

    # test that we can predict using custom genmodel path
    other_sandbox_dir = tempfile.mkdtemp()
    try:
        genmodel_path = os.path.join(other_sandbox_dir, 'h2o-genmodel-custom.jar')
        download_mojo(model, model_zip_path, genmodel_path)
        assert os.path.isfile(model_zip_path)
        assert os.path.isfile(genmodel_path)
        try:
            h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path, verbose=True)
            assert False, "There should be no h2o-genmodel.jar at %s" % sandbox_dir
        except RuntimeError:
            pass
        assert not os.path.isfile(output_csv)
        h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path,
                                   genmodel_jar_path=genmodel_path, verbose=True)
        assert os.path.isfile(output_csv)
        os.remove(output_csv)

        output_csv = "%s/out.prediction" % other_sandbox_dir

        # test that we can predict using a custom output path
        h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=model_zip_path,
                                   genmodel_jar_path=genmodel_path, verbose=True, output_csv_path=output_csv)
        assert os.path.isfile(output_csv)
        os.remove(model_zip_path)
        os.remove(genmodel_path)
        os.remove(output_csv)
    finally:
        shutil.rmtree(other_sandbox_dir)
Example 19
def s3_import_export():
    local_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    for scheme in ["s3n", "s3a"]:
        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S")
        unique_suffix = str(uuid.uuid4())
        s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                  timestamp + "." + unique_suffix + ".csv.zip"
        h2o.export_file(local_frame, s3_path)
        s3_frame = h2o.import_file(s3_path)
        assert_frame_equal(local_frame.as_data_frame(), s3_frame.as_data_frame())
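Example 20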
def impute_data(method="mean",
                to_impute=to_impute,
                predictors=predictors):
  if method == "mean":
    print "Mean imputing missing data for predictors:", to_impute
    # find mean for each time period in data for each predictor, save them in a matrix with a col for the mean values of each predictor
    # then on holdout use this table to fill in all missing values based on the time period (row) and the variable (col) of this matrix
    
    #if using python module h2o-3.1.0.3131: grouped = data.group_by(["time_period"])
    #                         gm = [grouped.mean(predictor, na="rm").get_frame() for predictor in to_impute]
    gm = d["time_period"].unique()
    print "Finding means..."
    for predictor in to_impute:
      gm = gm.cbind(d.group_by(["time_period"], {predictor:["mean", d.names().index(predictor), "rm"]}, order_by = 0))
    gm.show()
    print "Saving the imputation means to disk..."
    h2o.download_csv(gm, filename = saving_means_fp)
    # df_py = h2o.as_list(gm)
    # Now that's stored for the holdout data, do this a faster way in java for the training data:
    for predictor in to_impute:
      d.impute(predictor, method='mean', by=['time_period'], inplace=True)
      print("Done imputing", predictor)
    print "Saving the final mean imputed data to disk..."
    h2o.export_file(frame = d, path =saving_meanImputed_fp, force=True)
  
  if method == "model":
    # sequentially impute 'newdata', not 'data', so the order of the predictor variables in the loop does not matter
    # otherwise, you would be using increasingly imputed data to make predictions as the loop progresses.
    newdata = d
    # With training data, build a model for each col and predict missing data, save the models, use them on the holdout data to predict all missing data.
    for predictor in to_impute:
      print "Building model for imputing " + predictor
      print "Subsetting the data into missing values for predictor and no missing values for predictor"
      na_ind = d[predictor].isna()
      not_na_ind = na_ind != 1.0
      to_train = d[not_na_ind]
      to_predict = d[na_ind]
      these_var = [var for var in predictors if var != predictor]
      trained = h2o.gbm(x=to_train[these_var],
                        y=to_train[[predictor]],
                        ntrees=300,
                        max_depth=6,
                        learn_rate=0.2)
      print "Saving the imputation tree model for " + predictor
      h2o.save_model(trained, dir = saving_models_fp, name = "dl_imputation_model_" + predictor)
      print "Imputing the missing " +  predictor + " data by predicting with the model..."
      predicted = trained.predict(to_predict[these_var])
      tofillin = newdata[predictor]
      assert len(predicted) == len(tofillin[na_ind])
      tofillin[na_ind] = predicted # mutate the column in place
      newdata[predictor] = tofillin
    
    print "Saving the final model-imputed data to disk..."
    h2o.export_file(frame = d, path =saving_modelImputed_fp, force=True)
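Example 21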
def s3_import_export():
    local_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    for scheme in ["s3a"]:  # s3n is deprecated since HDP3/CDH6
        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S")
        unique_suffix = str(uuid.uuid4())
        s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                  timestamp + "." + unique_suffix + ".csv.zip"
        h2o.export_file(local_frame, s3_path)
        s3_frame = h2o.import_file(s3_path)
        assert_frame_equal(local_frame.as_data_frame(),
                           s3_frame.as_data_frame())
Example 22
def main():
    os.environ['NO_PROXY'] = 'localhost'
    # Start H2O on your local machine
    h2o.init()
    recall = 10

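    # rebuild the model repeatedly, exporting the test frame whenever recall improves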
    for i in range(10):
        new_recall, test_frame = model_build()
        if new_recall > recall:
            recall = new_recall
            h2o.export_file(test_frame, "/home/wso2123/My Work/Datasets/Breast cancer wisconsin/test.csv", force=True)

    print(recall)
Example 23
    def dump_results(self):
        model_ids = list(
            self.aml.leaderboard['model_id'].as_data_frame().iloc[:, 0])

        for m_id in model_ids:
            mdl = h2o.get_model(m_id)
            h2o.save_model(model=mdl, path=self.logdir, force=True)

        h2o.export_file(
            self.aml.leaderboard,
            osp.join(self.logdir, 'aml_leaderboard.h2o'),
            force=True,
        )
Example 24
    def test_export_orc_hdfs(self):
        fr = h2o.import_file(path="hdfs://127.0.0.1/user/jenkins/prostate_NA.orc", header=1)
        export_path = "hdfs://127.0.0.1/user/jenkins/prostate_NA_export.orc"
        failure = False
        try:
            h2o.export_file(frame=fr, path=export_path, force=True)
        except Exception:
            failure = True
        assert not failure

        imported = h2o.import_file(path=export_path, header=1)
        assert imported.ncol == fr.ncol
        assert imported.nrow == fr.nrow
Example 25
    def test_export_parquet_hdfs(self):
        fr = h2o.import_file(path="hdfs://127.0.0.1/user/jenkins/airlines-simple.snappy.parquet", header=1)
        export_path = "hdfs://127.0.0.1/user/jenkins/airlines-simple.snappy_export.parquet"
        failure = False
        try:
            h2o.export_file(frame=fr, path=export_path, force=True)
        except Exception:
            failure = True
        assert not failure

        imported = h2o.import_file(path=export_path, header=1)
        assert imported.ncol == fr.ncol
        assert imported.nrow == fr.nrow
Example 26
    def export_data_to_csv(cls, h2o_frame, export_path=""):
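        # try the server-side export first and fall back to a client-side pandas export on failure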
        try:
            h2o.export_file(h2o_frame, export_path)
            return export_path
        except H2OError as e:
            print("Error: {}".format(e))
            print("Trying to fall back to pandas")

        try:
            df = h2o_frame.as_data_frame(use_pandas=True, header=True)
            df.to_csv(export_path, header=True)
            return export_path
        except Exception as e:
            raise Exception("Error: {}, could not export data".format(e))
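Example 27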
def mojo_predict_pandas_test(sandbox_dir, stratify_by=None):
    if not os.path.exists(sandbox_dir):
        os.makedirs(sandbox_dir)

    # bunch of random columns to be added to the dataset
    random_cols = ["c1", "c2", "c3", "c4"]

    data = h2o.import_file(
        pyunit_utils.locate("smalldata/coxph_test/heart.csv"))
    if stratify_by:
        for strat_col in stratify_by:
            data[strat_col] = data[strat_col].asfactor()
    data['surgery'] = data['surgery'].asfactor()

    data_random_local = pandas.DataFrame(
        np.random.random(size=(data.nrow, len(random_cols))),
        columns=random_cols)
    data = data.cbind(h2o.H2OFrame(data_random_local))

    model = H2OCoxProportionalHazardsEstimator(stratify_by=stratify_by,
                                               start_column="start",
                                               stop_column="stop",
                                               interaction_pairs=[
                                                   ("age", "c1"), ("c1", "c2"),
                                                   ("c3", "age")
                                               ])
    model.train(x=["age", "surgery", "transplant"] + random_cols,
                y="event",
                training_frame=data)
    print(model)

    # reference predictions
    h2o_prediction = model.predict(data)

    assert pyunit_utils.test_java_scoring(model, data, h2o_prediction, 1e-8)

    # download mojo
    mojo = pyunit_utils.download_mojo(model)

    # export new file (including the random columns)
    input_csv = "%s/in.csv" % sandbox_dir
    h2o.export_file(data, input_csv)
    pandas_frame = pandas.read_csv(input_csv)

    mojo_prediction = h2o.mojo_predict_pandas(dataframe=pandas_frame, **mojo)

    assert len(mojo_prediction) == h2o_prediction.nrow
    assert_frame_equal(h2o_prediction.as_data_frame(use_pandas=True),
                       mojo_prediction,
                       check_dtype=False)
Example 28
    def test_export_orc_hdfs(self):
        fr = h2o.import_file(
            path="hdfs://127.0.0.1/user/jenkins/prostate_NA.orc", header=1)
        export_path = "hdfs://127.0.0.1/user/jenkins/prostate_NA_export.orc"
        failure = False
        try:
            h2o.export_file(frame=fr, path=export_path, force=True)
        except Exception:
            failure = True
        assert not failure

        imported = h2o.import_file(path=export_path, header=1)
        assert imported.ncol == fr.ncol
        assert imported.nrow == fr.nrow
Example 29
def test_export_xls_hdfs():
    fr = h2o.import_file(path="hdfs://127.0.0.1/user/jenkins/iris.xls",
                         header=1)
    export_path = "hdfs://127.0.0.1/user/jenkins/iris_export.xls"
    failure = False
    try:
        h2o.export_file(frame=fr, path=export_path, force=True)
    except Exception:
        failure = True
    assert not failure

    imported = h2o.import_file(path=export_path, header=1)
    assert imported.ncol == fr.ncol
    assert imported.nrow == fr.nrow
Example 30
    def test_export_parquet_hdfs(self):
        fr = h2o.import_file(
            path="hdfs://127.0.0.1/user/jenkins/airlines-simple.snappy.parquet",
            header=1)
        export_path = "hdfs://127.0.0.1/user/jenkins/airlines-simple.snappy_export.parquet"
        failure = False
        try:
            h2o.export_file(frame=fr, path=export_path, force=True)
        except Exception:
            failure = True
        assert not failure

        imported = h2o.import_file(path=export_path, header=1)
        assert imported.ncol == fr.ncol
        assert imported.nrow == fr.nrow
Example 31
def train(data_path, max_models, model_name):
    train_data, test_data, train_cols = prepare_data(data_path)
    target_col = "quality"

    with mlflow.start_run() as run:
        print("run_id:", run.info.run_id)
        model = H2OAutoML(max_models=max_models,
                          max_runtime_secs=300,
                          seed=24,
                          nfolds=6)
        model.train(x=train_cols,
                    y=target_col,
                    training_frame=train_data,
                    validation_frame=test_data)
        mlflow.log_param("max_models", max_models)
        mlflow.log_metric("rmse", model.leader.rmse())

        mlflow.set_tag("mlflow_version", mlflow.__version__)
        mlflow.set_tag("h2o_version", h2o.__version__)
        mlflow.set_tag("model.leader.class", qname(model.leader.__class__))
        mlflow.set_tag("model.leader.estimator_type",
                       model.leader._estimator_type)
        mlflow.set_tag("num_leaderboard_models", model.leaderboard.nrows)

        lb = get_leaderboard(model, extra_columns='ALL')
        print(lb)

        path = "leaderboard.csv"
        h2o.export_file(lb, path=path, force=True)
        mlflow.log_artifact(path)

        from tabulate import tabulate
        df = lb.as_data_frame()
        table = tabulate(df, headers="keys", tablefmt="psql", showindex=False)
        path = "leaderboard.txt"
        with open(path, "w") as f:
            f.write(table)
        mlflow.log_artifact(path)

        df = df[["model_id"]]
        with open("models.csv", "w") as f:
            df.to_csv(f, index=False, header=False)
        mlflow.log_artifact("models.csv")

        mlflow.h2o.log_model(model.leader,
                             "h2o-model",
                             registered_model_name=model_name)
Example 32
def predictions(mod, test, test_X, run_id, allV, slice_no=0):
    # test = h2o.import_file(data)

    if slice_no > 0:
        test = test[0:slice_no, :]
        test_X = test_X[0:slice_no, :]

    if allV is not None:
        ivd = get_all_variables_csv(allV)
        X = check_all_variables(test, ivd, y)

    mod_perf = mod.model_performance(test)

    stats_test = model_performance_stats(mod_perf)

    n = run_id + '_test_stats.json'
    dict_to_json(stats_test, n)
    try:
        cf = mod_perf.confusion_matrix(metrics=["f1", "f2", "f0point5", "accuracy", "precision", "recall",
                                                "specificity", "absolute_mcc", "min_per_class_accuracy",
                                                "mean_per_class_accuracy"])
        cf_df = cf[0].table.as_data_frame()
        cf_df.to_csv(run_id + '_test_confusion_matrix.csv')
    except Exception:
        pass

    predictions = mod.predict(test_X)

    predictions_df = None

    try:
        seq = h2o.H2OFrame.from_python(np.arange(1, (test.shape[0] + 1)).tolist(), column_names=['Seer Row ID'])
        test_id = seq.cbind(test)
        predictions_df = test_id.cbind(predictions)
    except Exception:
        try:
            seq = h2o.H2OFrame.from_python(np.arange(1, (test_X.shape[0] + 1)).tolist(), column_names=['Seer Row ID'])
            test_id = seq.cbind(test_X)
            predictions_df = test_id.cbind(predictions)
        except Exception:
            pass

    n = run_id + '_predictions.csv'
    h2o.export_file(predictions_df, n)

    return
Example 33
def s3_import_export():
    local_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    for scheme in ["s3n", "s3a"]:
        timestamp = datetime.today().utcnow().strftime("%Y%m%d-%H%M%S")
        unique_suffix = str(uuid.uuid4())
        s3_path = scheme + "://test.0xdata.com/h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                  timestamp + "." + unique_suffix + ".csv.zip"
        h2o.export_file(local_frame, s3_path)
        s3_frame = h2o.import_file(s3_path)
        assert_frame_equal(local_frame.as_data_frame(),
                           s3_frame.as_data_frame())

        #Delete the file afterwards
        s3 = boto3.resource('s3')
        s3.Object(bucket_name='test.0xdata.com', key="h2o-hadoop-tests/test-export/" + scheme + "/exported." + \
                                                     timestamp + "." + unique_suffix + ".csv.zip").delete()
Example 34
def export_time():
    pros_hex = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/citibike-nyc/2013-07.csv"))

    def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
        return ''.join(random.choice(chars) for _ in range(size))

    fname = id_generator() + "_prediction.csv"

    path = pyunit_utils.locate("results")
    dname = path + "/" + fname

    start = time.time()
    h2o.export_file(pros_hex, dname)
    end = time.time()
    time_to_export = end - start
    print("Time to export is", time_to_export, "seconds.")
    assert time_to_export > 2, "File export happened instantly (less than 2 seconds). Please check if h2o.export_file() properly exported the file"
Example 35
def export_custom_separator():
    data = {'col1': [1, 2], 'col2': [3, 4]}
    expected = pd.DataFrame(data=data)
    frame = h2o.H2OFrame(expected)

    target_default = path.join(pyunit_utils.locate("results"),
                               "export_file_default_sep.csv")
    target_custom = path.join(pyunit_utils.locate("results"),
                              "export_file_custom_sep.csv")

    h2o.export_file(frame, target_default)
    h2o.export_file(frame, target_custom, sep="|")

    parsed_default = pd.read_csv(target_default, sep=",")
    parsed_custom = pd.read_csv(target_custom, sep="|")

    assert expected.equals(parsed_default)
    assert expected.equals(parsed_custom)
Example 36
def gbm_on_hadoop():
  local_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
  hdfs_path = 'hdfs:///user/jenkins/tests/prostate_export'
  h2o.export_file(local_frame, hdfs_path, force=True)
  df = h2o.import_file(hdfs_path)
  train = df.drop("ID")
  vol = train['VOL']
  vol[vol == 0] = None
  gle = train['GLEASON']
  gle[gle == 0] = None
  train['CAPSULE'] = train['CAPSULE'].asfactor()
  my_gbm = H2OGradientBoostingEstimator(ntrees=50,
                                        learn_rate=0.1,
                                        distribution="bernoulli")
  my_gbm.train(x=list(range(1, train.ncol)),
               y="CAPSULE",
               training_frame=train,
               validation_frame=train)
  my_gbm.predict(train)
Example 37
    def calculate(self):
        # Get the values entered by the user
        # in the entry widgets.
        ntrees_loc = int(self.ntrees_entry.get())
        min_rows_loc = int(self.min_rows_entry.get())
        max_depth_loc = int(self.max_depth_entry.get())
        self.ntrees.set(ntrees_loc)
        self.min_rows.set(min_rows_loc)
        self.max_depth.set(max_depth_loc)
        # Import the titanic dataset into H2O:
        titanic = h2o.import_file("train.csv")
        test = h2o.import_file("test.csv")
        # Set the predictors and response;
        # set the response as a factor:
        titanic["Survived"] = titanic["Survived"].asfactor()
        predictors = ['Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
        response = "Survived"
        # Split the dataset into a train and valid set:
        ratio = float(self.slider.get())
        train, valid = titanic.split_frame(ratios=[ratio], seed=1234)
        # Build and train the model:
        titanic_drf = H2ORandomForestEstimator(ntrees=ntrees_loc,
                                               max_depth=max_depth_loc,
                                               min_rows=min_rows_loc,
                                               calibrate_model=True,
                                               calibration_frame=valid,
                                               binomial_double_trees=True)
        titanic_drf.train(x=predictors,
                          y=response,
                          training_frame=train,
                          validation_frame=valid)
        # Eval performance:
        perf = titanic_drf.model_performance()
        # Generate predictions on a validation set (if necessary):
        pred = titanic_drf.predict(valid)
        pred2 = titanic_drf.predict(test)

        self.auc.set(perf.auc())
        self.logloss.set(perf.logloss())
        # print(pred2)
        h2o.export_file(pred, 'result_pred.csv')
        h2o.export_file(pred2, 'result_pred2.csv')
Example 38
def gbm_on_hadoop():
    local_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    hdfs_path = 'hdfs:///user/jenkins/tests/prostate_export'
    h2o.export_file(local_frame, hdfs_path, force=True)
    df = h2o.import_file(hdfs_path)
    train = df.drop("ID")
    vol = train['VOL']
    vol[vol == 0] = None
    gle = train['GLEASON']
    gle[gle == 0] = None
    train['CAPSULE'] = train['CAPSULE'].asfactor()
    my_gbm = H2OGradientBoostingEstimator(ntrees=50,
                                          learn_rate=0.1,
                                          distribution="bernoulli")
    my_gbm.train(x=list(range(1, train.ncol)),
                 y="CAPSULE",
                 training_frame=train,
                 validation_frame=train)
    my_gbm.predict(train)
Example 39
    def auto(self):
        ntrees_loc = 400
        min_rows_loc = 10
        max_depth_loc = 9
        self.ntrees.set(ntrees_loc)
        self.min_rows.set(min_rows_loc)
        self.max_depth.set(max_depth_loc)
        # Import the titanic dataset into H2O:
        titanic = h2o.import_file("train.csv")
        test = h2o.import_file("test.csv")
        # Set the predictors and response;
        # set the response as a factor:
        titanic["Survived"] = titanic["Survived"].asfactor()
        predictors = ['Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
        response = "Survived"
        # Split the dataset into a train and valid set:
        train, valid = titanic.split_frame(ratios=[.8], seed=1234)
        # Build and train the model:
        titanic_drf = H2ORandomForestEstimator(ntrees=ntrees_loc,
                                               max_depth=max_depth_loc,
                                               min_rows=min_rows_loc,
                                               calibrate_model=True,
                                               calibration_frame=valid,
                                               binomial_double_trees=True)
        titanic_drf.train(x=predictors,
                          y=response,
                          training_frame=train,
                          validation_frame=valid)
        # Eval performance:
        perf = titanic_drf.model_performance()
        # Generate predictions on a validation set (if necessary):
        pred = titanic_drf.predict(valid)
        # TODO: also apply this to the test data; the same is done in calculate()
        pred2 = titanic_drf.predict(test)
        pred2.describe()
        test.describe()

        self.auc.set(perf.auc())
        self.logloss.set(perf.logloss())
        h2o.export_file(pred, 'result_pred.csv')
        h2o.export_file(pred2, 'result_pred2.csv')
Example 40
def export_file():
    pros_hex = h2o.upload_file(
        pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    pros_hex[1] = pros_hex[1].asfactor()
    pros_hex[3] = pros_hex[3].asfactor()
    pros_hex[4] = pros_hex[4].asfactor()
    pros_hex[5] = pros_hex[5].asfactor()
    pros_hex[8] = pros_hex[8].asfactor()

    p_sid = pros_hex.runif()
    pros_train = pros_hex[p_sid > .2, :]
    pros_test = pros_hex[p_sid <= .2, :]

    glm = H2OGeneralizedLinearEstimator(family="binomial")
    glm.train(x=list(range(2, pros_hex.ncol)),
              y=1,
              training_frame=pros_train)
    mypred = glm.predict(pros_test)

    def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
        return ''.join(random.choice(chars) for _ in range(size))

    fname = id_generator() + "_prediction.csv"

    path = pyunit_utils.locate("results")
    dname = path + "/" + fname

    h2o.export_file(mypred, dname)

    py_pred = pd.read_csv(dname)
    print(py_pred.head())
    h_pred = mypred.as_data_frame(True)
    print(h_pred.head())

    #Test to check if py_pred & h_pred are identical
    try:
        assert_frame_equal(py_pred, h_pred)
        return True
    except AssertionError:
        return False
Example 41
def coxph_mojo_predict_with_interactions(sandbox_dir):
    np.random.seed(1234)
    n_rows = 10
    start = np.random.choice([0, 1, 2, 3, 4], size=10)
    delta = np.random.choice([1, 2, 3], size=10)
    data = {
        "start": start,
        "stop": start + delta,
        "X1": np.random.randn(n_rows),
        "X2": np.random.randn(n_rows),
        "age": np.random.choice(["young", "old"], 10),
        "W": np.random.choice([10, 20], size=n_rows),
        "Offset": np.random.uniform(0, 1, 10),
        "Y": np.random.choice([0, 1], size=n_rows)
    }
    train = h2o.H2OFrame(pandas.DataFrame(data))
    train["age"] = train["age"].asfactor()
    h2o_model = H2OCoxProportionalHazardsEstimator(start_column="start",
                                                   stop_column="stop",
                                                   weights_column="W",
                                                   offset_column="Offset",
                                                   interactions=["X1", "X2"],
                                                   stratify_by=["age"])

    h2o_model.train(x=["X1", "X2", "age"], y="Y", training_frame=train)
    mojo = pyunit_utils.download_mojo(h2o_model)

    # export the training frame to a CSV file for MOJO scoring
    input_csv = "%s/in.csv" % sandbox_dir
    h2o.export_file(train, input_csv)
    pandas_frame = pandas.read_csv(input_csv)

    h2o_prediction = h2o_model.predict(train)
    mojo_prediction = h2o.mojo_predict_pandas(dataframe=pandas_frame, **mojo)

    assert len(mojo_prediction) == h2o_prediction.nrow
    assert_frame_equal(h2o_prediction.as_data_frame(use_pandas=True),
                       mojo_prediction,
                       check_dtype=False)
Example 42
def h2oexport_file():
    """
    Python API test: h2o.export_file(frame, path, force=False, parts=1).  Note that force=True is only honored if
    parts=1.  Otherwise, an error will be thrown.
    """
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    try:
        results_dir = pyunit_utils.locate("results")    # find directory path to results folder
        final_path = os.path.join(results_dir, 'frameData')
        h2o.export_file(training_data, final_path, force=True, parts=1)       # save data
        assert os.path.isfile(final_path), "h2o.export_file() command is not working."
        final_dir_path = os.path.join(results_dir, 'multiFrame')
        h2o.export_file(training_data, final_dir_path, force=True, parts=-1)
        assert len(os.listdir(final_dir_path))>0, "h2o.export_file() command is not working."
    except Exception as e:
        if e.__class__.__name__ == 'ValueError' and 'File not found' in e.args[0]:
            print("Directory is not writable.  h2o.export_file() command is not tested.")
        else:
            assert e.__class__.__name__=='H2OResponseError' and \
                   'exportFrame: Cannot use path' in e.args[0]._props['dev_msg'], \
                "h2o.export_file() command is not working."
            print("Directory: {0} is not empty.  Delete or empy it before re-run.  h2o.export_file() "
                  "is not tested with multi-part export.".format(final_dir_path))
Example 43
"""

import os
import glob
import pandas
import uyulala

import h2o

h2o.init()

transformed = h2o.import_file(path=os.path.join(uyulala.dataDir, "transformed"), col_types={"DateCol": "enum"})

# predictionDF = h2o.as_list(transformed[['Symbol','DateCol']], use_pandas=True)

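# model files are named "model|<label>|<modelType>", so split on "|" to recover the metadata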
for modelFile in glob.glob(os.path.join(uyulala.modelsDir, "model|*")):
    fileName = modelFile.split("/")[-1]
    junk, label, modelType = fileName.split("|")
    model = h2o.load_model(path=modelFile)
    prediction = model.predict(transformed)
    # predictionDF = predictionDF.merge(h2o.as_list(prediction, use_pandas=True),left_index=True,right_index=True).rename(columns={'predict':label+'|'+modelType})
    transformed = transformed.cbind(prediction)


# predictionDF.to_csv(path=os.path.join(uyulala.dataDir,'predictions.csv'),index=False)

h2o.export_file(transformed, path=os.path.join(uyulala.dataDir, "predictions"), force=True, parts=-1)


# labels = h2o.import_file(path=os.path.join(uyulala.dataDir,'labeled'),col_types={'DateCol':'enum'})
Example 44
def mojo_predict_csv_test(target_dir):
    mojo_file_name = "prostate_gbm_model.zip"
    mojo_zip_path = os.path.join(target_dir, mojo_file_name)

    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))

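    # random 70/30 split of the prostate rows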
    r = prostate[0].runif()
    train = prostate[r < 0.70]
    test = prostate[r >= 0.70]

    # Getting first row from test data frame
    pdf = test[1, 2:]
    input_csv = "%s/in.csv" % target_dir
    output_csv = "%s/output.csv" % target_dir
    h2o.export_file(pdf, input_csv)

    # =================================================================
    # Regression
    # =================================================================
    regression_gbm1 = H2OGradientBoostingEstimator(distribution="gaussian")
    regression_gbm1.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=train)
    pred_reg = regression_gbm1.predict(pdf)
    p1 = pred_reg[0, 0]
    print("Regression prediction: " + str(p1))

    download_mojo(regression_gbm1, mojo_zip_path)

    print("\nPerforming Regression Prediction using MOJO @... " + target_dir)
    prediction_result = h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=mojo_zip_path,
                                                   output_csv_path=output_csv)
    print("Prediction result: " + str(prediction_result))
    assert p1 == float(prediction_result[0]['predict']), "expected predictions to be the same for binary and MOJO model for regression"

    # =================================================================
    # Binomial
    # =================================================================
    train[1] = train[1].asfactor()
    bernoulli_gbm1 = H2OGradientBoostingEstimator(distribution="bernoulli")

    bernoulli_gbm1.train(x=[2, 3, 4, 5, 6, 7, 8], y=1, training_frame=train)
    pred_bin = bernoulli_gbm1.predict(pdf)

    binary_prediction_0 = pred_bin[0, 1]
    binary_prediction_1 = pred_bin[0, 2]
    print("Binomial prediction: p0: " + str(binary_prediction_0))
    print("Binomial prediction: p1: " + str(binary_prediction_1))

    download_mojo(bernoulli_gbm1, mojo_zip_path)

    print("\nPerforming Binomial Prediction using MOJO @... " + target_dir)
    prediction_result = h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=mojo_zip_path,
                                                   output_csv_path=output_csv)

    mojo_prediction_0 = float(prediction_result[0]['0'])
    mojo_prediction_1 = float(prediction_result[0]['1'])
    print("Binomial prediction: p0: " + str(mojo_prediction_0))
    print("Binomial prediction: p1: " + str(mojo_prediction_1))

    assert binary_prediction_0 == mojo_prediction_0, "expected predictions to be the same for binary and MOJO model for Binomial - p0"
    assert binary_prediction_1 == mojo_prediction_1, "expected predictions to be the same for binary and MOJO model for Binomial - p1"

    # =================================================================
    # Multinomial
    # =================================================================
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris.csv"))

    r = iris[0].runif()
    train = iris[r < 0.90]
    test = iris[r >= 0.90]

    # Getting first row from test data frame
    pdf = test[1, 0:4]
    input_csv = "%s/in-multi.csv" % target_dir
    output_csv = "%s/output.csv" % target_dir
    h2o.export_file(pdf, input_csv)

    multi_gbm = H2OGradientBoostingEstimator()
    multi_gbm.train(x=['C1', 'C2', 'C3', 'C4'], y='C5', training_frame=train)

    pred_multi = multi_gbm.predict(pdf)
    multinomial_prediction_1 = pred_multi[0, 1]
    multinomial_prediction_2 = pred_multi[0, 2]
    multinomial_prediction_3 = pred_multi[0, 3]
    print("Multinomial prediction (Binary): p0: " + str(multinomial_prediction_1))
    print("Multinomial prediction (Binary): p1: " + str(multinomial_prediction_2))
    print("Multinomial prediction (Binary): p2: " + str(multinomial_prediction_3))

    download_mojo(multi_gbm, mojo_zip_path)

    print("\nPerforming Binomial Prediction using MOJO @... " + target_dir)
    prediction_result = h2o_utils.mojo_predict_csv(input_csv_path=input_csv, mojo_zip_path=mojo_zip_path,
                                                   output_csv_path=output_csv)

    mojo_prediction_1 = float(prediction_result[0]['Iris-setosa'])
    mojo_prediction_2 = float(prediction_result[0]['Iris-versicolor'])
    mojo_prediction_3 = float(prediction_result[0]['Iris-virginica'])
    print("Multinomial prediction (MOJO): p0: " + str(mojo_prediction_1))
    print("Multinomial prediction (MOJO): p1: " + str(mojo_prediction_2))
    print("Multinomial prediction (MOJO): p2: " + str(mojo_prediction_3))

    assert multinomial_prediction_1 == mojo_prediction_1, "expected predictions to be the same for binary and MOJO model for Multinomial - p0"
    assert multinomial_prediction_2 == mojo_prediction_2, "expected predictions to be the same for binary and MOJO model for Multinomial - p1"
    assert multinomial_prediction_3 == mojo_prediction_3, "expected predictions to be the same for binary and MOJO model for Multinomial - p2"
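Example 45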
def local_and_hdfs_frame_equality():
  local_frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
  hdfs_path = 'hdfs:///user/jenkins/tests/prostate_export'
  h2o.export_file(local_frame, hdfs_path, force=True)
  hdfs_frame = h2o.import_file(hdfs_path)
  assert_frame_equal(local_frame.as_data_frame(), hdfs_frame.as_data_frame())
Example 46
    assert all([x not in testing_grids for x in training_grids])
    # create vector allocating every obs to training or testing:
    training = np.array([x in training_grids for x in autocor])
    print "Proportion of data in training", sum(training)/len(training), "and prop_train =", prop_train

  # assert round(sum(training)/len(training), 2) == prop_train or round(sum(training)/len(training), 2) == prop_train + 1 or round(sum(training)/len(training), 2) == prop_train - 1
  print "len(training):", len(training) 
  print "d.dim[0]:", d.dim[0]
  #assert len(training) == d.dim[0]
  # Save to csv to then load into h2o later:
  print "Starting to save to csv format..."
  training = pd.DataFrame(training)
  training.to_csv(save_training_ind_fp, header=False, index=False)
  print "Done with saving training and testing sets for training data."

h2o.export_file(frame=d, path=save_training_data_fp, force=True)
h2o.export_file(frame=holdout, path=save_holdout_data_fp, force=True)

# Send email
email = False
if(email):
  import smtplib
  GMAIL_USERNAME = None
  GMAIL_PW = None
  RECIP = None
  SMTP_NUM = None
  session = smtplib.SMTP('smtp.gmail.com', SMTP_NUM)
  session.ehlo()
  session.starttls()
  session.login(GMAIL_USERNAME, GMAIL_PW)
  headers = "\r\n".join(["from: " + GMAIL_USERNAME,
Example 47
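# replace the 9999 missing-value sentinels in the lagged predictors with true NAs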
GP_lag = data['GP_lag']
GP_lag[GP_lag == 9999] = None
data['GP_lag'] = GP_lag

PSN_lag = data['PSN_lag']
PSN_lag[PSN_lag == 9999] = None
data['PSN_lag'] = PSN_lag

nino34_lag = data['nino34_lag']
nino34_lag[nino34_lag == 9999] = None
data['nino34_lag'] = nino34_lag

# h2o.remove([LST_lag, NDVI_lag, EVI_lag, EVI, PixelReliability, FPAR_lag, LAI_lag, GP_lag, PSN_lag, nino34_lag])
# del LST_lag, NDVI_lag, EVI_lag, EVI, PixelReliability, FPAR_lag, LAI_lag, GP_lag, PSN_lag, nino34_lag

h2o.export_file(frame=data, path=save_data_fp, force=True)

# Send email
email = False
if(email):
  import smtplib
  GMAIL_USERNAME = None
  GMAIL_PW = None
  RECIP = None
  SMTP_NUM = None
  session = smtplib.SMTP('smtp.gmail.com', SMTP_NUM)
  session.ehlo()
  session.starttls()
  session.login(GMAIL_USERNAME, GMAIL_PW)
  headers = "\r\n".join(["from: " + GMAIL_USERNAME,
                         "subject: " + "Finished running script: " + __file__,