def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

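        # run a quick canary test to determine if the hive-exec version on the cluster is too old to parse ORC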
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_iris_import_types_orc.py"))
            pass
        else:

            numElements2Compare = 100
            tol_time = 200
            tol_numeric = 1e-5

            hdfs_orc_file = "/datasets/orc_parser/orc/iris.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/csv/iris.csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            h2oframe_csv = h2o.import_file(url_csv)
            data_types = ['real', 'real', 'real', 'real', 'enum']
            h2oframe_orc = h2o.import_file(url_orc, col_types = data_types)

            # compare the two frames
            assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare, tol_time, tol_numeric,
                                               True), "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        tol_time = 200  # comparing in ms or ns
        tol_numeric = 1e-5  # tolerance for comparing other numeric fields
        numElements2Compare = 0  # choose number of elements per column to compare.  Save test time.

        hdfs_csv_file = "/datasets/orc_parser/synthetic_perfect_separation_csv"
        hdfs_orc_file = "/datasets/orc_parser/synthetic_perfect_separation_orc"

        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        multi_file_csv = h2o.import_file(url_csv)
        multi_file_orc = h2o.import_file(url_orc)

        # make sure the multi-file ORC and multi-file CSV imports create the same H2O frame
        assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv, numElements2Compare, tol_time,
                                           tol_numeric, True), "H2O frames parsed from the orc and csv " \
                                                               "files are different!"
    else:
        raise EnvironmentError
def xgboost_estimation():
    if ("XGBoost" not in h2o.cluster().list_all_extensions()):
        print("XGBoost extension is not present.  Skipping test. . .")
        return

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if not hadoop_namenode_is_accessible:
        raise EnvironmentError("Hadoop namenode is not accessible")

    hdfs_name_node = pyunit_utils.hadoop_namenode()

    full_data = createData(500000, 500)

    myX = list(full_data.col_names)
    myX.remove("IsDepDelayed")

    xgb = H2OXGBoostEstimator(seed=42, tree_method="approx")
    # train() fits the estimator in place; use the estimator's own predict/performance methods
    xgb.train(y="IsDepDelayed",
              x=myX[0:480],
              training_frame=full_data,
              model_id="xgboost")

    print(xgb)

    pred = xgb.predict(full_data)
    perf = xgb.model_performance(full_data)
    return perf
def createData(nrows, ncols):
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_airlines_file = "/datasets/airlines_all.05p.csv"

    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_airlines_file)
    airlines = h2o.import_file(url)

    myX = ["Year", "Month", "DayofMonth", "DayOfWeek", "Distance"]
    myY = "IsDepDelayed"

    allCols = list(myX)
    allCols.append(myY)

    airlines = airlines[allCols]

    num_new_features = ncols - airlines.ncol
    sample_data = h2o.create_frame(rows=nrows,
                                   cols=num_new_features,
                                   categorical_fraction=0,
                                   seed=1234,
                                   seed_for_column_types=1234)

    new_rows = nrows - airlines.nrow
    if (nrows > 0):
        extra_rows = airlines[0:nrows, :]
        airlines = airlines.rbind(extra_rows)

    airlines = airlines[0:nrows, :]
    full_data = airlines.cbind(sample_data)

    return full_data
    def test_frame_reload(self):
        name_node = pyunit_utils.hadoop_namenode()
        work_dir = "hdfs://%s%s" % (name_node, utils.get_workdir())
        dataset = "/datasets/mnist/train.csv.gz"

        try:
            cluster_1 = utils.start_cluster("saver")
            h2o.connect(url=cluster_1)
            df_orig = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
            df_key = df_orig.key
            df_pd_orig = df_orig.as_data_frame()
            df_orig.save(work_dir)
            h2o.connection().close()
        finally:
            utils.stop_cluster("saver")

        try:
            cluster_2 = utils.start_cluster("loader")
            h2o.connect(url=cluster_2)
            df_loaded = h2o.load_frame(df_key, work_dir)
            df_pd_loaded = df_loaded.as_data_frame()
            h2o.connection().close()
        finally:
            utils.stop_cluster("loader")

        self.assertTrue(df_pd_orig.equals(df_pd_loaded))
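

# The `utils` helper used above (get_workdir/start_cluster/stop_cluster) is not defined in this
# listing.  A minimal sketch of what such a `utils` module could look like is given below; the
# shell scripts and environment variables are assumptions for illustration, not the actual H2O
# test harness.
import os
import subprocess


def get_workdir():
    # hypothetical: take the HDFS working directory from the environment
    return os.environ.get("H2O_TEST_WORKDIR", "/user/jenkins/frame-reload-test")


def start_cluster(name):
    # hypothetical: launch a named external H2O cluster and return the URL to connect to
    subprocess.check_call(["./scripts/start-h2o-cluster.sh", name])
    return "http://{0}:54321".format(os.environ.get("H2O_CLUSTER_HOST", "localhost"))


def stop_cluster(name):
    # hypothetical: shut the named cluster back down
    subprocess.call(["./scripts/stop-h2o-cluster.sh", name])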
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        # run a quick test to determine if the hive-exec is too old.

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_prostate_orc.py"))
            pass
        else:

            tol_time = 200              # comparing in ms or ns
            tol_numeric = 1e-5          # tolerance for comparing other numeric fields
            numElements2Compare = 10   # choose number of elements per column to compare.  Save test time.

            hdfs_orc_file = "/datasets/orc_parser/orc/prostate_NA.orc"
            hdfs_csv_file = "/datasets/orc_parser/csv/prostate_NA.csv"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            h2oOrc = h2o.import_file(url_orc)
            h2oCsv = h2o.import_file(url_csv)

            # compare the two frames
            assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        tol_time = 200              # comparing in ms or ns
        tol_numeric = 1e-5          # tolerance for comparing other numeric fields
        numElements2Compare = 100   # choose number of elements per column to compare.  Save test time.

        allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                       "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                       "/datasets/orc_parser/orc/orc_split_elim.orc"]

        allCsvFiles = ["/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                       "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                       "/datasets/orc_parser/csv/orc_split_elim.csv"]

        for fIndex in range(len(allOrcFiles)):
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex])
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, allCsvFiles[fIndex])
            h2oOrc = h2o.import_file(url_orc)
            h2oCsv = h2o.import_file(url_csv)

            # compare the two frames
            assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
          "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            multi_file_csv = h2o.import_file(url_csv)
            multi_file_orc = h2o.import_file(url_orc)

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        numElements2Compare = 100
        tol_time = 200
        tol_numeric = 1e-5

        hdfs_orc_file = "/datasets/orc_parser/orc/iris.orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        hdfs_csv_file = "/datasets/orc_parser/csv/iris.csv"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        h2oframe_csv = h2o.import_file(url_csv)
        data_types = ['real', 'real', 'real', 'real', 'enum']
        h2oframe_orc = h2o.import_file(url_orc, col_types = data_types)

        # compare the two frames
        assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare, tol_time, tol_numeric,
                                           True), "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hive_import():
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hive_host = os.getenv("HIVE_HOST")
    connection_url = "jdbc:hive2://{0}:10000/default".format(hive_host)
    krb_enabled = os.getenv('KRB_ENABLED', 'false').lower() == 'true'
    if krb_enabled:
        connection_url += ";auth=delegationToken"

    # read original
    file_url = "hdfs://{0}{1}".format(
        hdfs_name_node, "/user/jenkins/smalldata/chicago/chicagoCensus.csv")
    dataset_original = h2o.import_file(file_url)

    # read TABLE from Hive JDBC
    table_jdbc = h2o.import_sql_table(connection_url,
                                      "chicago",
                                      "",
                                      "",
                                      fetch_mode="SINGLE")
    table_jdbc = adapt_frame(table_jdbc, column_prefix="chicago.")
    pyunit_utils.compare_frames_local(dataset_original, table_jdbc, prob=1)

    # read TABLE from Hive FS
    table_direct = h2o.import_hive_table(connection_url, "chicago")
    table_direct = adapt_frame(table_direct)
    pyunit_utils.compare_frames_local(dataset_original, table_direct, prob=1)
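

# `adapt_frame` is not defined in this listing.  A minimal sketch is given below, assuming its
# only job is to strip the table-name prefix that the Hive JDBC import adds to column names so the
# frame lines up with the CSV original; the real helper may also adjust column types.
def adapt_frame(frame, column_prefix=""):
    # drop e.g. the leading "chicago." from every column name
    if column_prefix:
        frame.set_names([name[len(column_prefix):] if name.startswith(column_prefix) else name
                         for name in frame.names])
    return frame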
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        multi_file_csv = h2o.import_file(url_csv)
        multi_file_orc = h2o.import_file(url_orc)

        multi_file_csv.summary()
        csv_summary = h2o.frame(
            multi_file_csv.frame_id)["frames"][0]["columns"]

        multi_file_orc.summary()
        orc_summary = h2o.frame(
            multi_file_orc.frame_id)["frames"][0]["columns"]

        pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
def xgb_repro():
    name_node = pyunit_utils.hadoop_namenode()
    data = h2o.import_file(
        "hdfs://" + name_node +
        "/user/jenkins/bigdata/laptop/airlinesBillion_7Columns_5GB.csv",
        na_strings=["NA"])

    train, test = data.split_frame(ratios=[0.99], seed=1)
    x = data.names
    y = "C31"
    x.remove(y)

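    # train the same model twice on identical data and verify the two sets of predictions match (reproducibility check)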
    model = H2OXGBoostEstimator(ntrees=5,
                                max_depth=6,
                                learn_rate=0.1,
                                seed=12345,
                                backend="CPU")
    model.train(x=x, y=y, training_frame=train)
    p1 = model.predict(test)
    model.train(x=x, y=y, training_frame=train)
    p2 = model.predict(test)
    p = p1.cbind(p2)

    diff = (p[1] != p[4]).as_data_frame()
    ndiffs = 0
    for i in range(len(diff)):
        if diff.iat[i, 0] != 0:
            ndiffs += 1

    assert ndiffs == 0, "diffs %d out of %d rows" % (ndiffs, p1.nrows)
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        assert pyunit_utils.expect_warnings(url_orc, "UserWarning:", "Skipping field:", 1),\
            "Expect warnings from orc parser for file "+url_orc+"!"

        hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        assert pyunit_utils.expect_warnings(url_orc, "UserWarning:", "Skipping field:", 1), \
            "Expect warnings from orc parser for file "+url_orc+"!"

        hdfs_orc_file = "/datasets/orc_parser/orc/nulls-at-end-snappy.orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        assert pyunit_utils.expect_warnings(url_orc, "UserWarning:", "Skipping field:", 1), \
            "Expect warnings from orc parser for file "+url_orc+"!"

    else:
        raise EnvironmentError
def hdfs_kmeans():
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
    hdfs_covtype_file = "/datasets/runit/covtype.data"

    print("Import iris_wheader.csv from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
    iris_h2o = h2o.import_file(url)
    n = iris_h2o.nrow
    print("rows: {0}".format(n))
    assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)

    print("Running KMeans on iris")
    iris_km = H2OKMeansEstimator(k=3, training_frame=iris_h2o[0:4], max_iterations=10)
    iris_km.train()
    print(iris_km)

    print("Importing covtype.data from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_covtype_file)
    covtype_h2o = h2o.import_file(url)
    n = covtype_h2o.nrow
    print("rows: {0}".format(n))
    assert n == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 581012)

    print("Running KMeans on covtype")
    covtype_km = H2OKMeansEstimator(training_frame=covtype_h2o[0:55], k=8, max_iterations=10)
    covtype_km.train()
    print(covtype_km)
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            multi_file_csv = h2o.import_file(url_csv)
            multi_file_orc = h2o.import_file(url_orc)

            multi_file_csv.summary()
            csv_summary = h2o.frame(
                multi_file_csv.frame_id)["frames"][0]["columns"]

            multi_file_orc.summary()
            orc_summary = h2o.frame(
                multi_file_orc.frame_id)["frames"][0]["columns"]

            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        hdfs_orc_file = "/datasets/orc_parser/air05_orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        hdfs_csv_file = "/datasets/orc_parser/air05_csv"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        startcsv = time.time()
        multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
        endcsv = time.time()

        csv_type_dict = multi_file_csv.types

        multi_file_csv.summary()
        csv_summary = h2o.frame(
            multi_file_csv.frame_id)["frames"][0]["columns"]

        col_ind_name = dict()
        # change column types from real to enum according to multi_file_csv column types
        for key_name in list(csv_type_dict):
            col_ind = key_name.split('C')
            new_ind = int(str(col_ind[1])) - 1
            col_ind_name[new_ind] = key_name

        col_types = []
        for ind in range(len(col_ind_name)):
            col_types.append(csv_type_dict[col_ind_name[ind]])

        startorc1 = time.time()
        multi_file_orc1 = h2o.import_file(url_orc)
        endorc1 = time.time()
        h2o.remove(multi_file_orc1)

        startorc = time.time()
        multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
        endorc = time.time()

        multi_file_orc.summary()
        orc_summary = h2o.frame(
            multi_file_orc.frame_id)["frames"][0]["columns"]

        print("************** CSV parse time is {0}".format(endcsv - startcsv))
        print("************** ORC (without column type forcing) parse time is {0}".format(endorc1 - startorc1))
        print("************** ORC (with column type forcing) parse time is {0}".format(endorc - startorc))
        # compare the frame read from ORC with forced column types against the CSV summary
        pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
def hdfs_import_bigCat():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        numTimes = 10
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        allFiles = [
            "/datasets/bigCatFiles/tenThousandCat10C.csv",
            "/datasets/bigCatFiles/hundredThousandCat10C.csv",
            "/datasets/bigCatFiles/oneMillionCat10C.csv",
            "/datasets/bigCatFiles/tenThousandCat50C.csv",
            "/datasets/bigCatFiles/hundredThousandCat50C.csv",
            "/datasets/bigCatFiles/tenThousandCat100C.csv",
            "/datasets/bigCatFiles/hundredThousandCat100C.csv",
            "/datasets/bigCatFiles/oneMillionCat50C.csv"
        ]
        reps = [10, 10, 10, 50, 50, 100, 100, 50]

        # run tests for 3 different dataset sizes per Tomas' request
        for ind in range(len(allFiles)):
            print("*******  Parsing file {0} ********".format(allFiles[ind]))
            runPerformance("hdfs://{0}{1}".format(hdfs_name_node, allFiles[ind]),
                           numTimes, reps[ind])

    else:
        raise EnvironmentError
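

# `runPerformance` is not shown in this listing.  A rough sketch is given below, assuming it
# simply times repeated imports of the given file and reports the average parse time; the meaning
# of the third argument (the `reps` value per file) is an assumption here and is only echoed back.
import time

import h2o


def runPerformance(file_url, num_times, rep_factor):
    durations = []
    for _ in range(num_times):
        start = time.time()
        frame = h2o.import_file(file_url)
        durations.append(time.time() - start)
        h2o.remove(frame)  # free the parsed frame so repeated runs do not accumulate keys
    print("File {0} (rep factor {1}): average parse time over {2} runs = {3:.2f}s".format(
        file_url, rep_factor, num_times, sum(durations) / len(durations)))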
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        tol_time = 200              # comparing in ms or ns
        tol_numeric = 1e-5          # tolerance for comparing other numeric fields
        numElements2Compare = 10   # choose number of elements per column to compare.  Save test time.

        hdfs_orc_file = "/datasets/orc_parser/orc/prostate_NA.orc"
        hdfs_csv_file = "/datasets/orc_parser/csv/prostate_NA.csv"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        h2oOrc = h2o.import_file(url_orc)
        h2oCsv = h2o.import_file(url_csv)

        # compare the two frames
        assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
            "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hdfs_kmeans_airlines():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_file = "/datasets/airlines_all.csv"

        print "Import airlines_all.csv from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file)
        airlines_h2o = h2o.import_file(url)
        n = airlines_h2o.nrow
        print "rows: {0}".format(n)

        print "Run k-means++ with k = 7 and max_iterations = 10"
        myX = range(8) + range(11, 16) + range(18, 21) + range(24, 29) + [9]
        airlines_km = h2o.kmeans(training_frame=airlines_h2o,
                                 x=airlines_h2o[myX],
                                 k=7,
                                 init="Furthest",
                                 max_iterations=10,
                                 standardize=True)
        print(airlines_km)
    else:
        raise EnvironmentError("Not running on H2O internal network.  No access to HDFS.")
def xgboost_estimation():
    if ("XGBoost" not in h2o.cluster().list_all_extensions()):
        print("XGBoost extension is not present.  Skipping test. . .")
        return
     

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if not hadoop_namenode_is_accessible:
        raise EnvironmentError("Hadoop namenode is not accessible")

    hdfs_name_node = pyunit_utils.hadoop_namenode()

    full_data = createData(500000, 500)

    myX = list(full_data.col_names)
    myX.remove("IsDepDelayed")

    xgb = H2OXGBoostEstimator(seed=42, tree_method="approx")
    # train() fits the estimator in place; use the estimator's own predict/performance methods
    xgb.train(y="IsDepDelayed", x=myX[0:480], training_frame=full_data, model_id="xgboost")

    print(xgb)

    pred = xgb.predict(full_data)
    perf = xgb.model_performance(full_data)
    return perf
    def test_frame_reload(self):
        name_node = pyunit_utils.hadoop_namenode()
        work_dir = "hdfs://%s%s" % (name_node, utils.get_workdir())
        dataset = "/datasets/iris_wheader.csv"

        ntrees_opts = [100, 120, 130, 140]
        learn_rate_opts = [0.01, 0.02, 0.03, 0.04]
        grid_size = len(ntrees_opts) * len(learn_rate_opts)
        print("max models %s" % grid_size)
        grid_id = "grid_ft_resume"
        hyper_parameters = {
            "learn_rate": learn_rate_opts,
            "ntrees": ntrees_opts
        }

        cluster_1_name = "grid1-py"
        try:
            cluster_1 = utils.start_cluster(cluster_1_name)
            h2o.connect(url=cluster_1)
            train = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
            grid = H2OGridSearch(H2OGradientBoostingEstimator,
                                 grid_id=grid_id,
                                 hyper_params=hyper_parameters,
                                 recovery_dir=work_dir)
            print("starting initial grid and sleeping...")
            grid.start(x=list(range(4)), y=4, training_frame=train)
            grid_in_progress = None
            times_waited = 0
            while (times_waited < 20) and (grid_in_progress is None or len(
                    grid_in_progress.model_ids) == 0):
                time.sleep(5)  # give it time to train some models
                times_waited += 1
                try:
                    grid_in_progress = h2o.get_grid(grid_id)
                except IndexError:
                    print("no models trained yet")
            print("done sleeping")
            h2o.connection().close()
        finally:
            utils.stop_cluster(cluster_1_name)

        cluster_2_name = "grid2-py"
        try:
            cluster_2 = utils.start_cluster(cluster_2_name)
            h2o.connect(url=cluster_2)
            loaded = h2o.load_grid("%s/%s" % (work_dir, grid_id),
                                   load_params_references=True)
            print("models after first run:")
            for x in sorted(loaded.model_ids):
                print(x)
            loaded.resume()
            print("models after second run:")
            for x in sorted(loaded.model_ids):
                print(x)
            print("Newly grained grid has %d models" % len(loaded.model_ids))
            self.assertEqual(len(loaded.model_ids), grid_size,
                             "The full grid was not trained.")
            h2o.connection().close()
        finally:
            utils.stop_cluster(cluster_2_name)
def createData(nrows, ncols):
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_airlines_file = "/datasets/airlines_all.05p.csv"

    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_airlines_file)
    airlines = h2o.import_file(url)

    myX = ["Year", "Month", "DayofMonth", "DayOfWeek", "Distance"]
    myY = "IsDepDelayed"

    allCols = list(myX)
    allCols.append(myY)

    airlines = airlines[allCols]

    num_new_features = ncols - airlines.ncol
    sample_data = h2o.create_frame(rows=nrows, cols=num_new_features, categorical_fraction=0,
                                   seed=1234, seed_for_column_types=1234)

    new_rows = nrows - airlines.nrow
    if (nrows > 0):
        extra_rows = airlines[0:nrows, :]
        airlines = airlines.rbind(extra_rows)

    airlines = airlines[0:nrows, :]
    full_data = airlines.cbind(sample_data)

    return full_data
def directory_import():

    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        url1 = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/iris/identical_iris_files/iris1.csv")
        url2 = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/iris/identical_iris_files/")
        print "Importing HDFS file {0} and directory {1}".format(url1, url2)
        frm_one = h2o.import_file(url1)
        frm_all = h2o.import_file(url2)

        r1, c1 = frm_one.dim
        ra, ca = frm_all.dim

        assert r1*3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1,ra)
        assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1,ca)
    else:
        raise EnvironmentError("Not running on H2O internal network.  No access to HDFS.")

    small1 = pyunit_utils.locate("smalldata/jira/identical_files/iris1.csv")
    small2 = small1.split("iris1.csv")[0]
    print "Importing smalldata file {0} and directory {1}".format(small1, small2)
    frm_one = h2o.import_file(small1)
    frm_all = h2o.import_file(small2)

    r1, c1 = frm_one.dim
    ra, ca = frm_all.dim

    assert r1*3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1,ra)
    assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1,ca)
def hdfs_basic():
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
    hdfs_iris_dir = "/datasets/runit/iris_test_train"

    print("Testing single file importHDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
    iris_h2o = h2o.import_file(url)
    n = iris_h2o.nrow
    print("rows: {0}".format(n))
    assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(
        n, 150)
    assert isinstance(
        iris_h2o,
        h2o.H2OFrame), "Wrong type. Expected H2OFrame, but got {0}".format(
            type(iris_h2o))
    print("Import worked")

    print("Testing directory importHDFS")
    urls = [
        "hdfs://{0}{1}/iris_test.csv".format(hdfs_name_node, hdfs_iris_dir),
        "hdfs://{0}{1}/iris_train.csv".format(hdfs_name_node, hdfs_iris_dir)
    ]
    iris_dir_h2o = h2o.import_file(urls)
    iris_dir_h2o.head()
    iris_dir_h2o.tail()
    n = iris_dir_h2o.nrow
    print("rows: {0}".format(n))
    assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(
        n, 150)
    assert isinstance(iris_dir_h2o, h2o.H2OFrame), "Wrong type. Expected H2OFrame, but got {0}".\
        format(type(iris_dir_h2o))
    print("Import worked")
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_airline_05_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/air05_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/air05_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
            endcsv = time.time()

            csv_type_dict = multi_file_csv.types

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            col_ind_name = dict()
            # change column types from real to enum according to multi_file_csv column types
            for key_name in list(csv_type_dict):
                col_ind = key_name.split('C')
                new_ind = int(str(col_ind[1]))-1
                col_ind_name[new_ind] = key_name

            col_types = []
            for ind in range(len(col_ind_name)):
                col_types.append(csv_type_dict[col_ind_name[ind]])

            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)

            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc,col_types=col_types)
            endorc = time.time()

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            print("************** CSV parse time is {0}".format(endcsv-startcsv))
            print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1))
            print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc))
            # compare the frame read from ORC with forced column types against the CSV summary
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5

        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_csv_file = "/datasets/air_csv_part"

        col_types = ['real', 'real', 'real', 'real', 'real', 'real', 'real', 'real', 'enum', 'real', 'enum', 'real',
                         'real', 'enum', 'real', 'real', 'enum', 'enum', 'real', 'enum', 'enum', 'real', 'real', 'real',
                         'enum', 'enum', 'enum', 'enum', 'enum', 'enum', 'enum']

        # import CSV file
        print("Import airlines 116M dataset in original csv format from HDFS")
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
        acs_orig = h2o.import_file(url_csv, na_strings=['\\N'], col_types=col_types)
        print("Data size number of rows: {0}, number of columns: {1}".format(acs_orig.nrow, acs_orig.ncol))

        seeds = [2297378124, 3849570216, 6733652048, 8915337442, 8344418400, 9416580152, 2598632624, 4977008454, 8273228579,
            8185554539, 3219125000, 2998879373, 7707012513, 5786923379, 5029788935, 935945790, 7092607078, 9305834745,
            6173975590, 5397294255]
        run_time_ms = []
        iterations = []
        objective = []
        num_runs = 1         # number of times to repeat experiments

        for ind in range(num_runs):
            acs_model = H2OGeneralizedLowRankEstimator(k = 10,
                                               transform = 'STANDARDIZE',
                                               loss = 'Quadratic',
                                               multi_loss="Categorical",
                                               model_id="clients_core_glrm",
                                               regularization_x="L2",
                                               regularization_y="L1",
                                               gamma_x=0.2,
                                               gamma_y=0.5,
                                               init="SVD",
                                               max_iterations = 200,
                                               seed=seeds[ind % len(seeds)])
            acs_model.train(x = acs_orig.names, training_frame= acs_orig, seed=seeds[ind % len(seeds)])
            run_time_ms.append(acs_model._model_json['output']['end_time'] - acs_model._model_json['output']['start_time'])
            iterations.append(acs_model._model_json['output']['iterations'])
            objective.append(acs_model._model_json['output']['objective'])

        print("Run time in ms: {0}".format(run_time_ms))
        print("number of iterations: {0}".format(iterations))
        print("objective function value: {0}".format(objective))
        sys.stdout.flush()
    else:
        raise EnvironmentError
def kmeans_mllib():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print("Import BigCross.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow

        err_mllib = np.genfromtxt(
            pyunit_utils.locate("smalldata/mllib_bench/bigcross_wcsse.csv"),
            delimiter=",",
            skip_header=1)
        ncent = [int(err_mllib[r][0]) for r in range(len(err_mllib))]

        for k in ncent:
            print(
                "Run k-means++ with k = {0} and max_iterations = 10".format(k))
            cross_km = h2o.kmeans(training_frame=cross_h2o,
                                  x=cross_h2o,
                                  k=k,
                                  init="PlusPlus",
                                  max_iterations=10,
                                  standardize=False)

            clust_mllib = np.genfromtxt(
                pyunit_utils.locate("smalldata/mllib_bench/bigcross_centers_" +
                                    str(k) + ".csv"),
                delimiter=",").tolist()
            clust_h2o = cross_km.centers()

            # Sort in ascending order by first dimension for comparison purposes
            clust_mllib.sort(key=lambda x: x[0])
            clust_h2o.sort(key=lambda x: x[0])

            print("\nMLlib Cluster Centers:\n")
            print(clust_mllib)
            print("\nH2O Cluster Centers:\n")
            print(clust_h2o)

            wcsse_mllib = err_mllib[err_mllib[0:4, 0].tolist().index(k)][1]
            wcsse_h2o = old_div(cross_km.tot_withinss(), n)
            print("\nMLlib Average Within-Cluster SSE: \n".format(wcsse_mllib))
            print("H2O Average Within-Cluster SSE: \n".format(wcsse_h2o))
            assert wcsse_h2o == wcsse_mllib, "Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O " \
                                             "got {1}".format(wcsse_mllib, wcsse_h2o)
    else:
        raise EnvironmentError
def hdfs_basic():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
        hdfs_iris_dir = "/datasets/runit/iris_test_train"

        #----------------------------------------------------------------------
        # Single file cases.
        #----------------------------------------------------------------------

        print "Testing single file importHDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
        iris_h2o = h2o.import_file(url)
        iris_h2o.head()
        iris_h2o.tail()
        n = iris_h2o.nrow
        print "rows: {0}".format(n)
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(
            n, 150)
        assert isinstance(
            iris_h2o,
            h2o.H2OFrame), "Wrong type. Expected H2OFrame, but got {0}".format(
                type(iris_h2o))
        print "Import worked"

        #----------------------------------------------------------------------
        # Directory file cases.
        #----------------------------------------------------------------------

        print "Testing directory importHDFS"
        urls = [
            "hdfs://{0}{1}/iris_test.csv".format(hdfs_name_node,
                                                 hdfs_iris_dir),
            "hdfs://{0}{1}/iris_train.csv".format(hdfs_name_node,
                                                  hdfs_iris_dir)
        ]
        iris_dir_h2o = h2o.import_file(urls)
        iris_dir_h2o.head()
        iris_dir_h2o.tail()
        n = iris_dir_h2o.nrow
        print "rows: {0}".format(n)
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(
            n, 150)
        assert isinstance(iris_dir_h2o, h2o.H2OFrame), "Wrong type. Expected H2OFrame, but got {0}".\
            format(type(iris_dir_h2o))
        print "Import worked"
    else:
        raise EnvironmentError("Not running on H2O internal network.  No access to HDFS.")
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5

        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_orc_parser.py"))
            pass
        else:

            allOrcFiles = [
                "/datasets/orc_parser/orc/TestOrcFile.columnProjection.orc",
                "/datasets/orc_parser/orc/bigint_single_col.orc",
                "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc",
                "/datasets/orc_parser/orc/bool_single_col.orc",
                "/datasets/orc_parser/orc/demo-11-zlib.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                "/datasets/orc_parser/orc/demo-12-zlib.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                "/datasets/orc_parser/orc/double_single_col.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV11.orc",
                "/datasets/orc_parser/orc/float_single_col.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV12.orc",
                "/datasets/orc_parser/orc/int_single_col.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testPredicatePushdown.orc",
                "/datasets/orc_parser/orc/nulls-at-end-snappy.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testSnappy.orc",
                "/datasets/orc_parser/orc/orc_split_elim.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testStripeLevelStats.orc",
                "/datasets/orc_parser/orc/smallint_single_col.orc",
                "/datasets/orc_parser/orc/string_single_col.orc",
                "/datasets/orc_parser/orc/tinyint_single_col.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testWithoutIndex.orc"
            ]

            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node,
                                                 allOrcFiles[fIndex])
                tab_test = h2o.import_file(url_orc)
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format(
                      "pyunit_INTERNAL_HDFS_timestamp_date_orc.py"))
            pass
        else:
            origTZ = h2o.cluster().timezone
            newZone = 'America/Los_Angeles'
            h2o.cluster().timezone = newZone
            tol_time = 200  # comparing in ms or ns
            tol_numeric = 1e-5  # tolerance for comparing other numeric fields
            numElements2Compare = 100  # choose number of elements per column to compare.  Save test time.

            allOrcFiles = [
                "/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                "/datasets/orc_parser/orc/orc_split_elim.orc"
            ]

            allCsvFiles = [
                "/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                "/datasets/orc_parser/csv/orc_split_elim.csv"
            ]

            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node,
                                                 allOrcFiles[fIndex])
                url_csv = "hdfs://{0}{1}".format(hdfs_name_node,
                                                 allCsvFiles[fIndex])
                h2oOrc = h2o.import_file(url_orc)
                h2oCsv = h2o.import_file(url_csv)

                # compare the two frames
                assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                    "H2O frame parsed from orc and csv files are different!"

            h2o.cluster().timezone = origTZ
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_baddata_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(
                url_orc,
                warn_phrase="UserWarning:",
                warn_string_of_interest="Skipping field:",
                in_hdfs=True,
                number_of_times=1
            ), "Expect warnings from orc parser for file " + url_orc + "!"

            hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(
                url_orc,
                warn_phrase="UserWarning:",
                warn_string_of_interest="Skipping field:",
                in_hdfs=True,
                number_of_times=1
            ), "Expect warnings from orc parser for file " + url_orc + "!"

            hdfs_orc_file = "/datasets/orc_parser/orc/nulls-at-end-snappy.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(
                url_orc,
                warn_phrase="UserWarning:",
                warn_string_of_interest="Long.MIN_VALUE:",
                in_hdfs=True,
                number_of_times=1
            ), "Expect warnings from orc parser for file " + url_orc + "!"

    else:
        raise EnvironmentError
def kmeans_mllib():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print "Import BigCross.data from HDFS"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow

        err_mllib = np.genfromtxt(
            pyunit_utils.locate("smalldata/mllib_bench/bigcross_wcsse.csv"), delimiter=",", skip_header=1
        )
        ncent = [int(err_mllib[r][0]) for r in range(len(err_mllib))]

        for k in ncent:
            print "Run k-means++ with k = {0} and max_iterations = 10".format(k)
            cross_km = h2o.kmeans(
                training_frame=cross_h2o, x=cross_h2o, k=k, init="PlusPlus", max_iterations=10, standardize=False
            )

            clust_mllib = np.genfromtxt(
                pyunit_utils.locate("smalldata/mllib_bench/bigcross_centers_" + str(k) + ".csv"), delimiter=","
            ).tolist()
            clust_h2o = cross_km.centers()

            # Sort in ascending order by first dimension for comparison purposes
            clust_mllib.sort(key=lambda x: x[0])
            clust_h2o.sort(key=lambda x: x[0])

            print "\nMLlib Cluster Centers:\n"
            print clust_mllib
            print "\nH2O Cluster Centers:\n"
            print clust_h2o

            wcsse_mllib = err_mllib[err_mllib[0:4, 0].tolist().index(k)][1]
            wcsse_h2o = cross_km.tot_withinss() / n
            print "\nMLlib Average Within-Cluster SSE: \n".format(wcsse_mllib)
            print "H2O Average Within-Cluster SSE: \n".format(wcsse_h2o)
            assert wcsse_h2o == wcsse_mllib, (
                "Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O "
                "got {1}".format(wcsse_mllib, wcsse_h2o)
            )
    else:
        raise EnvironmentError("Not running on H2O internal network.  No access to HDFS.")
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5

        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_orc_parser.py"))
            pass
        else:

            allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.columnProjection.orc",
                           "/datasets/orc_parser/orc/bigint_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc",
                           "/datasets/orc_parser/orc/bool_single_col.orc",
                           "/datasets/orc_parser/orc/demo-11-zlib.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                           "/datasets/orc_parser/orc/demo-12-zlib.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                           "/datasets/orc_parser/orc/double_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV11.orc",
                           "/datasets/orc_parser/orc/float_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV12.orc",
                           "/datasets/orc_parser/orc/int_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testPredicatePushdown.orc",
                           "/datasets/orc_parser/orc/nulls-at-end-snappy.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testSnappy.orc",
                           "/datasets/orc_parser/orc/orc_split_elim.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testStripeLevelStats.orc",
                           "/datasets/orc_parser/orc/smallint_single_col.orc",
                           "/datasets/orc_parser/orc/string_single_col.orc",
                           "/datasets/orc_parser/orc/tinyint_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testWithoutIndex.orc"]


            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex])
                tab_test = h2o.import_file(url_orc)
    else:
        raise EnvironmentError
def pubdev_1421():
    

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_airlines_test_file  = "/datasets/airlines.test.csv"

        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_airlines_test_file)
        air_test = h2o.import_file(url)
    else:
        raise EnvironmentError("Not running on H2O internal network.  No access to HDFS.")
def hdfs_kmeans_converge():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print("Import BigCross.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow
        print("rows: {0}".format(n))
        ncent = 3
        miters = 10

        print("Run k-means with k = {0} and max_iterations = {1}".format(
            ncent, miters))
        cross1_km = h2o.kmeans(training_frame=cross_h2o,
                               x=cross_h2o[0:57],
                               k=ncent,
                               max_iterations=miters)
        print(cross1_km)

        print(
            "Run k-means with init = final cluster centers and max_iterations = 1"
        )
        init_centers = h2o.H2OFrame(cross1_km.centers())
        cross2_km = h2o.kmeans(training_frame=cross_h2o,
                               x=cross_h2o[0:57],
                               k=ncent,
                               user_points=init_centers,
                               max_iterations=1)
        print(cross2_km)

        print("Check k-means converged or maximum iterations reached")
        c1 = h2o.H2OFrame(cross1_km.centers())
        c2 = h2o.H2OFrame(cross2_km.centers())
        avg_change = old_div(((c1 - c2)**2).sum(), ncent)
        iters = cross1_km._model_json['output']['model_summary'].cell_values[
            0][3]
        assert avg_change < 1e-6 or iters > miters, "Expected k-means to converge or reach max iterations. avg_change = " \
                                                    "{0} and iterations = {1}".format(avg_change, iters)
    else:
        raise EnvironmentError
def javapredict_gbm_xlarge():

    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_file_name = "/datasets/z_repro.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file_name)

    params = {'ntrees':22, 'max_depth':37, 'min_rows':1, 'sample_rate':0.1} # 651MB pojo
    print("Parameter list:")
    for k,v in zip(list(params.keys()), list(params.values())): print("{0}, {1}".format(k,v))

    train =  h2o.import_file(url)
    test = train[list(range(0,10)),:]
    x = list(range(1,train.ncol))
    y = 0

    pyunit_utils.javapredict("gbm", "numeric", train, test, x, y, **params)
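
# These snippets are test bodies only.  In the H2O pyunit suite each file normally ends with
# a small runner stanza like the sketch below; the sys.path insert and the standalone_test
# helper are assumptions about the usual tests/pyunit_utils layout and may need adjusting.
import sys
sys.path.insert(1, "../../")
from tests import pyunit_utils

if __name__ == "__main__":
    pyunit_utils.standalone_test(javapredict_gbm_xlarge)
else:
    javapredict_gbm_xlarge()
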
Exemple #37
0
def javapredict_drf_xlarge():

    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_file_name = "/datasets/z_repro.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file_name)

    params = {'ntrees':20, 'max_depth':35, 'min_rows':1} # 739MB pojo
    print("Parameter list:")
    for k,v in zip(list(params.keys()), list(params.values())): print("{0}, {1}".format(k,v))

    train =  h2o.import_file(url)
    test = train[list(range(0,10)),:]
    x = list(range(1,train.ncol))
    y = 0

    pyunit_utils.javapredict("random_forest", "numeric", train, test, x, y, **params)
Exemple #38
0
def javapredict_dl_xlarge():

    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_file_name = "/datasets/z_repro.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file_name)

    params = {'hidden':[3500, 3500], 'epochs':0.0001} # 436MB pojo
    print("Parameter list:")
    for k,v in zip(list(params.keys()), list(params.values())): print("{0}, {1}".format(k,v))

    train =  h2o.import_file(url)
    test = train[list(range(0,10)),:]
    x = list(range(1,train.ncol))
    y = 0

    pyunit_utils.javapredict("deeplearning", "numeric", train, test, x, y, **params)
def pubdev_1431():

    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        airlines_billion_file = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file)
        airlines_billion = h2o.import_file(url)
        airlines_billion[30] = airlines_billion[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion[0:30], y=airlines_billion[30], ntrees=1, distribution="bernoulli", max_depth=1)
        predictions = gbm.predict(airlines_billion)
        csv = os.path.join(os.getcwd(), "delete.csv")
        h2o.download_csv(predictions, csv)
        os.remove(csv)
    else:
        raise EnvironmentError("Not running on H2O internal network.  No access to HDFS.")
Exemple #40
0
def pubdev_1431():

    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        airlines_billion_file = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file)
        airlines_billion = h2o.import_file(url)
        airlines_billion[30] = airlines_billion[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion[0:30], y=airlines_billion[30], ntrees=1, distribution="bernoulli", max_depth=1)
        predictions = gbm.predict(airlines_billion)
        csv = os.path.join(os.getcwd(),"delete.csv")
        h2o.download_csv(predictions,csv)
        os.remove(csv)
    else:
        raise EnvironmentError
Exemple #41
0
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print(
                "Your hive-exec version is too old.  Orc parser test {0} is "
                "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
            pass
        else:
            tol_time = 200  # comparing in ms or ns
            tol_numeric = 1e-5  # tolerance for comparing other numeric fields
            numElements2Compare = 0  # choose number of elements per column to compare.  Save test time.

            hdfs_csv_file1 = "/datasets/orc_parser/csv/balunbal.csv"
            url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file1)
            multi_file_csv1 = h2o.import_file(url_csv1)

            hdfs_csv_file2 = "/datasets/orc_parser/csv/unbalbal.csv"
            url_csv2 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file2)
            multi_file_csv2 = h2o.import_file(url_csv2)

            hdfs_orc_file = "/datasets/orc_parser/synthetic_perfect_separation_orc"

            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            multi_file_orc = h2o.import_file(url_orc)

            # make sure orc multi-file and single big file create same H2O frame
            try:
                assert pyunit_utils.compare_frames(multi_file_orc , multi_file_csv1, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from multiple orc and single orc files are different!"
            except:
                assert pyunit_utils.compare_frames(multi_file_orc , multi_file_csv2, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from multiple orc and single orc files are different!"
    else:
        raise EnvironmentError
def hdfs_basic():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
        hdfs_iris_dir  = "/datasets/runit/iris_test_train"

        #----------------------------------------------------------------------
        # Single file cases.
        #----------------------------------------------------------------------

        print("Testing single file importHDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
        iris_h2o = h2o.import_file(url)
        iris_h2o.head()
        iris_h2o.tail()
        n = iris_h2o.nrow
        print("rows: {0}".format(n))
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)
        assert isinstance(iris_h2o, h2o.H2OFrame), "Wrong type. Expected H2OFrame, but got {0}".format(type(iris_h2o))
        print("Import worked")

        #----------------------------------------------------------------------
        # Directory file cases.
        #----------------------------------------------------------------------

        print("Testing directory importHDFS")
        urls = ["hdfs://{0}{1}/iris_test.csv".format(hdfs_name_node, hdfs_iris_dir),
                "hdfs://{0}{1}/iris_train.csv".format(hdfs_name_node, hdfs_iris_dir)]
        iris_dir_h2o = h2o.import_file(urls)
        iris_dir_h2o.head()
        iris_dir_h2o.tail()
        n = iris_dir_h2o.nrow
        print("rows: {0}".format(n))
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)
        assert isinstance(iris_dir_h2o, h2o.H2OFrame), "Wrong type. Expected H2OFrame, but got {0}".\
            format(type(iris_dir_h2o))
        print("Import worked")
    else:
        raise EnvironmentError
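
# hdfs_basic() lists each file URL explicitly for the directory case.  The multi-file ORC/CSV
# tests elsewhere in this collection instead hand the folder path itself to h2o.import_file,
# which parses every file it finds into one frame.  A minimal sketch of that variant, assuming
# the same /datasets/runit/iris_test_train layout on HDFS:
import h2o

def hdfs_import_iris_folder(hdfs_name_node):
    url_dir = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/runit/iris_test_train")
    iris_dir_h2o = h2o.import_file(url_dir)  # every file in the folder is parsed into one frame
    assert iris_dir_h2o.nrow == 150, "expected 150 rows from the two iris splits"
    return iris_dir_h2o
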
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
            pass
        else:
            mix_folder = "/datasets/orc_csv_same_milsongs"
            url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, mix_folder)
            multi_file_mixed = h2o.import_file(url_csv1)
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
            pass
        else:
            tol_time = 200              # comparing in ms or ns
            tol_numeric = 1e-5          # tolerance for comparing other numeric fields
            numElements2Compare = 0   # choose number of elements per column to compare.  Save test time.

            hdfs_csv_file1 = "/datasets/orc_parser/csv/balunbal.csv"
            url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file1)
            multi_file_csv1 = h2o.import_file(url_csv1)

            hdfs_csv_file2 = "/datasets/orc_parser/csv/unbalbal.csv"
            url_csv2 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file2)
            multi_file_csv2 = h2o.import_file(url_csv2)

            hdfs_orc_file = "/datasets/orc_parser/synthetic_perfect_separation_orc"

            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            multi_file_orc = h2o.import_file(url_orc)

            # make sure orc multi-file and single big file create same H2O frame
            try:
                assert pyunit_utils.compare_frames(multi_file_orc , multi_file_csv1, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from multiple orc and single orc files are different!"
            except:
                assert pyunit_utils.compare_frames(multi_file_orc , multi_file_csv2, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from multiple orc and single orc files are different!"
    else:
        raise EnvironmentError
def hdfs_pubdev_3359_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        hdfs_csv_file = "/datasets/PUBDEV-3359"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        h2oframe_csv = h2o.import_file(url_csv)

        # check that the parser recovered the expected number of rows
        assert h2oframe_csv.nrow == 99998000, \
            "Data should contain 99998000 rows but we parsed: {0} rows!".format(h2oframe_csv.nrow)
    else:
        raise EnvironmentError
def hdfs_import_bigCat():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        numTimes = 10
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        allFiles = ["/datasets/bigCatFiles/tenThousandCat10C.csv", "/datasets/bigCatFiles/hundredThousandCat10C.csv",
                    "/datasets/bigCatFiles/oneMillionCat10C.csv", "/datasets/bigCatFiles/tenThousandCat50C.csv",
                    "/datasets/bigCatFiles/hundredThousandCat50C.csv","/datasets/bigCatFiles/tenThousandCat100C.csv",
                    "/datasets/bigCatFiles/hundredThousandCat100C.csv",
                    "/datasets/bigCatFiles/oneMillionCat50C.csv"]
        reps = [10, 10, 10, 50, 50, 100, 100, 50]

        for ind in range(0,len(allFiles)):  # run tests for 3 different sizes per Tomas request
            print("*******  Parsing file {0} ********".format(allFiles[ind]))
            runPerformance("hdfs://{0}{1}".format(hdfs_name_node, allFiles[ind]), numTimes, reps[ind])

    else:
        raise EnvironmentError
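
# runPerformance() is not shown in this excerpt.  The sketch below is a hypothetical stand-in,
# not the actual pyunit helper: it parses the file numTimes, reports the mean wall-clock parse
# time, and simply carries the third argument through for reporting, since the real meaning of
# the `reps` values above is not visible here.
import time
import h2o

def run_performance_sketch(file_url, numTimes, reps):
    elapsed = []
    for _ in range(numTimes):
        start = time.time()
        frame = h2o.import_file(file_url)
        elapsed.append(time.time() - start)
        h2o.remove(frame)  # free the frame between runs so repeated parses are comparable
    print("{0} (reps={1}): mean parse time over {2} runs = {3:.3f}s".format(
        file_url, reps, numTimes, sum(elapsed) / len(elapsed)))
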
def hdfs_kmeans_converge():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_cross_file = "/datasets/runit/BigCross.data"

        print("Import BigCross.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow
        print("rows: {0}".format(n))
        ncent = 3
        miters = 10

        print("Run k-means with k = {0} and max_iterations = {1}".format(ncent,miters))
        cross1_km = h2o.kmeans(training_frame = cross_h2o, x=cross_h2o[0:57], k = ncent, max_iterations = miters)
        print(cross1_km)

        print("Run k-means with init = final cluster centers and max_iterations = 1")
        init_centers = h2o.H2OFrame(cross1_km.centers())
        cross2_km = h2o.kmeans(training_frame = cross_h2o, x=cross_h2o[0:57], k = ncent, user_points=init_centers,
                               max_iterations = 1)
        print(cross2_km)

        print("Check k-means converged or maximum iterations reached")
        c1 = h2o.H2OFrame(cross1_km.centers())
        c2 = h2o.H2OFrame(cross2_km.centers())
        avg_change = old_div(((c1-c2)**2).sum(), ncent)
        iters = cross1_km._model_json['output']['model_summary'].cell_values[0][3]
        assert avg_change < 1e-6 or iters > miters, "Expected k-means to converge or reach max iterations. avg_change = " \
                                                    "{0} and iterations = {1}".format(avg_change, iters)
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_timestamp_date_orc.py"))
            pass
        else:
            tol_time = 200              # comparing in ms or ns
            tol_numeric = 1e-5          # tolerance for comparing other numeric fields
            numElements2Compare = 100   # choose number of elements per column to compare.  Save test time.

            allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                           "/datasets/orc_parser/orc/orc_split_elim.orc"]

            allCsvFiles = ["/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                           "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                           "/datasets/orc_parser/csv/orc_split_elim.csv"]

            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex])
                url_csv = "hdfs://{0}{1}".format(hdfs_name_node, allCsvFiles[fIndex])
                h2oOrc = h2o.import_file(url_orc)
                h2oCsv = h2o.import_file(url_csv)

                # compare the two frames
                assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                    "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_baddata_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(url_orc, warn_phrase="UserWarning:",
                                                warn_string_of_interest="Skipping field:", in_hdfs=True,
                                                number_of_times=1), "Expect warnings from orc parser for file "+url_orc+"!"

            hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(url_orc, warn_phrase="UserWarning:",
                                                warn_string_of_interest="Skipping field:", in_hdfs=True,
                                                number_of_times=1), "Expect warnings from orc parser for file "+url_orc+"!"

            hdfs_orc_file = "/datasets/orc_parser/orc/nulls-at-end-snappy.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(url_orc, warn_phrase="UserWarning:",
                                                warn_string_of_interest="Long.MIN_VALUE:", in_hdfs=True,
                                                number_of_times=1), "Expect warnings from orc parser for file "+url_orc+"!"

    else:
        raise EnvironmentError
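
# pyunit_utils.expect_warnings() is another helper whose internals are not shown here.  The
# warn_phrase="UserWarning:" arguments above suggest the client surfaces these parse problems
# as Python UserWarnings; assuming that, a generic standard-library way to make the same kind
# of check looks like the sketch below (not the helper's actual implementation).
import warnings
import h2o

def count_matching_warnings(url, phrase):
    """Import `url` while recording warnings; count UserWarnings mentioning `phrase`."""
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        h2o.import_file(url)
    return sum(1 for w in caught
               if issubclass(w.category, UserWarning) and phrase in str(w.message))

# e.g. expect at least one "Skipping field:" warning for the bad-data ORC file:
# assert count_matching_warnings(url_orc, "Skipping field:") >= 1
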
def hdfs_kmeans_airlines():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_file = "/datasets/airlines_all.csv"

        print("Import airlines_all.csv from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file)
        airlines_h2o = h2o.import_file(url)
        n = airlines_h2o.nrow
        print("rows: {0}".format(n))

        print("Run k-means++ with k = 7 and max_iterations = 10")
        myX = list(range(8)) + list(range(11,16)) + list(range(18,21)) + list(range(24,29)) + [9]
        airlines_km = h2o.kmeans(training_frame = airlines_h2o, x = airlines_h2o[myX], k = 7, init = "Furthest",
                                 max_iterations = 10, standardize = True)
        print(airlines_km)
    else:
        raise EnvironmentError
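
# h2o.kmeans() above is the older module-level convenience API.  The same fit can be written
# with the estimator-style interface; a sketch, reusing the frame and the column indices from
# hdfs_kmeans_airlines() (parameter names per the H2OKMeansEstimator API).
from h2o.estimators.kmeans import H2OKMeansEstimator

def kmeans_estimator_style(airlines_h2o, myX):
    km = H2OKMeansEstimator(k=7, init="Furthest", max_iterations=10, standardize=True)
    km.train(x=myX, training_frame=airlines_h2o)  # train() accepts column indices or names
    return km
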
def hdfs_kmeans():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
        hdfs_covtype_file = "/datasets/runit/covtype.data"

        print("Import iris_wheader.csv from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
        iris_h2o = h2o.import_file(url)
        n = iris_h2o.nrow
        print("rows: {0}".format(n))
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)

        print("Running KMeans on iris")
        iris_km = h2o.kmeans(training_frame = iris_h2o, k = 3, x = iris_h2o[0:4], max_iterations = 10)
        print(iris_km)

        print("Importing covtype.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_covtype_file)
        covtype_h2o = h2o.import_file(url)
        n = covtype_h2o.nrow
        print("rows: {0}".format(n))
        assert n == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 581012)

        print("Running KMeans on covtype")
        covtype_km = h2o.kmeans(training_frame = covtype_h2o, x = covtype_h2o[0:55], k = 8, max_iterations = 10)
        print(covtype_km)

    else:
        raise EnvironmentError
def import_folder_orc():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:

        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
            pass
        else:

            hdfs_orc_file = "/datasets/orc_parser/prostate_NA.orc"
            hdfs_csv_file = "/datasets/orc_parser/prostate_NA.csv"

            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            csv = h2o.import_file(url_csv, na_strings=['\\N'])
            multi_file_orc1 = h2o.import_file(url_orc)
            pyunit_utils.compare_frames_local(csv, multi_file_orc1, prob=1)  # should be the same here.

            path = url_orc
            skip_all = list(range(csv.ncol))
            skip_even = list(range(0, csv.ncol, 2))
            skip_odd = list(range(1, csv.ncol, 2))
            skip_start_end = [0, csv.ncol - 1]
            skip_except_last = list(range(0, csv.ncol - 2))
            skip_except_first = list(range(1, csv.ncol))
            temp = list(range(0, csv.ncol))
            random.shuffle(temp)
            skip_random = []
            for index in range(0, csv.ncol // 2):  # integer division; csv.ncol / 2 is a float in Python 3
                skip_random.append(temp[index])
            skip_random.sort()

            try:
                loadFileSkipAll = h2o.upload_file(path, skipped_columns=skip_all)
                sys.exit(1)  # should have failed here
            except Exception:  # only swallow the expected parse failure, not SystemExit
                pass

            try:
                importFileSkipAll = h2o.import_file(path, skipped_columns=skip_all)
                sys.exit(1)  # should have failed here
            except Exception:
                pass

            # skip even columns
            pyunit_utils.checkCorrectSkips(csv, path, skip_even)

            # skip odd columns
            pyunit_utils.checkCorrectSkips(csv, path, skip_odd)

            # skip the very beginning and the very end.
            pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)

            # skip all except the last column
            pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)

            # skip all except the very first column
            pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)

            # randomly skipped half the columns
            pyunit_utils.checkCorrectSkips(csv, path, skip_random)
    else:
        raise EnvironmentError
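
# pyunit_utils.checkCorrectSkips() is not shown in this excerpt.  Conceptually it re-imports
# the file with skipped_columns and verifies that the surviving columns line up with the full
# frame.  The sketch below is an assumed, simplified version of that idea, not the real helper.
import h2o

def check_correct_skips_sketch(full_frame, path, skipped):
    partial = h2o.import_file(path, skipped_columns=skipped)
    kept = [i for i in range(full_frame.ncol) if i not in skipped]
    assert partial.ncol == len(kept), "unexpected number of surviving columns"
    for new_idx, old_idx in enumerate(kept):
        assert partial.names[new_idx] == full_frame.names[old_idx], \
            "column {0} did not map to original column {1}".format(new_idx, old_idx)
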
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5

        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
            pass
        else:

            hdfs_orc_file = "/datasets/airlines_all_orc_parts"
            hdfs_csv_file = "/datasets/air_csv_part"

            col_types = ['real', 'real', 'real', 'real', 'real', 'real', 'real', 'real', 'enum', 'real', 'enum', 'real',
                         'real', 'enum', 'real', 'real', 'enum', 'enum', 'real', 'enum', 'enum', 'real', 'real', 'real',
                         'enum', 'enum', 'enum', 'enum', 'enum', 'enum', 'enum']

            # import CSV file
            print("Import airlines 116M dataset in original csv format from HDFS")
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'], col_types=col_types)
            endcsv = time.time()

            startcsv1 = time.time()
            multi_file_csv1 = h2o.import_file(url_csv)
            endcsv1 = time.time()
            h2o.remove(multi_file_csv1)

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            # import ORC file with same column types as CSV file
            print("Import airlines 116M dataset in ORC format from HDFS")
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)

            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)

            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
            endorc = time.time()

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            print("************** CSV (without column type forcing) parse time is {0}".format(endcsv1-startcsv1))
            print("************** CSV (with column type forcing) parse time is {0}".format(endcsv-startcsv))
            print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1))
            print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc))

            # compare the summaries of the frames parsed from ORC and CSV with forced column types
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)

    else:
        raise EnvironmentError
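
# The four start/end time pairs above repeat the same bookkeeping.  A small context manager
# tidies that up; time.perf_counter() is the better clock for elapsed-interval measurements.
# This is a generic sketch, not part of the original test.
import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    start = time.perf_counter()
    yield
    print("************** {0} parse time is {1:.2f}s".format(label, time.perf_counter() - start))

# usage:
# with timed("ORC (with column type forcing)"):
#     multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
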