def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
          "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            multi_file_csv = h2o.import_file(url_csv)
            multi_file_orc = h2o.import_file(url_orc)

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
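# Note: each of these test snippets assumes the module-level setup that h2o-3 pyunit tests
# normally carry. A minimal sketch of the assumed imports and the standard runner stanza
# (the sys.path entry is an assumption about where the file sits in the test tree):
import sys
sys.path.insert(1, "../../../")
import time
import random
import h2o
from tests import pyunit_utils

if __name__ == "__main__":
    pyunit_utils.standalone_test(hdfs_orc_parser)
else:
    hdfs_orc_parser()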
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        # run a quick test to determine if the hive-exec is too old.

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_prostate_orc.py"))
            pass
        else:

            tol_time = 200              # comparing in ms or ns
            tol_numeric = 1e-5          # tolerance for comparing other numeric fields
            numElements2Compare = 10   # choose number of elements per column to compare.  Save test time.

            hdfs_orc_file = "/datasets/orc_parser/orc/prostate_NA.orc"
            hdfs_csv_file = "/datasets/orc_parser/csv/prostate_NA.csv"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            h2oOrc = h2o.import_file(url_orc)
            h2oCsv = h2o.import_file(url_csv)

            # compare the two frames
            assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_iris_import_types_orc.py"))
            pass
        else:

            numElements2Compare = 100
            tol_time = 200
            tol_numeric = 1e-5

            hdfs_orc_file = "/datasets/orc_parser/orc/iris.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/csv/iris.csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            h2oframe_csv = h2o.import_file(url_csv)
            data_types = ['real', 'real', 'real', 'real', 'enum']
            h2oframe_orc = h2o.import_file(url_orc, col_types = data_types)

            # compare the two frames
            assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare, tol_time, tol_numeric,
                                               True), "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            multi_file_csv = h2o.import_file(url_csv)
            multi_file_orc = h2o.import_file(url_orc)

            multi_file_csv.summary()
            csv_summary = h2o.frame(
                multi_file_csv.frame_id)["frames"][0]["columns"]

            multi_file_orc.summary()
            orc_summary = h2o.frame(
                multi_file_orc.frame_id)["frames"][0]["columns"]

            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_airline_05_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/air05_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/air05_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
            endcsv = time.time()

            csv_type_dict = multi_file_csv.types

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            col_ind_name = dict()
            # build a positional list of the CSV column types so they can be forced onto the ORC import
            for key_name in list(csv_type_dict):
                col_ind = key_name.split('C')
                new_ind = int(col_ind[1]) - 1
                col_ind_name[new_ind] = key_name

            col_types = []
            for ind in range(len(col_ind_name)):
                col_types.append(csv_type_dict[col_ind_name[ind]])

            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)

            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
            endorc = time.time()

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            print("************** CSV parse time is {0}".format(endcsv-startcsv))
            print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1))
            print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc))
            # compare the frame parsed from ORC (with forced column types) against the CSV summary
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
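# The index-mapping loop above rebuilds a positional col_types list from H2O's {'C1': type, ...}
# dict so the CSV-detected types can be forced onto the ORC import. A minimal standalone
# illustration of that mapping (the sample dict below is hypothetical):
def ordered_col_types(type_dict):
    # 'C<k>' keys are 1-based column names; emit their types in column order
    by_index = {int(name[1:]) - 1: col_type for name, col_type in type_dict.items()}
    return [by_index[i] for i in range(len(by_index))]

# ordered_col_types({'C1': 'int', 'C3': 'enum', 'C2': 'real'}) -> ['int', 'real', 'enum']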
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5

        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_orc_parser.py"))
            pass
        else:

            allOrcFiles = [
                "/datasets/orc_parser/orc/TestOrcFile.columnProjection.orc",
                "/datasets/orc_parser/orc/bigint_single_col.orc",
                "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc",
                "/datasets/orc_parser/orc/bool_single_col.orc",
                "/datasets/orc_parser/orc/demo-11-zlib.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                "/datasets/orc_parser/orc/demo-12-zlib.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                "/datasets/orc_parser/orc/double_single_col.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV11.orc",
                "/datasets/orc_parser/orc/float_single_col.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV12.orc",
                "/datasets/orc_parser/orc/int_single_col.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testPredicatePushdown.orc",
                "/datasets/orc_parser/orc/nulls-at-end-snappy.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testSnappy.orc",
                "/datasets/orc_parser/orc/orc_split_elim.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testStripeLevelStats.orc",
                "/datasets/orc_parser/orc/smallint_single_col.orc",
                "/datasets/orc_parser/orc/string_single_col.orc",
                "/datasets/orc_parser/orc/tinyint_single_col.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testWithoutIndex.orc"
            ]

            # simply verify that each ORC file can be parsed without error
            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node,
                                                 allOrcFiles[fIndex])
                tab_test = h2o.import_file(url_orc)
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format(
                      "pyunit_INTERNAL_HDFS_timestamp_date_orc.py"))
            pass
        else:
            origTZ = h2o.cluster().timezone
            newZone = 'America/Los_Angeles'
            h2o.cluster().timezone = newZone
            tol_time = 200  # comparing in ms or ns
            tol_numeric = 1e-5  # tolerance for comparing other numeric fields
            numElements2Compare = 100  # choose number of elements per column to compare.  Save test time.

            allOrcFiles = [
                "/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                "/datasets/orc_parser/orc/orc_split_elim.orc"
            ]

            allCsvFiles = [
                "/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                "/datasets/orc_parser/csv/orc_split_elim.csv"
            ]

            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node,
                                                 allOrcFiles[fIndex])
                url_csv = "hdfs://{0}{1}".format(hdfs_name_node,
                                                 allCsvFiles[fIndex])
                h2oOrc = h2o.import_file(url_orc)
                h2oCsv = h2o.import_file(url_csv)

                # compare the two frames
                assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                    "H2O frame parsed from orc and csv files are different!"

            h2o.cluster().timezone = origTZ
    else:
        raise EnvironmentError
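# A variant worth noting (not part of the original test): the timezone restore above only runs
# when every assertion passes. A try/finally sketch would put the cluster back into its original
# timezone even if a comparison fails:
def with_cluster_timezone(tz, body):
    # temporarily switch the H2O cluster timezone, run body(), then restore the old setting
    orig_tz = h2o.cluster().timezone
    h2o.cluster().timezone = tz
    try:
        body()
    finally:
        h2o.cluster().timezone = orig_tz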
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_baddata_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(
                url_orc,
                warn_phrase="UserWarning:",
                warn_string_of_interest="Skipping field:",
                in_hdfs=True,
                number_of_times=1
            ), "Expect warnings from orc parser for file " + url_orc + "!"

            hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(
                url_orc,
                warn_phrase="UserWarning:",
                warn_string_of_interest="Skipping field:",
                in_hdfs=True,
                number_of_times=1
            ), "Expect warnings from orc parser for file " + url_orc + "!"

            hdfs_orc_file = "/datasets/orc_parser/orc/nulls-at-end-snappy.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(
                url_orc,
                warn_phrase="UserWarning:",
                warn_string_of_interest="Long.MIN_VALUE:",
                in_hdfs=True,
                number_of_times=1
            ), "Expect warnings from orc parser for file " + url_orc + "!"

    else:
        raise EnvironmentError
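# pyunit_utils.expect_warnings is an internal test helper; the sketch below only shows the
# general idea of counting parser warnings with the standard library (an assumption about the
# approach, not the helper's actual implementation):
import warnings

def count_matching_warnings(url, phrase):
    # import the file while recording warnings, then count those whose message contains phrase
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        h2o.import_file(url)
    return sum(1 for w in caught if phrase in str(w.message))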
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5

        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_orc_parser.py"))
            pass
        else:

            allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.columnProjection.orc",
                           "/datasets/orc_parser/orc/bigint_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc",
                           "/datasets/orc_parser/orc/bool_single_col.orc",
                           "/datasets/orc_parser/orc/demo-11-zlib.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                           "/datasets/orc_parser/orc/demo-12-zlib.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                           "/datasets/orc_parser/orc/double_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV11.orc",
                           "/datasets/orc_parser/orc/float_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV12.orc",
                           "/datasets/orc_parser/orc/int_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testPredicatePushdown.orc",
                           "/datasets/orc_parser/orc/nulls-at-end-snappy.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testSnappy.orc",
                           "/datasets/orc_parser/orc/orc_split_elim.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testStripeLevelStats.orc",
                           "/datasets/orc_parser/orc/smallint_single_col.orc",
                           "/datasets/orc_parser/orc/string_single_col.orc",
                           "/datasets/orc_parser/orc/tinyint_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testWithoutIndex.orc"]


            # simply verify that each ORC file can be parsed without error
            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex])
                tab_test = h2o.import_file(url_orc)
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print(
                "Your hive-exec version is too old.  Orc parser test {0} is "
                "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
            pass
        else:
            tol_time = 200  # comparing in ms or ns
            tol_numeric = 1e-5  # tolerance for comparing other numeric fields
            numElements2Compare = 0  # choose number of elements per column to compare.  Save test time.

            hdfs_csv_file1 = "/datasets/orc_parser/csv/balunbal.csv"
            url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file1)
            multi_file_csv1 = h2o.import_file(url_csv1)

            hdfs_csv_file2 = "/datasets/orc_parser/csv/unbalbal.csv"
            url_csv2 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file2)
            multi_file_csv2 = h2o.import_file(url_csv2)

            hdfs_orc_file = "/datasets/orc_parser/synthetic_perfect_separation_orc"

            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            multi_file_orc = h2o.import_file(url_orc)

            # the merged multi-file ORC frame should match one of the two CSV row orderings
            try:
                assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv1, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from multiple ORC files differs from the CSV reference!"
            except:
                assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv2, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from multiple ORC files differs from the CSV reference!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
            pass
        else:
            mix_folder = "/datasets/orc_csv_same_milsongs"
            url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, mix_folder)
            multi_file_mixed = h2o.import_file(url_csv1)
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
            pass
        else:
            mix_folder = "/datasets/milsongs_orc_air_csv"
            url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, mix_folder)
            multi_file_mixed = h2o.import_file(url_csv1)
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
            pass
        else:
            tol_time = 200              # comparing in ms or ns
            tol_numeric = 1e-5          # tolerance for comparing other numeric fields
            numElements2Compare = 0   # choose number of elements per column to compare.  Save test time.

            hdfs_csv_file1 = "/datasets/orc_parser/csv/balunbal.csv"
            url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file1)
            multi_file_csv1 = h2o.import_file(url_csv1)

            hdfs_csv_file2 = "/datasets/orc_parser/csv/unbalbal.csv"
            url_csv2 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file2)
            multi_file_csv2 = h2o.import_file(url_csv2)

            hdfs_orc_file = "/datasets/orc_parser/synthetic_perfect_separation_orc"

            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            multi_file_orc = h2o.import_file(url_orc)

            # the merged multi-file ORC frame should match one of the two CSV row orderings
            try:
                assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv1, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from multiple ORC files differs from the CSV reference!"
            except:
                assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv2, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from multiple ORC files differs from the CSV reference!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_timestamp_date_orc.py"))
            pass
        else:
            tol_time = 200              # comparing in ms or ns
            tol_numeric = 1e-5          # tolerance for comparing other numeric fields
            numElements2Compare = 100   # choose number of elements per column to compare.  Save test time.

            allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                           "/datasets/orc_parser/orc/orc_split_elim.orc"]

            allCsvFiles = ["/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                           "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                           "/datasets/orc_parser/csv/orc_split_elim.csv"]

            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex])
                url_csv = "hdfs://{0}{1}".format(hdfs_name_node, allCsvFiles[fIndex])
                h2oOrc = h2o.import_file(url_orc)
                h2oCsv = h2o.import_file(url_csv)

                # compare the two frames
                assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                    "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_baddata_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(url_orc, warn_phrase="UserWarning:",
                                                warn_string_of_interest="Skipping field:", in_hdfs=True,
                                                number_of_times=1), "Expect warnings from orc parser for file "+url_orc+"!"

            hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(url_orc, warn_phrase="UserWarning:",
                                                warn_string_of_interest="Skipping field:", in_hdfs=True,
                                                number_of_times=1), "Expect warnings from orc parser for file "+url_orc+"!"

            hdfs_orc_file = "/datasets/orc_parser/orc/nulls-at-end-snappy.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(url_orc, warn_phrase="UserWarning:",
                                                warn_string_of_interest="Long.MIN_VALUE:", in_hdfs=True,
                                                number_of_times=1), "Expect warnings from orc parser for file "+url_orc+"!"

    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        # run a quick test to determine if the hive-exec is too old.

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_prostate_orc.py"))
            pass
        else:

            tol_time = 200  # comparing in ms or ns
            tol_numeric = 1e-5  # tolerance for comparing other numeric fields
            numElements2Compare = 10  # choose number of elements per column to compare.  Save test time.

            hdfs_orc_file = "/datasets/orc_parser/orc/prostate_NA.orc"
            hdfs_csv_file = "/datasets/orc_parser/csv/prostate_NA.csv"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            h2oOrc = h2o.import_file(url_orc)
            h2oCsv = h2o.import_file(url_csv)

            # compare the two frames
            assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5

        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
            pass
        else:

            hdfs_orc_file = "/datasets/airlines_all_orc_parts"
            hdfs_csv_file = "/datasets/air_csv_part"

            col_types = [
                'real', 'real', 'real', 'real', 'real', 'real', 'real', 'real',
                'enum', 'real', 'enum', 'real', 'real', 'enum', 'real', 'real',
                'enum', 'enum', 'real', 'enum', 'enum', 'real', 'real', 'real',
                'enum', 'enum', 'enum', 'enum', 'enum', 'enum', 'enum'
            ]

            # import CSV file
            print(
                "Import airlines 116M dataset in original csv format from HDFS"
            )
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv,
                                             na_strings=['\\N'],
                                             col_types=col_types)
            endcsv = time.time()

            startcsv1 = time.time()
            multi_file_csv1 = h2o.import_file(url_csv)
            endcsv1 = time.time()
            h2o.remove(multi_file_csv1)

            multi_file_csv.summary()
            csv_summary = h2o.frame(
                multi_file_csv.frame_id)["frames"][0]["columns"]

            # import ORC file with same column types as CSV file
            print("Import airlines 116M dataset in ORC format from HDFS")
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)

            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)

            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
            endorc = time.time()

            multi_file_orc.summary()
            orc_summary = h2o.frame(
                multi_file_orc.frame_id)["frames"][0]["columns"]

            print(
                "************** CSV (without column type forcing) parse time is {0}"
                .format(endcsv1 - startcsv1))
            print(
                "************** CSV (with column type forcing) parse time is {0}"
                .format(endcsv - startcsv))
            print(
                "************** ORC (without column type forcing) parse time is {0}"
                .format(endorc1 - startorc1))
            print(
                "************** ORC (with column type forcing) parse time is {0}"
                .format(endorc - startorc))

            # compare the frame parsed from ORC (with forced column types) against the CSV summary
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)

    else:
        raise EnvironmentError
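# The repeated time.time() bookkeeping in the timing comparison above could be factored into a
# small helper (a sketch, not part of the original test):
def timed_import(url, **kwargs):
    # import a file into H2O and return the frame together with the elapsed wall-clock seconds
    start = time.time()
    frame = h2o.import_file(url, **kwargs)
    return frame, time.time() - start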
def import_folder_orc():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:

        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
            pass
        else:

            hdfs_orc_file = "/datasets/orc_parser/prostate_NA.orc"
            hdfs_csv_file = "/datasets/orc_parser/prostate_NA.csv"

    url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
    url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
    csv = h2o.import_file(url_csv, na_strings=['\\N'])
    multi_file_orc1 = h2o.import_file(url_orc)
    pyunit_utils.compare_frames_local(csv, multi_file_orc1, prob=1)  # should be the same here.

    path = url_orc
    skip_all = list(range(csv.ncol))
    skip_even = list(range(0, csv.ncol, 2))
    skip_odd = list(range(1, csv.ncol, 2))
    skip_start_end = [0, csv.ncol - 1]
    skip_except_last = list(range(0, csv.ncol - 2))
    skip_except_first = list(range(1, csv.ncol))
    temp = list(range(0, csv.ncol))
    random.shuffle(temp)
    skip_random = []
    for index in range(0, csv.ncol / 2):
        skip_random.append(temp[index])
    skip_random.sort()

    try:
        loadFileSkipAll = h2o.upload_file(path, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except:
        pass

    try:
        importFileSkipAll = h2o.import_file(path, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except:
        pass

    # skip even columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_odd)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_random)
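# The bare try/except + sys.exit(1) blocks above assert that importing with every column skipped
# must fail. An equivalent, more explicit sketch (assuming H2O raises an exception in that case):
def assert_skip_all_fails(path, ncols):
    try:
        h2o.import_file(path, skipped_columns=list(range(ncols)))
    except Exception:
        return  # expected: parsing with all columns skipped is rejected
    raise AssertionError("import with all columns skipped should have failed")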
def import_folder_orc():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:

        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
            pass
        else:

            hdfs_orc_file = "/datasets/orc_parser/prostate_NA.orc"
            hdfs_csv_file = "/datasets/orc_parser/prostate_NA.csv"

    url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
    url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
    csv = h2o.import_file(url_csv, na_strings=['\\N'])
    multi_file_orc1 = h2o.import_file(url_orc)
    pyunit_utils.compare_frames_local(csv, multi_file_orc1,
                                      prob=1)  # should be the same here.

    path = url_orc
    skip_all = list(range(csv.ncol))
    skip_even = list(range(0, csv.ncol, 2))
    skip_odd = list(range(1, csv.ncol, 2))
    skip_start_end = [0, csv.ncol - 1]
    skip_except_last = list(range(0, csv.ncol - 2))
    skip_except_first = list(range(1, csv.ncol))
    temp = list(range(0, csv.ncol))
    random.shuffle(temp)
    skip_random = []
    for index in range(0, csv.ncol / 2):
        skip_random.append(temp[index])
    skip_random.sort()

    try:
        loadFileSkipAll = h2o.upload_file(path, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except:
        pass

    try:
        importFileSkipAll = h2o.import_file(path, skipped_columns=skip_all)
        sys.exit(1)  # should have failed here
    except:
        pass

    # skip even columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_even)

    # skip odd columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_odd)

    # skip the very beginning and the very end.
    pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)

    # skip all except the last column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)

    # skip all except the very first column
    pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)

    # randomly skipped half the columns
    pyunit_utils.checkCorrectSkips(csv, path, skip_random)
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5

        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
            pass
        else:

            hdfs_orc_file = "/datasets/airlines_all_orc_parts"
            hdfs_csv_file = "/datasets/air_csv_part"

            col_types = ['real', 'real', 'real', 'real', 'real', 'real', 'real', 'real', 'enum', 'real', 'enum', 'real',
                         'real', 'enum', 'real', 'real', 'enum', 'enum', 'real', 'enum', 'enum', 'real', 'real', 'real',
                         'enum', 'enum', 'enum', 'enum', 'enum', 'enum', 'enum']

            # import CSV file
            print("Import airlines 116M dataset in original csv format from HDFS")
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'], col_types=col_types)
            endcsv = time.time()

            startcsv1 = time.time()
            multi_file_csv1 = h2o.import_file(url_csv)
            endcsv1 = time.time()
            h2o.remove(multi_file_csv1)

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            # import ORC file with same column types as CSV file
            print("Import airlines 116M dataset in ORC format from HDFS")
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)

            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)

            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
            endorc = time.time()

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            print("************** CSV (without column type forcing) parse time is {0}".format(endcsv1-startcsv1))
            print("************** CSV (with column type forcing) parse time is {0}".format(endcsv-startcsv))
            print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1))
            print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc))

            # compare the frame parsed from ORC (with forced column types) against the CSV summary
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)

    else:
        raise EnvironmentError