Code Example #1
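All of the examples in this section are H2O pyunit tests and assume the same preamble of imports plus a standalone-test hookup. A minimal sketch of that assumed preamble (the pyunit_utils import path and sys.path tweak are assumptions based on the h2o-3 repository layout):

import sys
import time

import h2o

sys.path.insert(1, "../../../")  # assumption: test file lives three levels below the h2o-3 repo root
from tests import pyunit_utils  # h2o-3 pyunit helpers (assumed import path)

# When run standalone, each test function is hooked up roughly like this:
# if __name__ == "__main__":
#     pyunit_utils.standalone_test(hdfs_orc_parser)
# else:
#     hdfs_orc_parser()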
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
          "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            multi_file_csv = h2o.import_file(url_csv)
            multi_file_orc = h2o.import_file(url_orc)

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError("Hadoop namenode is not accessible")
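Every example repeats the same two-step idiom: call .summary() to refresh the rollup statistics, then read the per-column summaries out of the raw REST payload returned by h2o.frame(). A small helper would remove the repetition; a sketch (frame_summary is a hypothetical name, not part of pyunit_utils):

def frame_summary(frame):
    # refresh rollup statistics, then pull the per-column summary list
    # out of the raw /3/Frames payload that h2o.frame() returns
    frame.summary()
    return h2o.frame(frame.frame_id)["frames"][0]["columns"]

With it, each comparison collapses to pyunit_utils.compare_frame_summary(frame_summary(multi_file_csv), frame_summary(multi_file_orc)).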
Code Example #2
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        multi_file_csv = h2o.import_file(url_csv)
        multi_file_orc = h2o.import_file(url_orc)

        multi_file_csv.summary()
        csv_summary = h2o.frame(
            multi_file_csv.frame_id)["frames"][0]["columns"]

        multi_file_orc.summary()
        orc_summary = h2o.frame(
            multi_file_orc.frame_id)["frames"][0]["columns"]

        pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError("Hadoop namenode is not accessible")
Code Example #3
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        hdfs_orc_file = "/datasets/orc_parser/air05_orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        hdfs_csv_file = "/datasets/orc_parser/air05_csv"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        startcsv = time.time()
        multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
        endcsv = time.time()

        csv_type_dict = multi_file_csv.types

        multi_file_csv.summary()
        csv_summary = h2o.frame(
            multi_file_csv.frame_id)["frames"][0]["columns"]

        col_ind_name = dict()
        # map column index to column name (H2O auto-names columns C1, C2, ...)
        for key_name in list(csv_type_dict):
            col_ind = key_name.split('C')
            new_ind = int(col_ind[1]) - 1
            col_ind_name[new_ind] = key_name

        col_types = []
        for ind in range(len(col_ind_name)):
            col_types.append(csv_type_dict[col_ind_name[ind]])

        startorc1 = time.time()
        multi_file_orc1 = h2o.import_file(url_orc)
        endorc1 = time.time()
        h2o.remove(multi_file_orc1)

        startorc = time.time()
        multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
        endorc = time.time()

        multi_file_orc.summary()
        orc_summary = h2o.frame(
            multi_file_orc.frame_id)["frames"][0]["columns"]

        print("************** CSV parse time is {0}".format(endcsv - startcsv))
        print(
            "************** ORC (without column type forcing) parse time is {0}"
            .format(endorc1 - startorc1))
        print(
            "************** ORC (with column type forcing) parse time is {0}".
            format(endorc - startorc))
        # compare frame read by orc by forcing column type,
        pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError("Hadoop namenode is not accessible")
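The index-mapping loop in the example above relies on H2O's auto-generated column names C1, C2, ..., Cn. Under that assumption the ordered col_types list can be built in one line (a sketch, equivalent only while every column keeps its default name):

# csv_type_dict maps auto-generated names ("C1", "C2", ...) to parsed types
col_types = [csv_type_dict["C{0}".format(i + 1)] for i in range(len(csv_type_dict))]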
Code Example #4
def import_folder():

    tol_time = 200  # tolerance in ms or ns when comparing timestamp columns
    tol_numeric = 1e-5  # tolerance for comparing other numeric fields
    numElements2Compare = 0  # number of elements per column to compare; kept small to save test time

    multi_file_csv = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/parser/hexdev_497/airlines_first_header"))
    multi_file_gzip_comp = \
      h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

    try:
        # make sure the two frames agree
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare, tol_time,
                                           tol_numeric, True), "H2O frames parsed from the csv directory and the " \
                                                               "zip archive are different!"
    except Exception:  # in case the files are listed differently, we can always just check that the summaries agree.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(
            multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

        multi_file_csv.summary()
        csv_summary = h2o.frame(
            multi_file_csv.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
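The try/except pattern above recurs in later examples: an element-wise comparison first, falling back to a summary comparison when the directory listing order differs between imports. A reusable sketch (frames_match is a hypothetical helper, not part of pyunit_utils):

def frames_match(frame_a, frame_b, num_elements, tol_time, tol_numeric):
    # element-wise comparison first; if the files were listed in a different
    # order the row order differs, so fall back to comparing column summaries
    try:
        assert pyunit_utils.compare_frames(frame_a, frame_b, num_elements,
                                           tol_time, tol_numeric, True), "frames are different!"
    except Exception:
        frame_a.summary()
        summary_a = h2o.frame(frame_a.frame_id)["frames"][0]["columns"]
        frame_b.summary()
        summary_b = h2o.frame(frame_b.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(summary_a, summary_b)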
Code Example #5
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(
                hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            multi_file_csv = h2o.import_file(url_csv)
            multi_file_orc = h2o.import_file(url_orc)

            multi_file_csv.summary()
            csv_summary = h2o.frame(
                multi_file_csv.frame_id)["frames"][0]["columns"]

            multi_file_orc.summary()
            orc_summary = h2o.frame(
                multi_file_orc.frame_id)["frames"][0]["columns"]

            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError("Hadoop namenode is not accessible")
Code Example #6
def import_folder():

    tol_time = 200  # tolerance in ms or ns when comparing timestamp columns
    tol_numeric = 1e-5  # tolerance for comparing other numeric fields
    numElements2Compare = 100  # number of elements per column to compare; kept small to save test time

    # zip archive of the whole directory of csv files.
    multi_file_gzip_comp = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/hexdev_497/milsongs_csv.zip"))

    # directory containing the gzip-compressed versions of the csv files.
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/hexdev_497/milsongs_csv_gzip"))

    try:
        # make sure the two frames agree
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare, tol_time,
                                           tol_numeric, True), "H2O frames parsed from the gzip csv directory and " \
                                                               "the zip archive are different!"
    except Exception:  # in case the files are listed differently, we can always just check that the summaries agree.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(
            multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

        multi_file_csv.summary()
        csv_summary = h2o.frame(
            multi_file_csv.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
Code Example #7
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_airline_05_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/air05_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/air05_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
            endcsv = time.time()

            csv_type_dict = multi_file_csv.types

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            col_ind_name = dict()
            # map column index to column name (H2O auto-names columns C1, C2, ...)
            for key_name in list(csv_type_dict):
                col_ind = key_name.split('C')
                new_ind = int(col_ind[1]) - 1
                col_ind_name[new_ind] = key_name

            col_types = []
            for ind in range(len(col_ind_name)):
                col_types.append(csv_type_dict[col_ind_name[ind]])

            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)

            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
            endorc = time.time()

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            print("************** CSV parse time is {0}".format(endcsv-startcsv))
            print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1))
            print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc))
            # compare frame read by orc by forcing column type,
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError("Hadoop namenode is not accessible")
Code Example #8
def import_folder():
  multi_file_csv = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header"))
  multi_file_gzip_comp = \
    h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

  multi_file_gzip_comp.summary()
  zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

  multi_file_csv.summary()
  csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
  pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
Code Example #9
def import_folder():
    """
    This test will build a H2O frame from importing the bigdata/laptop/parser/orc/airlines_05p_orc_csv
    from and build another H2O frame from the multi-file orc parser using multiple orc files that are
    saved in the directory bigdata/laptop/parser/orc/airlines_05p_orc.  It will compare the two frames
    to make sure they are equal.
    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    startcsv = time.time()
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/orc/pubdev_3200/air05_csv"),
                                     na_strings=['\\N'])
    endcsv = time.time()

    csv_type_dict = multi_file_csv.types

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

    col_ind_name = dict()
    # map column index to column name (H2O auto-names columns C1, C2, ...)
    for key_name in list(csv_type_dict):
        col_ind = key_name.split('C')
        new_ind = int(col_ind[1]) - 1
        col_ind_name[new_ind] = key_name

    col_types = []
    for ind in range(len(col_ind_name)):
        col_types.append(csv_type_dict[col_ind_name[ind]])

    startorc1 = time.time()
    multi_file_orc1 = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/orc/pubdev_3200/air05_orc"))
    endorc1 = time.time()
    h2o.remove(multi_file_orc1)

    startorc = time.time()
    multi_file_orc = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/orc/pubdev_3200/air05_orc"),
                                     col_types=col_types)
    endorc = time.time()

    multi_file_orc.summary()
    orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

    print("************** CSV parse time is {0}".format(endcsv - startcsv))
    print("************** ORC (without column type forcing) parse time is {0}".
          format(endorc1 - startorc1))
    print("************** ORC (with column type forcing) parse time is {0}".
          format(endorc - startorc))
    # compare frame read by orc by forcing column type,
    pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
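The paired time.time() calls that bracket every import in the example above can be folded into a tiny context manager; a sketch (timed is hypothetical, not part of the test utilities):

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # print wall-clock time for the enclosed block, in the same format as above
    start = time.time()
    yield
    print("************** {0} parse time is {1}".format(label, time.time() - start))

so that, for instance:

with timed("CSV"):
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_csv"),
                                     na_strings=['\\N'])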
Code Example #10
def import_folder():
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/parser/hexdev_497/airlines_first_header"))
    multi_file_gzip_comp = \
      h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

    multi_file_gzip_comp.summary()
    zip_summary = h2o.frame(
        multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
    pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
Code Example #11
def parquet_parse_simple():
    """
    Tests Parquet parser by comparing the summary of the original csv frame with the h2o parsed Parquet frame.
    Basic use case of importing files with auto-detection of column types.
    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    csv = h2o.import_file(path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    parquet = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/parquet/airlines-simple.snappy.parquet"))

    csv.summary()
    csv_summary = h2o.frame(csv.frame_id)["frames"][0]["columns"]

    parquet.summary()
    parquet_summary = h2o.frame(parquet.frame_id)["frames"][0]["columns"]

    pyunit_utils.compare_frame_summary(csv_summary, parquet_summary)
Code Example #12
def import_folder():
    """
    This test will build a H2O frame from importing the bigdata/laptop/parser/orc/airlines_05p_orc_csv
    from and build another H2O frame from the multi-file orc parser using multiple orc files that are
    saved in the directory bigdata/laptop/parser/orc/airlines_05p_orc.  It will compare the two frames
    to make sure they are equal.
    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    startcsv = time.time()
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_csv"),
                                     na_strings=['\\N'])
    endcsv = time.time()

    csv_type_dict = multi_file_csv.types

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

    col_ind_name = dict()
    # map column index to column name (H2O auto-names columns C1, C2, ...)
    for key_name in list(csv_type_dict):
        col_ind = key_name.split('C')
        new_ind = int(col_ind[1]) - 1
        col_ind_name[new_ind] = key_name

    col_types = []
    for ind in range(len(col_ind_name)):
        col_types.append(csv_type_dict[col_ind_name[ind]])

    startorc1 = time.time()
    multi_file_orc1 = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc"))
    endorc1 = time.time()
    h2o.remove(multi_file_orc1)

    startorc = time.time()
    multi_file_orc = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc"),
                                     col_types=col_types)
    endorc = time.time()

    multi_file_orc.summary()
    orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

    print("************** CSV parse time is {0}".format(endcsv-startcsv))
    print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1))
    print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc))
    # compare frame read by orc by forcing column type,
    pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
Code Example #13
def import_folder():
    """
    This test will build a H2O frame from importing the bigdata/laptop/parser/orc/milsongs_orc_csv
    from and build another H2O frame from the multi-file orc parser using multiple orc files that are
    saved in the directory bigdata/laptop/parser/orc/milsongs_orc.  It will compare the two frames
    to make sure they are equal.
    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/milsongs_orc_csv"))
    multi_file_orc = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/orc/milsongs_orc"))

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

    multi_file_orc.summary()
    orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

    pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
Code Example #14
def parquet_parse_simple():
    """
    Tests Parquet parser by comparing the summary of the original csv frame with the h2o parsed Parquet frame.
    Basic use case of importing files with auto-detection of column types.
    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    csv = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    parquet = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/parser/parquet/airlines-simple.snappy.parquet"))

    csv.summary()
    csv_summary = h2o.frame(csv.frame_id)["frames"][0]["columns"]

    parquet.summary()
    parquet_summary = h2o.frame(parquet.frame_id)["frames"][0]["columns"]

    pyunit_utils.compare_frame_summary(csv_summary, parquet_summary)
Code Example #15
def import_folder():

  tol_time = 200              # tolerance in ms or ns when comparing timestamp columns
  tol_numeric = 1e-5          # tolerance for comparing other numeric fields
  numElements2Compare = 0     # number of elements per column to compare; kept small to save test time

  multi_file_csv = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header"))
  multi_file_gzip_comp = \
    h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

  try:
    # make sure the two frames agree
    assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare, tol_time,
                                       tol_numeric, True), "H2O frames parsed from the csv directory and the " \
                                                           "zip archive are different!"
  except Exception: # in case the files are listed differently, we can always just check that the summaries agree.
    multi_file_gzip_comp.summary()
    zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
    pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
Code Example #16
def import_folder():

  tol_time = 200              # tolerance in ms or ns when comparing timestamp columns
  tol_numeric = 1e-5          # tolerance for comparing other numeric fields
  numElements2Compare = 100   # number of elements per column to compare; kept small to save test time

  # zip archive of the whole directory of csv files.
  multi_file_gzip_comp = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/hexdev_497/milsongs_csv.zip"))

  # directory containing the gzip-compressed versions of the csv files.
  multi_file_csv = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/hexdev_497/milsongs_csv_gzip"))

  try:
    # make sure the two frames agree
    assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare, tol_time,
                                       tol_numeric, True), "H2O frames parsed from the gzip csv directory and " \
                                                           "the zip archive are different!"
  except Exception: # in case the files are listed differently, we can always just check that the summaries agree.
    multi_file_gzip_comp.summary()
    zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]

    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
    pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
Code Example #17
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5

        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_orc_file = "/datasets/airlines_all_orc_parts"
        hdfs_csv_file = "/datasets/air_csv_part"

        col_types = ['real', 'real', 'real', 'real', 'real', 'real', 'real', 'real', 'enum', 'real', 'enum', 'real',
                     'real', 'enum', 'real', 'real', 'enum', 'enum', 'real', 'enum', 'enum', 'real', 'real', 'real',
                     'enum', 'enum', 'enum', 'enum', 'enum', 'enum', 'enum']

        # import CSV file
        print("Import airlines 116M dataset in original csv format from HDFS")
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        startcsv = time.time()
        multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'], col_types=col_types)
        endcsv = time.time()

        startcsv1 = time.time()
        multi_file_csv1 = h2o.import_file(url_csv)
        endcsv1 = time.time()
        h2o.remove(multi_file_csv1)

        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

        # import ORC file with same column types as CSV file
        print("Import airlines 116M dataset in ORC format from HDFS")
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)

        startorc1 = time.time()
        multi_file_orc1 = h2o.import_file(url_orc)
        endorc1 = time.time()
        h2o.remove(multi_file_orc1)

        startorc = time.time()
        multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
        endorc = time.time()

        multi_file_orc.summary()
        orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

        print("************** CSV (without column type forcing) parse time is {0}".format(endcsv1-startcsv1))
        print("************** CSV (with column type forcing) parse time is {0}".format(endcsv-startcsv))
        print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1))
        print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc))

        # compare the CSV frame summary with the ORC frame parsed with forced column types
        pyunit_utils.compare_frame_summary(csv_summary, orc_summary)

    else:
        raise EnvironmentError("Hadoop namenode is not accessible")
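The hard-coded 31-entry col_types list in the example above must match the dataset's column count and order exactly; a cheap guard after the forced-type import would catch drift (a sketch, not in the original test):

# H2OFrame.ncol gives the parsed column count; compare it against the forced type list
assert len(col_types) == multi_file_orc.ncol, \
    "col_types has {0} entries but the frame has {1} columns".format(len(col_types), multi_file_orc.ncol)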
Code Example #18
def hdfs_orc_parser():

    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5

        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
            pass
        else:

            hdfs_orc_file = "/datasets/airlines_all_orc_parts"
            hdfs_csv_file = "/datasets/air_csv_part"

            col_types = ['real', 'real', 'real', 'real', 'real', 'real', 'real', 'real', 'enum', 'real', 'enum', 'real',
                         'real', 'enum', 'real', 'real', 'enum', 'enum', 'real', 'enum', 'enum', 'real', 'real', 'real',
                         'enum', 'enum', 'enum', 'enum', 'enum', 'enum', 'enum']

            # import CSV file
            print("Import airlines 116M dataset in original csv format from HDFS")
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'], col_types=col_types)
            endcsv = time.time()

            startcsv1 = time.time()
            multi_file_csv1 = h2o.import_file(url_csv)
            endcsv1 = time.time()
            h2o.remove(multi_file_csv1)

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            # import ORC file with same column types as CSV file
            print("Import airlines 116M dataset in ORC format from HDFS")
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)

            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)

            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
            endorc = time.time()

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            print("************** CSV (without column type forcing) parse time is {0}".format(endcsv1-startcsv1))
            print("************** CSV (with column type forcing) parse time is {0}".format(endcsv-startcsv))
            print("************** ORC (without column type forcing) parse time is {0}".format(endorc1-startorc1))
            print("************** ORC (with column type forcing) parse time is {0}".format(endorc-startorc))

            # compare the CSV frame summary with the ORC frame parsed with forced column types
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)

    else:
        raise EnvironmentError("Hadoop namenode is not accessible")