def hdfs_orc_parser():
    """Compare the HDFS milsongs dataset parsed from ORC and from CSV.

    Imports the same dataset in both formats from HDFS and asserts that the
    two parsed frames have identical column summaries.

    :return: None if passed.  Raises EnvironmentError when the HDFS
        namenode is not reachable.
    """
    # Check if we are running inside the H2O network by seeing if we can
    # touch the namenode.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError("Hadoop namenode is not accessible")

    hdfs_name_node = pyunit_utils.hadoop_namenode()

    # Old hive-exec versions cannot write ORC files this parser understands;
    # the canary file probes for that and we skip the test if it fails.
    if pyunit_utils.cannaryHDFSTest(hdfs_name_node,
                                    "/datasets/orc_parser/orc/orc_split_elim.orc"):
        print("Your hive-exec version is too old. Orc parser test {0} is "
              "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
        return

    hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
    url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
    hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
    url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

    multi_file_csv = h2o.import_file(url_csv)
    multi_file_orc = h2o.import_file(url_orc)

    # summary() refreshes rollups; the column summaries are then fetched
    # via the frames endpoint for a field-by-field comparison.
    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
    multi_file_orc.summary()
    orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

    pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
def hdfs_orc_parser():
    """Compare the HDFS milsongs dataset parsed from ORC and from CSV.

    :return: None if passed.  Raises EnvironmentError when the HDFS
        namenode is not reachable.
    """
    # Check if we are running inside the H2O network by seeing if we can
    # touch the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        multi_file_csv = h2o.import_file(url_csv)
        multi_file_orc = h2o.import_file(url_orc)

        # summary() refreshes rollups before the column summaries are pulled.
        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
        multi_file_orc.summary()
        orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

        pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError("Hadoop namenode is not accessible")
def hdfs_orc_parser():
    """Time and compare the HDFS air05 dataset parsed from CSV and from ORC.

    Parses the CSV copy first to learn the column types, re-imports the ORC
    copy with those types forced, and compares the column summaries of the
    two frames.  Parse times are printed for both formats.

    :return: None if passed.  Raises EnvironmentError when the HDFS
        namenode is not reachable.
    """
    # Check if we are running inside the H2O network by seeing if we can
    # touch the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        hdfs_orc_file = "/datasets/orc_parser/air05_orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        hdfs_csv_file = "/datasets/orc_parser/air05_csv"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        startcsv = time.time()
        # BUG FIX: the original passed the bare HDFS path (hdfs_csv_file)
        # here while url_csv was built but never used; import through the
        # full hdfs:// URL like the ORC import below does.
        multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
        endcsv = time.time()

        csv_type_dict = multi_file_csv.types
        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

        # Column names are C1..Cn; map each to its zero-based position so
        # the CSV-detected types can be forced onto the ORC parse.
        col_ind_name = dict()
        for key_name in list(csv_type_dict):
            col_ind = key_name.split('C')
            new_ind = int(str(col_ind[1])) - 1
            col_ind_name[new_ind] = key_name

        col_types = []
        for ind in range(len(col_ind_name)):
            col_types.append(csv_type_dict[col_ind_name[ind]])

        # First ORC import with auto-detected types, just for timing.
        startorc1 = time.time()
        multi_file_orc1 = h2o.import_file(url_orc)
        endorc1 = time.time()
        h2o.remove(multi_file_orc1)

        # Second ORC import with the CSV column types forced.
        startorc = time.time()
        multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
        endorc = time.time()

        multi_file_orc.summary()
        orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

        print("************** CSV parse time is {0}".format(endcsv - startcsv))
        print("************** ORC (without column type forcing) parse time is {0}"
              .format(endorc1 - startorc1))
        print("************** ORC (with column type forcing) parse time is {0}"
              .format(endorc - startorc))

        # compare frame read by orc by forcing column type,
        pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError("Hadoop namenode is not accessible")
def import_folder():
    """Verify that a folder of csv files parses the same as its zip archive.

    Imports smalldata airlines_first_header both as a plain folder and as a
    zip, compares the frames element-wise, and falls back to comparing the
    column summaries when the files are listed in a different order.

    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    tol_time = 200       # comparing in ms or ns for timestamp columns
    tol_numeric = 1e-5   # tolerance for comparing other numeric fields
    numElements2Compare = 0  # choose number of elements per column to compare. Save test time.

    multi_file_csv = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/parser/hexdev_497/airlines_first_header"))
    multi_file_gzip_comp = \
        h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

    try:
        # make sure the two agrees
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp,
                                           numElements2Compare, tol_time,
                                           tol_numeric, True), \
            "H2O frame parsed from multiple orc and single orc " \
            "files are different!"
    except Exception:
        # A bare "except:" would also swallow KeyboardInterrupt/SystemExit;
        # only AssertionError (and other ordinary errors) should trigger
        # the summary-based fallback comparison.
        # in case the files are listed differently, we can always just
        # check to see if the summary agrees.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]
        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
def hdfs_orc_parser():
    """Compare the HDFS milsongs dataset parsed from ORC and from CSV.

    Skips when the cluster's hive-exec is too old (probed via a canary ORC
    file); otherwise imports both formats and compares column summaries.

    :return: None if passed.  Raises EnvironmentError when the HDFS
        namenode is not reachable.
    """
    # Check if we are running inside the H2O network by seeing if we can
    # touch the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node,
                                        "/datasets/orc_parser/orc/orc_split_elim.orc"):
            # Canary parse failed: hive-exec too old for this ORC layout.
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
        else:
            hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            multi_file_csv = h2o.import_file(url_csv)
            multi_file_orc = h2o.import_file(url_orc)

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError("Hadoop namenode is not accessible")
def import_folder():
    """Verify gzip-compressed csv folder parses the same as its zip archive.

    Imports milsongs both as one zip of the whole directory and as a folder
    of gzipped csv files, compares the frames element-wise, and falls back
    to comparing column summaries if the files were concatenated in a
    different order.

    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    tol_time = 200       # comparing in ms or ns
    tol_numeric = 1e-5   # tolerance for comparing other numeric fields
    numElements2Compare = 100  # choose number of elements per column to compare. Save test time.

    # compressed the whole directory of files.
    multi_file_gzip_comp = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/hexdev_497/milsongs_csv.zip"))
    # directory containing the gzip version of csv files here.
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/hexdev_497/milsongs_csv_gzip"))

    try:
        # make sure the two agrees
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp,
                                           numElements2Compare, tol_time,
                                           tol_numeric, True), \
            "H2O frame parsed from multiple orc and single orc " \
            "files are different!"
    except Exception:
        # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
        # still propagate; the fallback only handles comparison failures.
        # in case the files are listed differently, we can always just
        # check to see if the summary agrees.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]
        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
def hdfs_orc_parser():
    """Time and compare the HDFS air05 dataset parsed from CSV and from ORC.

    Skips on old hive-exec (canary probe).  Otherwise parses the CSV copy
    to learn column types, imports the ORC copy with and without those
    types forced, prints the parse times, and compares column summaries.

    :return: None if passed.  Raises EnvironmentError when the HDFS
        namenode is not reachable.
    """
    # Check if we are running inside the H2O network by seeing if we can
    # touch the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node,
                                        "/datasets/orc_parser/orc/orc_split_elim.orc"):
            # Canary parse failed: hive-exec too old for this ORC layout.
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_airline_05_orc.py"))
        else:
            hdfs_orc_file = "/datasets/orc_parser/air05_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/air05_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
            endcsv = time.time()

            csv_type_dict = multi_file_csv.types
            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            col_ind_name = dict()
            # change column types from real to enum according to
            # multi_file_csv column types; names are C1..Cn.
            for key_name in list(csv_type_dict):
                col_ind = key_name.split('C')
                new_ind = int(str(col_ind[1])) - 1
                col_ind_name[new_ind] = key_name

            col_types = []
            for ind in range(len(col_ind_name)):
                col_types.append(csv_type_dict[col_ind_name[ind]])

            # First ORC import with auto-detected types, timing only.
            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)

            # Second ORC import with the CSV column types forced.
            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
            endorc = time.time()

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            print("************** CSV parse time is {0}".format(endcsv - startcsv))
            print("************** ORC (without column type forcing) parse time is {0}".format(endorc1 - startorc1))
            print("************** ORC (with column type forcing) parse time is {0}".format(endorc - startorc))

            # compare frame read by orc by forcing column type,
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError("Hadoop namenode is not accessible")
def import_folder():
    """Parse a folder of csv files and the zip archive of that same folder,
    then verify both imports produce frames with identical column summaries.
    """
    csv_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header"))
    zip_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

    def column_summary(frame):
        # Refresh rollup statistics, then fetch the per-column summaries.
        frame.summary()
        return h2o.frame(frame.frame_id)["frames"][0]["columns"]

    pyunit_utils.compare_frame_summary(column_summary(zip_frame),
                                       column_summary(csv_frame))
def import_folder():
    """
    This test will build a H2O frame from importing the
    bigdata/laptop/parser/orc/airlines_05p_orc_csv from and build another
    H2O frame from the multi-file orc parser using multiple orc files that
    are saved in the directory bigdata/laptop/parser/orc/airlines_05p_orc.
    It will compare the two frames to make sure they are equal.

    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    csv_path = pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_csv")
    orc_path = pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc")

    csv_start = time.time()
    multi_file_csv = h2o.import_file(path=csv_path, na_strings=['\\N'])
    csv_elapsed = time.time() - csv_start

    csv_type_dict = multi_file_csv.types
    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

    # Column names are C1..Cn; place each CSV-detected type at its
    # zero-based column position so they can be forced onto the ORC parse.
    col_types = [None] * len(csv_type_dict)
    for name in csv_type_dict:
        position = int(name.split('C')[1]) - 1
        col_types[position] = csv_type_dict[name]

    # ORC import with auto-detected types — timed, then discarded.
    auto_start = time.time()
    orc_auto = h2o.import_file(path=orc_path)
    auto_elapsed = time.time() - auto_start
    h2o.remove(orc_auto)

    # ORC import again, this time forcing the CSV column types.
    forced_start = time.time()
    multi_file_orc = h2o.import_file(path=orc_path, col_types=col_types)
    forced_elapsed = time.time() - forced_start

    multi_file_orc.summary()
    orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

    print("************** CSV parse time is {0}".format(csv_elapsed))
    print("************** ORC (without column type forcing) parse time is {0}".format(auto_elapsed))
    print("************** ORC (with column type forcing) parse time is {0}".format(forced_elapsed))

    # compare frame read by orc by forcing column type,
    pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
def import_folder():
    """Import the same airlines data as a csv folder and as a zip archive
    and check that the two parses agree on every column summary."""
    folder_path = pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header")
    archive_path = pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip")

    frame_from_folder = h2o.import_file(path=folder_path)
    frame_from_zip = h2o.import_file(path=archive_path)

    # Rollups must be refreshed before the summaries are pulled.
    frame_from_zip.summary()
    zip_summary = h2o.frame(frame_from_zip.frame_id)["frames"][0]["columns"]
    frame_from_folder.summary()
    csv_summary = h2o.frame(frame_from_folder.frame_id)["frames"][0]["columns"]

    pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
def parquet_parse_simple():
    """
    Tests Parquet parser by comparing the summary of the original csv frame
    with the h2o parsed Parquet frame.  Basic use case of importing files
    with auto-detection of column types.

    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    csv = h2o.import_file(
        path=pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip"))
    parquet = h2o.import_file(
        path=pyunit_utils.locate("smalldata/parser/parquet/airlines-simple.snappy.parquet"))

    def summarize(frame):
        # Refresh rollups, then pull per-column summaries from the REST API.
        frame.summary()
        return h2o.frame(frame.frame_id)["frames"][0]["columns"]

    pyunit_utils.compare_frame_summary(summarize(csv), summarize(parquet))
def import_folder():
    """
    This test will build a H2O frame from importing the
    bigdata/laptop/parser/orc/airlines_05p_orc_csv from and build another
    H2O frame from the multi-file orc parser using multiple orc files that
    are saved in the directory bigdata/laptop/parser/orc/airlines_05p_orc.
    It will compare the two frames to make sure they are equal.

    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    startcsv = time.time()
    multi_file_csv = h2o.import_file(
        path=pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_csv"),
        na_strings=['\\N'])
    endcsv = time.time()

    csv_type_dict = multi_file_csv.types
    multi_file_csv.summary()
    csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

    # Columns are named C1..Cn; build the type list in column order so the
    # CSV-detected types can be forced onto the ORC import below.
    position_of = {int(name.split('C')[1]) - 1: name for name in csv_type_dict}
    col_types = [csv_type_dict[position_of[i]] for i in range(len(position_of))]

    orc_dir = pyunit_utils.locate("bigdata/laptop/parser/orc/pubdev_3200/air05_orc")

    # Timed ORC import with auto-detected column types; frame is discarded.
    startorc1 = time.time()
    multi_file_orc1 = h2o.import_file(path=orc_dir)
    endorc1 = time.time()
    h2o.remove(multi_file_orc1)

    # Timed ORC import with the CSV column types forced.
    startorc = time.time()
    multi_file_orc = h2o.import_file(path=orc_dir, col_types=col_types)
    endorc = time.time()

    multi_file_orc.summary()
    orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

    print("************** CSV parse time is {0}".format(endcsv - startcsv))
    print("************** ORC (without column type forcing) parse time is {0}".format(endorc1 - startorc1))
    print("************** ORC (with column type forcing) parse time is {0}".format(endorc - startorc))

    # compare frame read by orc by forcing column type,
    pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
def import_folder():
    """
    This test will build a H2O frame from importing the
    bigdata/laptop/parser/orc/milsongs_orc_csv from and build another H2O
    frame from the multi-file orc parser using multiple orc files that are
    saved in the directory bigdata/laptop/parser/orc/milsongs_orc.  It will
    compare the two frames to make sure they are equal.

    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    frame_from_csv = h2o.import_file(
        path=pyunit_utils.locate("bigdata/laptop/parser/orc/milsongs_orc_csv"))
    frame_from_orc = h2o.import_file(
        path=pyunit_utils.locate("bigdata/laptop/parser/orc/milsongs_orc"))

    # summary() refreshes rollups; the frames endpoint then supplies the
    # per-column summaries used for comparison.
    frame_from_csv.summary()
    csv_summary = h2o.frame(frame_from_csv.frame_id)["frames"][0]["columns"]
    frame_from_orc.summary()
    orc_summary = h2o.frame(frame_from_orc.frame_id)["frames"][0]["columns"]

    pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
def parquet_parse_simple():
    """
    Tests Parquet parser by comparing the summary of the original csv frame
    with the h2o parsed Parquet frame.  Basic use case of importing files
    with auto-detection of column types.

    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    csv_location = pyunit_utils.locate("smalldata/airlines/AirlinesTrain.csv.zip")
    parquet_location = pyunit_utils.locate(
        "smalldata/parser/parquet/airlines-simple.snappy.parquet")

    csv = h2o.import_file(path=csv_location)
    parquet = h2o.import_file(path=parquet_location)

    # Refresh rollups before fetching the per-column summaries.
    csv.summary()
    csv_summary = h2o.frame(csv.frame_id)["frames"][0]["columns"]
    parquet.summary()
    parquet_summary = h2o.frame(parquet.frame_id)["frames"][0]["columns"]

    pyunit_utils.compare_frame_summary(csv_summary, parquet_summary)
def import_folder():
    """Verify that a folder of csv files parses the same as its zip archive.

    Compares the two frames element-wise first; when the files are listed
    in a different order, falls back to comparing the column summaries.

    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    tol_time = 200       # comparing in ms or ns for timestamp columns
    tol_numeric = 1e-5   # tolerance for comparing other numeric fields
    numElements2Compare = 0  # choose number of elements per column to compare. Save test time.

    multi_file_csv = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/parser/hexdev_497/airlines_first_header"))
    multi_file_gzip_comp = \
        h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

    try:
        # make sure the two agrees
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp,
                                           numElements2Compare, tol_time,
                                           tol_numeric, True), \
            "H2O frame parsed from multiple orc and single orc " \
            "files are different!"
    except Exception:
        # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
        # still propagate; only ordinary failures trigger the fallback.
        # in case the files are listed differently, we can always just
        # check to see if the summary agrees.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]
        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
def import_folder():
    """Verify gzip-compressed csv folder parses the same as its zip archive.

    Compares the two milsongs frames element-wise first; when the files are
    concatenated in a different order, falls back to comparing the column
    summaries.

    :return: None if passed.  Otherwise, an exception will be thrown.
    """
    tol_time = 200       # comparing in ms or ns
    tol_numeric = 1e-5   # tolerance for comparing other numeric fields
    numElements2Compare = 100  # choose number of elements per column to compare. Save test time.

    # compressed the whole directory of files.
    multi_file_gzip_comp = h2o.import_file(
        path=pyunit_utils.locate("bigdata/laptop/parser/hexdev_497/milsongs_csv.zip"))
    # directory containing the gzip version of csv files here.
    multi_file_csv = h2o.import_file(
        path=pyunit_utils.locate("bigdata/laptop/parser/hexdev_497/milsongs_csv_gzip"))

    try:
        # make sure the two agrees
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp,
                                           numElements2Compare, tol_time,
                                           tol_numeric, True), \
            "H2O frame parsed from multiple orc and single orc " \
            "files are different!"
    except Exception:
        # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
        # still propagate; only ordinary failures trigger the fallback.
        # in case the files are listed differently, we can always just
        # check to see if the summary agrees.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]
        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
def hdfs_orc_parser():
    """Time and compare the 116M-row airlines dataset parsed from HDFS in
    CSV and ORC form, with and without forced column types.

    :return: None if passed.  Raises EnvironmentError when the HDFS
        namenode is not reachable.
    """
    # Check if we are running inside the H2O network by seeing if we can
    # touch the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        # NOTE: the original declared numElements2Compare/tol_time/tol_numeric
        # here but never used them; they have been removed.
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_orc_file = "/datasets/airlines_all_orc_parts"
        hdfs_csv_file = "/datasets/air_csv_part"

        # Column types forced onto both imports (31 columns).
        col_types = ['real', 'real', 'real', 'real', 'real', 'real', 'real',
                     'real', 'enum', 'real', 'enum', 'real', 'real', 'enum',
                     'real', 'real', 'enum', 'enum', 'real', 'enum', 'enum',
                     'real', 'real', 'real', 'enum', 'enum', 'enum', 'enum',
                     'enum', 'enum', 'enum']

        # import CSV file
        print("Import airlines 116M dataset in original csv format from HDFS")
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        startcsv = time.time()
        multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'],
                                         col_types=col_types)
        endcsv = time.time()

        # Second CSV import with auto-detected types, timing only.
        startcsv1 = time.time()
        multi_file_csv1 = h2o.import_file(url_csv)
        endcsv1 = time.time()
        h2o.remove(multi_file_csv1)

        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

        # import ORC file with same column types as CSV file
        print("Import airlines 116M dataset in ORC format from HDFS")
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)

        startorc1 = time.time()
        multi_file_orc1 = h2o.import_file(url_orc)
        endorc1 = time.time()
        h2o.remove(multi_file_orc1)

        startorc = time.time()
        multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
        endorc = time.time()

        multi_file_orc.summary()
        orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

        print("************** CSV (without column type forcing) parse time is {0}".format(endcsv1 - startcsv1))
        print("************** CSV (with column type forcing) parse time is {0}".format(endcsv - startcsv))
        print("************** ORC (without column type forcing) parse time is {0}".format(endorc1 - startorc1))
        print("************** ORC (with column type forcing) parse time is {0}".format(endorc - startorc))

        # compare frame read by orc by forcing column type,
        pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError("Hadoop namenode is not accessible")
def hdfs_orc_parser():
    """Time and compare the 116M-row airlines dataset parsed from HDFS in
    CSV and ORC form, with and without forced column types.

    Skips when the cluster's hive-exec is too old (canary ORC probe).

    :return: None if passed.  Raises EnvironmentError when the HDFS
        namenode is not reachable.
    """
    # Check if we are running inside the H2O network by seeing if we can
    # touch the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        # NOTE: the original declared numElements2Compare/tol_time/tol_numeric
        # here but never used them; they have been removed.  The original
        # also had the first print's string literal broken across a line
        # break — repaired below.
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node,
                                        "/datasets/orc_parser/orc/orc_split_elim.orc"):
            # Canary parse failed: hive-exec too old for this ORC layout.
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
        else:
            hdfs_orc_file = "/datasets/airlines_all_orc_parts"
            hdfs_csv_file = "/datasets/air_csv_part"

            # Column types forced onto both imports (31 columns).
            col_types = ['real', 'real', 'real', 'real', 'real', 'real',
                         'real', 'real', 'enum', 'real', 'enum', 'real',
                         'real', 'enum', 'real', 'real', 'enum', 'enum',
                         'real', 'enum', 'enum', 'real', 'real', 'real',
                         'enum', 'enum', 'enum', 'enum', 'enum', 'enum',
                         'enum']

            # import CSV file
            print("Import airlines 116M dataset in original csv format from HDFS")
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'],
                                             col_types=col_types)
            endcsv = time.time()

            # Second CSV import with auto-detected types, timing only.
            startcsv1 = time.time()
            multi_file_csv1 = h2o.import_file(url_csv)
            endcsv1 = time.time()
            h2o.remove(multi_file_csv1)

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            # import ORC file with same column types as CSV file
            print("Import airlines 116M dataset in ORC format from HDFS")
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)

            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)

            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
            endorc = time.time()

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            print("************** CSV (without column type forcing) parse time is {0}".format(endcsv1 - startcsv1))
            print("************** CSV (with column type forcing) parse time is {0}".format(endcsv - startcsv))
            print("************** ORC (without column type forcing) parse time is {0}".format(endorc1 - startorc1))
            print("************** ORC (with column type forcing) parse time is {0}".format(endorc - startorc))

            # compare frame read by orc by forcing column type,
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError("Hadoop namenode is not accessible")