def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_iris_import_types_orc.py"))
            pass
        else:
            numElements2Compare = 100
            tol_time = 200
            tol_numeric = 1e-5
            hdfs_orc_file = "/datasets/orc_parser/orc/iris.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/csv/iris.csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
            h2oframe_csv = h2o.import_file(url_csv)
            data_types = ['real', 'real', 'real', 'real', 'enum']
            h2oframe_orc = h2o.import_file(url_orc, col_types=data_types)
            # compare the two frames
            assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare,
                                               tol_time, tol_numeric, True), \
                "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        tol_time = 200           # comparing in ms or ns
        tol_numeric = 1e-5       # tolerance for comparing other numeric fields
        numElements2Compare = 0  # choose number of elements per column to compare.  Save test time.
        hdfs_csv_file = "/datasets/orc_parser/synthetic_perfect_separation_csv"
        hdfs_orc_file = "/datasets/orc_parser/synthetic_perfect_separation_orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
        multi_file_csv = h2o.import_file(url_csv)
        multi_file_orc = h2o.import_file(url_orc)
        # make sure the multi-file ORC folder and the multi-file CSV folder parse into the same H2O frame
        assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv, numElements2Compare,
                                           tol_time, tol_numeric, True), \
            "H2O frames parsed from the ORC and CSV folders are different!"
    else:
        raise EnvironmentError

def xgboost_estimation():
    if "XGBoost" not in h2o.cluster().list_all_extensions():
        print("XGBoost extension is not present. Skipping test. . .")
        return
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if not hadoop_namenode_is_accessible:
        raise EnvironmentError("Hadoop namenode is not accessible")
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    full_data = createData(500000, 500)
    myX = list(full_data.col_names)
    myX.remove("IsDepDelayed")
    xgb = H2OXGBoostEstimator(seed=42, tree_method="approx")
    xgb.train(y="IsDepDelayed", x=myX[0:480], training_frame=full_data, model_id="xgboost")
    print(xgb)
    pred = xgb.predict(full_data)
    perf = xgb.model_performance(full_data)
    return perf

def createData(nrows, ncols):
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_airlines_file = "/datasets/airlines_all.05p.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_airlines_file)
    airlines = h2o.import_file(url)
    myX = ["Year", "Month", "DayofMonth", "DayOfWeek", "Distance"]
    myY = "IsDepDelayed"
    allCols = list(myX)
    allCols.append(myY)
    airlines = airlines[allCols]
    num_new_features = ncols - airlines.ncol
    sample_data = h2o.create_frame(rows=nrows, cols=num_new_features, categorical_fraction=0,
                                   seed=1234, seed_for_column_types=1234)
    new_rows = nrows - airlines.nrow
    if nrows > 0:
        extra_rows = airlines[0:nrows, :]
        airlines = airlines.rbind(extra_rows)
    airlines = airlines[0:nrows, :]
    full_data = airlines.cbind(sample_data)
    return full_data

def test_frame_reload(self):
    name_node = pyunit_utils.hadoop_namenode()
    work_dir = "hdfs://%s%s" % (name_node, utils.get_workdir())
    dataset = "/datasets/mnist/train.csv.gz"
    try:
        cluster_1 = utils.start_cluster("saver")
        h2o.connect(url=cluster_1)
        df_orig = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
        df_key = df_orig.key
        df_pd_orig = df_orig.as_data_frame()
        df_orig.save(work_dir)
        h2o.connection().close()
    finally:
        utils.stop_cluster("saver")
    try:
        cluster_2 = utils.start_cluster("loader")
        h2o.connect(url=cluster_2)
        df_loaded = h2o.load_frame(df_key, work_dir)
        df_pd_loaded = df_loaded.as_data_frame()
        h2o.connection().close()
    finally:
        utils.stop_cluster("loader")
    self.assertTrue(df_pd_orig.equals(df_pd_loaded))

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        # run a quick test to determine if the hive-exec is too old.
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_prostate_orc.py"))
            pass
        else:
            tol_time = 200            # comparing in ms or ns
            tol_numeric = 1e-5        # tolerance for comparing other numeric fields
            numElements2Compare = 10  # choose number of elements per column to compare.  Save test time.
            hdfs_orc_file = "/datasets/orc_parser/orc/prostate_NA.orc"
            hdfs_csv_file = "/datasets/orc_parser/csv/prostate_NA.csv"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
            h2oOrc = h2o.import_file(url_orc)
            h2oCsv = h2o.import_file(url_csv)
            # compare the two frames
            assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        tol_time = 200             # comparing in ms or ns
        tol_numeric = 1e-5         # tolerance for comparing other numeric fields
        numElements2Compare = 100  # choose number of elements per column to compare.  Save test time.
        allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                       "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                       "/datasets/orc_parser/orc/orc_split_elim.orc"]
        allCsvFiles = ["/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                       "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                       "/datasets/orc_parser/csv/orc_split_elim.csv"]
        for fIndex in range(len(allOrcFiles)):
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex])
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, allCsvFiles[fIndex])
            h2oOrc = h2o.import_file(url_orc)
            h2oCsv = h2o.import_file(url_csv)
            # compare the two frames
            assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
            multi_file_csv = h2o.import_file(url_csv)
            multi_file_orc = h2o.import_file(url_orc)
            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        numElements2Compare = 100
        tol_time = 200
        tol_numeric = 1e-5
        hdfs_orc_file = "/datasets/orc_parser/orc/iris.orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        hdfs_csv_file = "/datasets/orc_parser/csv/iris.csv"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
        h2oframe_csv = h2o.import_file(url_csv)
        data_types = ['real', 'real', 'real', 'real', 'enum']
        h2oframe_orc = h2o.import_file(url_orc, col_types=data_types)
        # compare the two frames
        assert pyunit_utils.compare_frames(h2oframe_orc, h2oframe_csv, numElements2Compare,
                                           tol_time, tol_numeric, True), \
            "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError

def hive_import():
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hive_host = os.getenv("HIVE_HOST")
    connection_url = "jdbc:hive2://{0}:10000/default".format(hive_host)
    krb_enabled = os.getenv('KRB_ENABLED', 'false').lower() == 'true'
    if krb_enabled:
        connection_url += ";auth=delegationToken"
    # read original
    file_url = "hdfs://{0}{1}".format(hdfs_name_node, "/user/jenkins/smalldata/chicago/chicagoCensus.csv")
    dataset_original = h2o.import_file(file_url)
    # read TABLE from Hive JDBC
    table_jdbc = h2o.import_sql_table(connection_url, "chicago", "", "", fetch_mode="SINGLE")
    table_jdbc = adapt_frame(table_jdbc, column_prefix="chicago.")
    pyunit_utils.compare_frames_local(dataset_original, table_jdbc, prob=1)
    # read TABLE from Hive FS
    table_direct = h2o.import_hive_table(connection_url, "chicago")
    table_direct = adapt_frame(table_direct)
    pyunit_utils.compare_frames_local(dataset_original, table_direct, prob=1)

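# Note: adapt_frame() is called above but is not part of this excerpt. A minimal sketch of
# what such a helper could look like, assuming its only job is to strip the Hive
# "table." prefix from the imported column names so the JDBC/Hive frames line up with the
# original CSV frame; the signature and behavior here are assumptions, not the real helper.
def adapt_frame(frame, column_prefix=""):
    # Hypothetical helper: drop the table-name prefix from column names, leave data untouched.
    if column_prefix:
        new_names = [name[len(column_prefix):] if name.startswith(column_prefix) else name
                     for name in frame.names]
        frame.set_names(new_names)
    return frame
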
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
        multi_file_csv = h2o.import_file(url_csv)
        multi_file_orc = h2o.import_file(url_orc)
        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
        multi_file_orc.summary()
        orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError

def xgb_repro():
    name_node = pyunit_utils.hadoop_namenode()
    data = h2o.import_file("hdfs://" + name_node + "/user/jenkins/bigdata/laptop/airlinesBillion_7Columns_5GB.csv",
                           na_strings=["NA"])
    train, test = data.split_frame(ratios=[0.99], seed=1)
    x = data.names
    y = "C31"
    x.remove(y)
    model = H2OXGBoostEstimator(ntrees=5, max_depth=6, learn_rate=0.1, seed=12345, backend="CPU")
    model.train(x=x, y=y, training_frame=train)
    p1 = model.predict(test)
    model.train(x=x, y=y, training_frame=train)
    p2 = model.predict(test)
    p = p1.cbind(p2)
    diff = (p[1] != p[4]).as_data_frame()
    ndiffs = 0
    for i in range(len(diff)):
        if diff.iat[i, 0] != 0:
            ndiffs += 1
    assert ndiffs == 0, "diffs %d out of %d rows" % (ndiffs, p1.nrow)

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        assert pyunit_utils.expect_warnings(url_orc, "UserWarning:", "Skipping field:", 1), \
            "Expect warnings from orc parser for file " + url_orc + "!"
        hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        assert pyunit_utils.expect_warnings(url_orc, "UserWarning:", "Skipping field:", 1), \
            "Expect warnings from orc parser for file " + url_orc + "!"
        hdfs_orc_file = "/datasets/orc_parser/orc/nulls-at-end-snappy.orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        assert pyunit_utils.expect_warnings(url_orc, "UserWarning:", "Skipping field:", 1), \
            "Expect warnings from orc parser for file " + url_orc + "!"
    else:
        raise EnvironmentError

def hdfs_kmeans():
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
    hdfs_covtype_file = "/datasets/runit/covtype.data"
    print("Import iris_wheader.csv from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
    iris_h2o = h2o.import_file(url)
    n = iris_h2o.nrow
    print("rows: {0}".format(n))
    assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)
    print("Running KMeans on iris")
    iris_km = H2OKMeansEstimator(k=3, training_frame=iris_h2o[0:4], max_iterations=10)
    iris_km.train()
    print(iris_km)
    print("Importing covtype.data from HDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_covtype_file)
    covtype_h2o = h2o.import_file(url)
    n = covtype_h2o.nrow
    print("rows: {0}".format(n))
    assert n == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 581012)
    print("Running KMeans on covtype")
    covtype_km = H2OKMeansEstimator(training_frame=covtype_h2o[0:55], k=8, max_iterations=10)
    covtype_km.train()
    print(covtype_km)

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_milsongs_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/milsongs_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/milsongs_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
            multi_file_csv = h2o.import_file(url_csv)
            multi_file_orc = h2o.import_file(url_orc)
            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_orc_file = "/datasets/orc_parser/air05_orc"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        hdfs_csv_file = "/datasets/orc_parser/air05_csv"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
        startcsv = time.time()
        multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
        endcsv = time.time()
        csv_type_dict = multi_file_csv.types
        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
        col_ind_name = dict()
        # change column types from real to enum according to multi_file_csv column types
        for key_name in list(csv_type_dict):
            col_ind = key_name.split('C')
            new_ind = int(str(col_ind[1])) - 1
            col_ind_name[new_ind] = key_name
        col_types = []
        for ind in range(len(col_ind_name)):
            col_types.append(csv_type_dict[col_ind_name[ind]])
        startorc1 = time.time()
        multi_file_orc1 = h2o.import_file(url_orc)
        endorc1 = time.time()
        h2o.remove(multi_file_orc1)
        startorc = time.time()
        multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
        endorc = time.time()
        multi_file_orc.summary()
        orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]
        print("************** CSV parse time is {0}".format(endcsv - startcsv))
        print("************** ORC (without column type forcing) parse time is {0}".format(endorc1 - startorc1))
        print("************** ORC (with column type forcing) parse time is {0}".format(endorc - startorc))
        # compare the frame parsed from ORC with forced column types against the CSV frame
        pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError

def hdfs_import_bigCat():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        numTimes = 10
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        allFiles = ["/datasets/bigCatFiles/tenThousandCat10C.csv",
                    "/datasets/bigCatFiles/hundredThousandCat10C.csv",
                    "/datasets/bigCatFiles/oneMillionCat10C.csv",
                    "/datasets/bigCatFiles/tenThousandCat50C.csv",
                    "/datasets/bigCatFiles/hundredThousandCat50C.csv",
                    "/datasets/bigCatFiles/tenThousandCat100C.csv",
                    "/datasets/bigCatFiles/hundredThousandCat100C.csv",
                    "/datasets/bigCatFiles/oneMillionCat50C.csv"]
        reps = [10, 10, 10, 50, 50, 100, 100, 50]
        for ind in range(0, len(allFiles)):  # run tests for 3 different sizes per Tomas request
            print("******* Parsing file {0} ********".format(allFiles[ind]))
            runPerformance("hdfs://{0}{1}".format(hdfs_name_node, allFiles[ind]), numTimes, reps[ind])
    else:
        raise EnvironmentError

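# Note: runPerformance() is referenced above but not included in this excerpt. A minimal
# sketch of what such a timing helper could look like, assuming it simply re-parses the file
# numTimes and prints an average wall-clock parse time; the signature, the meaning of the
# third argument, and the reliance on `import time` are assumptions, not the real helper.
def runPerformance(file_url, numTimes, reps):
    # Hypothetical helper: parse the same file repeatedly and report the mean parse time.
    parse_times = []
    for _ in range(numTimes):
        start = time.time()
        frame = h2o.import_file(file_url)
        parse_times.append(time.time() - start)
        h2o.remove(frame)  # free the frame before the next repetition
    print("File {0} (reps marker {1}): mean parse time over {2} runs = {3:.3f}s".format(
        file_url, reps, numTimes, sum(parse_times) / len(parse_times)))
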
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        tol_time = 200            # comparing in ms or ns
        tol_numeric = 1e-5        # tolerance for comparing other numeric fields
        numElements2Compare = 10  # choose number of elements per column to compare.  Save test time.
        hdfs_orc_file = "/datasets/orc_parser/orc/prostate_NA.orc"
        hdfs_csv_file = "/datasets/orc_parser/csv/prostate_NA.csv"
        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
        h2oOrc = h2o.import_file(url_orc)
        h2oCsv = h2o.import_file(url_csv)
        # compare the two frames
        assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
            "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError

def hdfs_kmeans_airlines():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_file = "/datasets/airlines_all.csv"
        print("Import airlines_all.csv from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file)
        airlines_h2o = h2o.import_file(url)
        n = airlines_h2o.nrow
        print("rows: {0}".format(n))
        print("Run k-means++ with k = 7 and max_iterations = 10")
        myX = list(range(8)) + list(range(11, 16)) + list(range(18, 21)) + list(range(24, 29)) + [9]
        airlines_km = h2o.kmeans(training_frame=airlines_h2o, x=airlines_h2o[myX], k=7,
                                 init="Furthest", max_iterations=10, standardize=True)
        print(airlines_km)
    else:
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")

def xgboost_estimation():
    if "XGBoost" not in h2o.cluster().list_all_extensions():
        print("XGBoost extension is not present. Skipping test. . .")
        return
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if not hadoop_namenode_is_accessible:
        raise EnvironmentError("Hadoop namenode is not accessible")
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    full_data = createData(500000, 500)
    myX = list(full_data.col_names)
    myX.remove("IsDepDelayed")
    xgb = H2OXGBoostEstimator(seed=42, tree_method="approx")
    xgb.train(y="IsDepDelayed", x=myX[0:480], training_frame=full_data, model_id="xgboost")
    print(xgb)
    pred = xgb.predict(full_data)
    perf = xgb.model_performance(full_data)
    return perf

def test_frame_reload(self):
    name_node = pyunit_utils.hadoop_namenode()
    work_dir = "hdfs://%s%s" % (name_node, utils.get_workdir())
    dataset = "/datasets/iris_wheader.csv"
    ntrees_opts = [100, 120, 130, 140]
    learn_rate_opts = [0.01, 0.02, 0.03, 0.04]
    grid_size = len(ntrees_opts) * len(learn_rate_opts)
    print("max models %s" % grid_size)
    grid_id = "grid_ft_resume"
    hyper_parameters = {
        "learn_rate": learn_rate_opts,
        "ntrees": ntrees_opts
    }
    cluster_1_name = "grid1-py"
    try:
        cluster_1 = utils.start_cluster(cluster_1_name)
        h2o.connect(url=cluster_1)
        train = h2o.import_file(path="hdfs://%s%s" % (name_node, dataset))
        grid = H2OGridSearch(H2OGradientBoostingEstimator, grid_id=grid_id,
                             hyper_params=hyper_parameters, recovery_dir=work_dir)
        print("starting initial grid and sleeping...")
        grid.start(x=list(range(4)), y=4, training_frame=train)
        grid_in_progress = None
        times_waited = 0
        while (times_waited < 20) and (grid_in_progress is None or len(grid_in_progress.model_ids) == 0):
            time.sleep(5)  # give it time to train some models
            times_waited += 1
            try:
                grid_in_progress = h2o.get_grid(grid_id)
            except IndexError:
                print("no models trained yet")
        print("done sleeping")
        h2o.connection().close()
    finally:
        utils.stop_cluster(cluster_1_name)
    cluster_2_name = "grid2-py"
    try:
        cluster_2 = utils.start_cluster(cluster_2_name)
        h2o.connect(url=cluster_2)
        loaded = h2o.load_grid("%s/%s" % (work_dir, grid_id), load_params_references=True)
        print("models after first run:")
        for x in sorted(loaded.model_ids):
            print(x)
        loaded.resume()
        print("models after second run:")
        for x in sorted(loaded.model_ids):
            print(x)
        print("Newly trained grid has %d models" % len(loaded.model_ids))
        self.assertEqual(len(loaded.model_ids), grid_size, "The full grid was not trained.")
        h2o.connection().close()
    finally:
        utils.stop_cluster(cluster_2_name)

def createData(nrows, ncols):
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_airlines_file = "/datasets/airlines_all.05p.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_airlines_file)
    airlines = h2o.import_file(url)
    myX = ["Year", "Month", "DayofMonth", "DayOfWeek", "Distance"]
    myY = "IsDepDelayed"
    allCols = list(myX)
    allCols.append(myY)
    airlines = airlines[allCols]
    num_new_features = ncols - airlines.ncol
    sample_data = h2o.create_frame(rows=nrows, cols=num_new_features, categorical_fraction=0,
                                   seed=1234, seed_for_column_types=1234)
    new_rows = nrows - airlines.nrow
    if nrows > 0:
        extra_rows = airlines[0:nrows, :]
        airlines = airlines.rbind(extra_rows)
    airlines = airlines[0:nrows, :]
    full_data = airlines.cbind(sample_data)
    return full_data

def directory_import():
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        url1 = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/iris/identical_iris_files/iris1.csv")
        url2 = "hdfs://{0}{1}".format(hdfs_name_node, "/datasets/iris/identical_iris_files/")
        print("Importing HDFS file {0} and directory {1}".format(url1, url2))
        frm_one = h2o.import_file(url1)
        frm_all = h2o.import_file(url2)
        r1, c1 = frm_one.dim
        ra, ca = frm_all.dim
        assert r1 * 3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1, ra)
        assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1, ca)
    else:
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")
    small1 = pyunit_utils.locate("smalldata/jira/identical_files/iris1.csv")
    small2 = small1.split("iris1.csv")[0]
    print("Importing smalldata file {0} and directory {1}".format(small1, small2))
    frm_one = h2o.import_file(small1)
    frm_all = h2o.import_file(small2)
    r1, c1 = frm_one.dim
    ra, ca = frm_all.dim
    assert r1 * 3 == ra, "Expected 3 times the rows, but got {0} and {1}".format(r1, ra)
    assert c1 == ca, "Expected same number of cols, but got {0} and {1}".format(c1, ca)

def hdfs_basic():
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
    hdfs_iris_dir = "/datasets/runit/iris_test_train"
    print("Testing single file importHDFS")
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
    iris_h2o = h2o.import_file(url)
    n = iris_h2o.nrow
    print("rows: {0}".format(n))
    assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)
    assert isinstance(iris_h2o, h2o.H2OFrame), "Wrong type. Expected H2OFrame, but got {0}".format(type(iris_h2o))
    print("Import worked")
    print("Testing directory importHDFS")
    urls = ["hdfs://{0}{1}/iris_test.csv".format(hdfs_name_node, hdfs_iris_dir),
            "hdfs://{0}{1}/iris_train.csv".format(hdfs_name_node, hdfs_iris_dir)]
    iris_dir_h2o = h2o.import_file(urls)
    iris_dir_h2o.head()
    iris_dir_h2o.tail()
    n = iris_dir_h2o.nrow
    print("rows: {0}".format(n))
    assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)
    assert isinstance(iris_dir_h2o, h2o.H2OFrame), \
        "Wrong type. Expected H2OFrame, but got {0}".format(type(iris_dir_h2o))
    print("Import worked")

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_airline_05_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/air05_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            hdfs_csv_file = "/datasets/orc_parser/air05_csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'])
            endcsv = time.time()
            csv_type_dict = multi_file_csv.types
            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
            col_ind_name = dict()
            # change column types from real to enum according to multi_file_csv column types
            for key_name in list(csv_type_dict):
                col_ind = key_name.split('C')
                new_ind = int(str(col_ind[1])) - 1
                col_ind_name[new_ind] = key_name
            col_types = []
            for ind in range(len(col_ind_name)):
                col_types.append(csv_type_dict[col_ind_name[ind]])
            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)
            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
            endorc = time.time()
            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]
            print("************** CSV parse time is {0}".format(endcsv - startcsv))
            print("************** ORC (without column type forcing) parse time is {0}".format(endorc1 - startorc1))
            print("************** ORC (with column type forcing) parse time is {0}".format(endorc - startorc))
            # compare the frame parsed from ORC with forced column types against the CSV frame
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_csv_file = "/datasets/air_csv_part"
        col_types = ['real', 'real', 'real', 'real', 'real', 'real', 'real', 'real', 'enum', 'real', 'enum',
                     'real', 'real', 'enum', 'real', 'real', 'enum', 'enum', 'real', 'enum', 'enum', 'real',
                     'real', 'real', 'enum', 'enum', 'enum', 'enum', 'enum', 'enum', 'enum']
        # import CSV file
        print("Import airlines 116M dataset in original csv format from HDFS")
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
        acs_orig = h2o.import_file(url_csv, na_strings=['\\N'], col_types=col_types)
        print("Data size number of rows: {0}, number of columns: {1}".format(acs_orig.nrow, acs_orig.ncol))
        seeds = [2297378124, 3849570216, 6733652048, 8915337442, 8344418400, 9416580152, 2598632624,
                 4977008454, 8273228579, 8185554539, 3219125000, 2998879373, 7707012513, 5786923379,
                 5029788935, 935945790, 7092607078, 9305834745, 6173975590, 5397294255]
        run_time_ms = []
        iterations = []
        objective = []
        num_runs = 1  # number of times to repeat experiments
        for ind in range(num_runs):
            acs_model = H2OGeneralizedLowRankEstimator(k=10,
                                                       transform='STANDARDIZE',
                                                       loss='Quadratic',
                                                       multi_loss="Categorical",
                                                       model_id="clients_core_glrm",
                                                       regularization_x="L2",
                                                       regularization_y="L1",
                                                       gamma_x=0.2,
                                                       gamma_y=0.5,
                                                       init="SVD",
                                                       max_iterations=200,
                                                       seed=seeds[ind % len(seeds)])
            acs_model.train(x=acs_orig.names, training_frame=acs_orig, seed=seeds[ind % len(seeds)])
            run_time_ms.append(acs_model._model_json['output']['end_time'] -
                               acs_model._model_json['output']['start_time'])
            iterations.append(acs_model._model_json['output']['iterations'])
            objective.append(acs_model._model_json['output']['objective'])
        print("Run time in ms: {0}".format(run_time_ms))
        print("number of iterations: {0}".format(iterations))
        print("objective function value: {0}".format(objective))
        sys.stdout.flush()
    else:
        raise EnvironmentError

def kmeans_mllib():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_cross_file = "/datasets/runit/BigCross.data"
        print("Import BigCross.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow
        err_mllib = np.genfromtxt(pyunit_utils.locate("smalldata/mllib_bench/bigcross_wcsse.csv"),
                                  delimiter=",", skip_header=1)
        ncent = [int(err_mllib[r][0]) for r in range(len(err_mllib))]
        for k in ncent:
            print("Run k-means++ with k = {0} and max_iterations = 10".format(k))
            cross_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o, k=k, init="PlusPlus",
                                  max_iterations=10, standardize=False)
            clust_mllib = np.genfromtxt(pyunit_utils.locate("smalldata/mllib_bench/bigcross_centers_" +
                                                            str(k) + ".csv"), delimiter=",").tolist()
            clust_h2o = cross_km.centers()
            # Sort in ascending order by first dimension for comparison purposes
            clust_mllib.sort(key=lambda x: x[0])
            clust_h2o.sort(key=lambda x: x[0])
            print("\nMLlib Cluster Centers:\n")
            print(clust_mllib)
            print("\nH2O Cluster Centers:\n")
            print(clust_h2o)
            wcsse_mllib = err_mllib[err_mllib[0:4, 0].tolist().index(k)][1]
            wcsse_h2o = old_div(cross_km.tot_withinss(), n)
            print("\nMLlib Average Within-Cluster SSE: {0}\n".format(wcsse_mllib))
            print("H2O Average Within-Cluster SSE: {0}\n".format(wcsse_h2o))
            assert wcsse_h2o == wcsse_mllib, "Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O " \
                                             "got {1}".format(wcsse_mllib, wcsse_h2o)
    else:
        raise EnvironmentError

def hdfs_basic():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
        hdfs_iris_dir = "/datasets/runit/iris_test_train"
        #----------------------------------------------------------------------
        # Single file cases.
        #----------------------------------------------------------------------
        print("Testing single file importHDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
        iris_h2o = h2o.import_file(url)
        iris_h2o.head()
        iris_h2o.tail()
        n = iris_h2o.nrow
        print("rows: {0}".format(n))
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)
        assert isinstance(iris_h2o, h2o.H2OFrame), "Wrong type. Expected H2OFrame, but got {0}".format(type(iris_h2o))
        print("Import worked")
        #----------------------------------------------------------------------
        # Directory file cases.
        #----------------------------------------------------------------------
        print("Testing directory importHDFS")
        urls = ["hdfs://{0}{1}/iris_test.csv".format(hdfs_name_node, hdfs_iris_dir),
                "hdfs://{0}{1}/iris_train.csv".format(hdfs_name_node, hdfs_iris_dir)]
        iris_dir_h2o = h2o.import_file(urls)
        iris_dir_h2o.head()
        iris_dir_h2o.tail()
        n = iris_dir_h2o.nrow
        print("rows: {0}".format(n))
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)
        assert isinstance(iris_dir_h2o, h2o.H2OFrame), \
            "Wrong type. Expected H2OFrame, but got {0}".format(type(iris_dir_h2o))
        print("Import worked")
    else:
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_orc_parser.py"))
            pass
        else:
            allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.columnProjection.orc",
                           "/datasets/orc_parser/orc/bigint_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc",
                           "/datasets/orc_parser/orc/bool_single_col.orc",
                           "/datasets/orc_parser/orc/demo-11-zlib.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                           "/datasets/orc_parser/orc/demo-12-zlib.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                           "/datasets/orc_parser/orc/double_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV11.orc",
                           "/datasets/orc_parser/orc/float_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV12.orc",
                           "/datasets/orc_parser/orc/int_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testPredicatePushdown.orc",
                           "/datasets/orc_parser/orc/nulls-at-end-snappy.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testSnappy.orc",
                           "/datasets/orc_parser/orc/orc_split_elim.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testStripeLevelStats.orc",
                           "/datasets/orc_parser/orc/smallint_single_col.orc",
                           "/datasets/orc_parser/orc/string_single_col.orc",
                           "/datasets/orc_parser/orc/tinyint_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testWithoutIndex.orc"]
            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex])
                tab_test = h2o.import_file(url_orc)
    else:
        raise EnvironmentError

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_timestamp_date_orc.py"))
            pass
        else:
            origTZ = h2o.cluster().timezone
            newZone = 'America/Los_Angeles'
            h2o.cluster().timezone = newZone
            tol_time = 200             # comparing in ms or ns
            tol_numeric = 1e-5         # tolerance for comparing other numeric fields
            numElements2Compare = 100  # choose number of elements per column to compare.  Save test time.
            allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                           "/datasets/orc_parser/orc/orc_split_elim.orc"]
            allCsvFiles = ["/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                           "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                           "/datasets/orc_parser/csv/orc_split_elim.csv"]
            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex])
                url_csv = "hdfs://{0}{1}".format(hdfs_name_node, allCsvFiles[fIndex])
                h2oOrc = h2o.import_file(url_orc)
                h2oCsv = h2o.import_file(url_csv)
                # compare the two frames
                assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                    "H2O frame parsed from orc and csv files are different!"
            h2o.cluster().timezone = origTZ
    else:
        raise EnvironmentError

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_baddata_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(url_orc, warn_phrase="UserWarning:",
                                                warn_string_of_interest="Skipping field:",
                                                in_hdfs=True, number_of_times=1), \
                "Expect warnings from orc parser for file " + url_orc + "!"
            hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(url_orc, warn_phrase="UserWarning:",
                                                warn_string_of_interest="Skipping field:",
                                                in_hdfs=True, number_of_times=1), \
                "Expect warnings from orc parser for file " + url_orc + "!"
            hdfs_orc_file = "/datasets/orc_parser/orc/nulls-at-end-snappy.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(url_orc, warn_phrase="UserWarning:",
                                                warn_string_of_interest="Long.MIN_VALUE:",
                                                in_hdfs=True, number_of_times=1), \
                "Expect warnings from orc parser for file " + url_orc + "!"
    else:
        raise EnvironmentError

def kmeans_mllib():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_cross_file = "/datasets/runit/BigCross.data"
        print("Import BigCross.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow
        err_mllib = np.genfromtxt(pyunit_utils.locate("smalldata/mllib_bench/bigcross_wcsse.csv"),
                                  delimiter=",", skip_header=1)
        ncent = [int(err_mllib[r][0]) for r in range(len(err_mllib))]
        for k in ncent:
            print("Run k-means++ with k = {0} and max_iterations = 10".format(k))
            cross_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o, k=k, init="PlusPlus",
                                  max_iterations=10, standardize=False)
            clust_mllib = np.genfromtxt(pyunit_utils.locate("smalldata/mllib_bench/bigcross_centers_" +
                                                            str(k) + ".csv"), delimiter=",").tolist()
            clust_h2o = cross_km.centers()
            # Sort in ascending order by first dimension for comparison purposes
            clust_mllib.sort(key=lambda x: x[0])
            clust_h2o.sort(key=lambda x: x[0])
            print("\nMLlib Cluster Centers:\n")
            print(clust_mllib)
            print("\nH2O Cluster Centers:\n")
            print(clust_h2o)
            wcsse_mllib = err_mllib[err_mllib[0:4, 0].tolist().index(k)][1]
            wcsse_h2o = cross_km.tot_withinss() / n
            print("\nMLlib Average Within-Cluster SSE: {0}\n".format(wcsse_mllib))
            print("H2O Average Within-Cluster SSE: {0}\n".format(wcsse_h2o))
            assert wcsse_h2o == wcsse_mllib, ("Expected mllib and h2o to get the same wcsse. Mllib got {0}, and H2O "
                                              "got {1}".format(wcsse_mllib, wcsse_h2o))
    else:
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_orc_parser.py"))
            pass
        else:
            allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.columnProjection.orc",
                           "/datasets/orc_parser/orc/bigint_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc",
                           "/datasets/orc_parser/orc/bool_single_col.orc",
                           "/datasets/orc_parser/orc/demo-11-zlib.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                           "/datasets/orc_parser/orc/demo-12-zlib.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                           "/datasets/orc_parser/orc/double_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV11.orc",
                           "/datasets/orc_parser/orc/float_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testMemoryManagementV12.orc",
                           "/datasets/orc_parser/orc/int_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testPredicatePushdown.orc",
                           "/datasets/orc_parser/orc/nulls-at-end-snappy.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testSnappy.orc",
                           "/datasets/orc_parser/orc/orc_split_elim.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testStripeLevelStats.orc",
                           "/datasets/orc_parser/orc/smallint_single_col.orc",
                           "/datasets/orc_parser/orc/string_single_col.orc",
                           "/datasets/orc_parser/orc/tinyint_single_col.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testWithoutIndex.orc"]
            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex])
                tab_test = h2o.import_file(url_orc)
    else:
        raise EnvironmentError

def pubdev_1421():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_airlines_test_file = "/datasets/airlines.test.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_airlines_test_file)
        air_test = h2o.import_file(url)
    else:
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")

def hdfs_kmeans_converge():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_cross_file = "/datasets/runit/BigCross.data"
        print("Import BigCross.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow
        print("rows: {0}".format(n))
        ncent = 3
        miters = 10
        print("Run k-means with k = {0} and max_iterations = {1}".format(ncent, miters))
        cross1_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57], k=ncent, max_iterations=miters)
        print(cross1_km)
        print("Run k-means with init = final cluster centers and max_iterations = 1")
        init_centers = h2o.H2OFrame(cross1_km.centers())
        cross2_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57], k=ncent,
                               user_points=init_centers, max_iterations=1)
        print(cross2_km)
        print("Check k-means converged or maximum iterations reached")
        c1 = h2o.H2OFrame(cross1_km.centers())
        c2 = h2o.H2OFrame(cross2_km.centers())
        avg_change = old_div(((c1 - c2) ** 2).sum(), ncent)
        iters = cross1_km._model_json['output']['model_summary'].cell_values[0][3]
        assert avg_change < 1e-6 or iters > miters, "Expected k-means to converge or reach max iterations. avg_change = " \
                                                    "{0} and iterations = {1}".format(avg_change, iters)
    else:
        raise EnvironmentError

def javapredict_gbm_xlarge():
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_file_name = "/datasets/z_repro.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file_name)
    params = {'ntrees': 22, 'max_depth': 37, 'min_rows': 1, 'sample_rate': 0.1}  # 651MB pojo
    print("Parameter list:")
    for k, v in zip(list(params.keys()), list(params.values())):
        print("{0}, {1}".format(k, v))
    train = h2o.import_file(url)
    test = train[list(range(0, 10)), :]
    x = list(range(1, train.ncol))
    y = 0
    pyunit_utils.javapredict("gbm", "numeric", train, test, x, y, **params)

def javapredict_drf_xlarge():
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_file_name = "/datasets/z_repro.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file_name)
    params = {'ntrees': 20, 'max_depth': 35, 'min_rows': 1}  # 739MB pojo
    print("Parameter list:")
    for k, v in zip(list(params.keys()), list(params.values())):
        print("{0}, {1}".format(k, v))
    train = h2o.import_file(url)
    test = train[list(range(0, 10)), :]
    x = list(range(1, train.ncol))
    y = 0
    pyunit_utils.javapredict("random_forest", "numeric", train, test, x, y, **params)

def javapredict_dl_xlarge():
    hdfs_name_node = pyunit_utils.hadoop_namenode()
    hdfs_file_name = "/datasets/z_repro.csv"
    url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file_name)
    params = {'hidden': [3500, 3500], 'epochs': 0.0001}  # 436MB pojo
    print("Parameter list:")
    for k, v in zip(list(params.keys()), list(params.values())):
        print("{0}, {1}".format(k, v))
    train = h2o.import_file(url)
    test = train[list(range(0, 10)), :]
    x = list(range(1, train.ncol))
    y = 0
    pyunit_utils.javapredict("deeplearning", "numeric", train, test, x, y, **params)

def pubdev_1431():
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        airlines_billion_file = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file)
        airlines_billion = h2o.import_file(url)
        airlines_billion[30] = airlines_billion[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion[0:30], y=airlines_billion[30], ntrees=1,
                      distribution="bernoulli", max_depth=1)
        predictions = gbm.predict(airlines_billion)
        csv = os.path.join(os.getcwd(), "delete.csv")
        h2o.download_csv(predictions, csv)
        os.remove(csv)
    else:
        raise EnvironmentError("Not running on H2O internal network. No access to HDFS.")

def pubdev_1431():
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        airlines_billion_file = "/datasets/airlinesbillion.csv"
        url = "hdfs://{0}{1}".format(hdfs_name_node, airlines_billion_file)
        airlines_billion = h2o.import_file(url)
        airlines_billion[30] = airlines_billion[30].asfactor()
        gbm = h2o.gbm(x=airlines_billion[0:30], y=airlines_billion[30], ntrees=1,
                      distribution="bernoulli", max_depth=1)
        predictions = gbm.predict(airlines_billion)
        csv = os.path.join(os.getcwd(), "delete.csv")
        h2o.download_csv(predictions, csv)
        os.remove(csv)
    else:
        raise EnvironmentError

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
            pass
        else:
            tol_time = 200           # comparing in ms or ns
            tol_numeric = 1e-5       # tolerance for comparing other numeric fields
            numElements2Compare = 0  # choose number of elements per column to compare.  Save test time.
            hdfs_csv_file1 = "/datasets/orc_parser/csv/balunbal.csv"
            url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file1)
            multi_file_csv1 = h2o.import_file(url_csv1)
            hdfs_csv_file2 = "/datasets/orc_parser/csv/unbalbal.csv"
            url_csv2 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file2)
            multi_file_csv2 = h2o.import_file(url_csv2)
            hdfs_orc_file = "/datasets/orc_parser/synthetic_perfect_separation_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            multi_file_orc = h2o.import_file(url_orc)
            # make sure orc multi-file and single big file create same H2O frame
            try:
                assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv1, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from multiple orc and single orc files are different!"
            except:
                assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv2, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from multiple orc and single orc files are different!"
    else:
        raise EnvironmentError

def hdfs_basic():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
        hdfs_iris_dir = "/datasets/runit/iris_test_train"
        #----------------------------------------------------------------------
        # Single file cases.
        #----------------------------------------------------------------------
        print("Testing single file importHDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
        iris_h2o = h2o.import_file(url)
        iris_h2o.head()
        iris_h2o.tail()
        n = iris_h2o.nrow
        print("rows: {0}".format(n))
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)
        assert isinstance(iris_h2o, h2o.H2OFrame), "Wrong type. Expected H2OFrame, but got {0}".format(type(iris_h2o))
        print("Import worked")
        #----------------------------------------------------------------------
        # Directory file cases.
        #----------------------------------------------------------------------
        print("Testing directory importHDFS")
        urls = ["hdfs://{0}{1}/iris_test.csv".format(hdfs_name_node, hdfs_iris_dir),
                "hdfs://{0}{1}/iris_train.csv".format(hdfs_name_node, hdfs_iris_dir)]
        iris_dir_h2o = h2o.import_file(urls)
        iris_dir_h2o.head()
        iris_dir_h2o.tail()
        n = iris_dir_h2o.nrow
        print("rows: {0}".format(n))
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)
        assert isinstance(iris_dir_h2o, h2o.H2OFrame), \
            "Wrong type. Expected H2OFrame, but got {0}".format(type(iris_dir_h2o))
        print("Import worked")
    else:
        raise EnvironmentError

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
            pass
        else:
            mix_folder = "/datasets/orc_csv_same_milsongs"
            url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, mix_folder)
            multi_file_mixed = h2o.import_file(url_csv1)
    else:
        raise EnvironmentError

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
            pass
        else:
            tol_time = 200           # comparing in ms or ns
            tol_numeric = 1e-5       # tolerance for comparing other numeric fields
            numElements2Compare = 0  # choose number of elements per column to compare.  Save test time.
            hdfs_csv_file1 = "/datasets/orc_parser/csv/balunbal.csv"
            url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file1)
            multi_file_csv1 = h2o.import_file(url_csv1)
            hdfs_csv_file2 = "/datasets/orc_parser/csv/unbalbal.csv"
            url_csv2 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file2)
            multi_file_csv2 = h2o.import_file(url_csv2)
            hdfs_orc_file = "/datasets/orc_parser/synthetic_perfect_separation_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            multi_file_orc = h2o.import_file(url_orc)
            # make sure orc multi-file and single big file create same H2O frame
            try:
                assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv1, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from multiple orc and single orc files are different!"
            except:
                assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv2, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from multiple orc and single orc files are different!"
    else:
        raise EnvironmentError

def hdfs_pubdev_3359_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_csv_file = "/datasets/PUBDEV-3359"
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
        h2oframe_csv = h2o.import_file(url_csv)
        # check that the parsed frame has the expected number of rows
        assert h2oframe_csv.nrow == 99998000, \
            "Data should contain 99998000 rows but we parsed: {0} rows!".format(h2oframe_csv.nrow)
    else:
        raise EnvironmentError

def hdfs_import_bigCat():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        numTimes = 10
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        allFiles = ["/datasets/bigCatFiles/tenThousandCat10C.csv",
                    "/datasets/bigCatFiles/hundredThousandCat10C.csv",
                    "/datasets/bigCatFiles/oneMillionCat10C.csv",
                    "/datasets/bigCatFiles/tenThousandCat50C.csv",
                    "/datasets/bigCatFiles/hundredThousandCat50C.csv",
                    "/datasets/bigCatFiles/tenThousandCat100C.csv",
                    "/datasets/bigCatFiles/hundredThousandCat100C.csv",
                    "/datasets/bigCatFiles/oneMillionCat50C.csv"]
        reps = [10, 10, 10, 50, 50, 100, 100, 50]
        for ind in range(0, len(allFiles)):  # run tests for 3 different sizes per Tomas request
            print("******* Parsing file {0} ********".format(allFiles[ind]))
            runPerformance("hdfs://{0}{1}".format(hdfs_name_node, allFiles[ind]), numTimes, reps[ind])
    else:
        raise EnvironmentError

def hdfs_kmeans_converge():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_cross_file = "/datasets/runit/BigCross.data"
        print("Import BigCross.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_cross_file)
        cross_h2o = h2o.import_file(url)
        n = cross_h2o.nrow
        print("rows: {0}".format(n))
        ncent = 3
        miters = 10
        print("Run k-means with k = {0} and max_iterations = {1}".format(ncent, miters))
        cross1_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57], k=ncent, max_iterations=miters)
        print(cross1_km)
        print("Run k-means with init = final cluster centers and max_iterations = 1")
        init_centers = h2o.H2OFrame(cross1_km.centers())
        cross2_km = h2o.kmeans(training_frame=cross_h2o, x=cross_h2o[0:57], k=ncent,
                               user_points=init_centers, max_iterations=1)
        print(cross2_km)
        print("Check k-means converged or maximum iterations reached")
        c1 = h2o.H2OFrame(cross1_km.centers())
        c2 = h2o.H2OFrame(cross2_km.centers())
        avg_change = old_div(((c1 - c2) ** 2).sum(), ncent)
        iters = cross1_km._model_json['output']['model_summary'].cell_values[0][3]
        assert avg_change < 1e-6 or iters > miters, "Expected k-means to converge or reach max iterations. avg_change = " \
                                                    "{0} and iterations = {1}".format(avg_change, iters)
    else:
        raise EnvironmentError

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_timestamp_date_orc.py"))
            pass
        else:
            tol_time = 200             # comparing in ms or ns
            tol_numeric = 1e-5         # tolerance for comparing other numeric fields
            numElements2Compare = 100  # choose number of elements per column to compare.  Save test time.
            allOrcFiles = ["/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                           "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                           "/datasets/orc_parser/orc/orc_split_elim.orc"]
            allCsvFiles = ["/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                           "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                           "/datasets/orc_parser/csv/orc_split_elim.csv"]
            for fIndex in range(len(allOrcFiles)):
                url_orc = "hdfs://{0}{1}".format(hdfs_name_node, allOrcFiles[fIndex])
                url_csv = "hdfs://{0}{1}".format(hdfs_name_node, allCsvFiles[fIndex])
                h2oOrc = h2o.import_file(url_orc)
                h2oCsv = h2o.import_file(url_csv)
                # compare the two frames
                assert pyunit_utils.compare_frames(h2oOrc, h2oCsv, numElements2Compare, tol_time, tol_numeric), \
                    "H2O frame parsed from orc and csv files are different!"
    else:
        raise EnvironmentError

def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_baddata_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.testStringAndBinaryStatistics.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(url_orc, warn_phrase="UserWarning:",
                                                warn_string_of_interest="Skipping field:",
                                                in_hdfs=True, number_of_times=1), \
                "Expect warnings from orc parser for file " + url_orc + "!"

            hdfs_orc_file = "/datasets/orc_parser/orc/TestOrcFile.emptyFile.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(url_orc, warn_phrase="UserWarning:",
                                                warn_string_of_interest="Skipping field:",
                                                in_hdfs=True, number_of_times=1), \
                "Expect warnings from orc parser for file " + url_orc + "!"

            hdfs_orc_file = "/datasets/orc_parser/orc/nulls-at-end-snappy.orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            print("Parsing the orc file {0}".format(url_orc))
            assert pyunit_utils.expect_warnings(url_orc, warn_phrase="UserWarning:",
                                                warn_string_of_interest="Long.MIN_VALUE:",
                                                in_hdfs=True, number_of_times=1), \
                "Expect warnings from orc parser for file " + url_orc + "!"
    else:
        raise EnvironmentError
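# pyunit_utils.expect_warnings belongs to the test harness. One plausible way to
# check that a parse emits a UserWarning containing a given phrase is sketched
# below with the standard warnings module; this is an illustration only, not the
# harness implementation.
import warnings

import h2o


def parse_emits_warning(url, phrase):
    """Return True if importing url raises at least one UserWarning mentioning phrase."""
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        h2o.import_file(url)
    return any(phrase in str(w.message) for w in caught
               if issubclass(w.category, UserWarning))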
def hdfs_kmeans_airlines():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_file = "/datasets/airlines_all.csv"

        print("Import airlines_all.csv from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_file)
        airlines_h2o = h2o.import_file(url)
        n = airlines_h2o.nrow
        print("rows: {0}".format(n))

        print("Run k-means++ with k = 7 and max_iterations = 10")
        myX = list(range(8)) + list(range(11, 16)) + list(range(18, 21)) + list(range(24, 29)) + [9]
        airlines_km = h2o.kmeans(training_frame=airlines_h2o, x=airlines_h2o[myX], k=7,
                                 init="Furthest", max_iterations=10, standardize=True)
        print(airlines_km)
    else:
        raise EnvironmentError
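# The h2o.kmeans wrapper used above also has an estimator-style equivalent in the
# h2o Python package. A sketch of the same run with H2OKMeansEstimator is shown
# below; the column indices are reused from the test and the frame is assumed to
# have been imported already.
from h2o.estimators.kmeans import H2OKMeansEstimator


def kmeans_airlines_estimator_sketch(airlines_h2o):
    myX = list(range(8)) + list(range(11, 16)) + list(range(18, 21)) + list(range(24, 29)) + [9]
    airlines_km = H2OKMeansEstimator(k=7, init="Furthest", max_iterations=10, standardize=True)
    airlines_km.train(x=myX, training_frame=airlines_h2o)
    return airlines_km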
def hdfs_kmeans():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()
        hdfs_iris_file = "/datasets/runit/iris_wheader.csv"
        hdfs_covtype_file = "/datasets/runit/covtype.data"

        print("Import iris_wheader.csv from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_iris_file)
        iris_h2o = h2o.import_file(url)
        n = iris_h2o.nrow
        print("rows: {0}".format(n))
        assert n == 150, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 150)

        print("Running KMeans on iris")
        iris_km = h2o.kmeans(training_frame=iris_h2o, k=3, x=iris_h2o[0:4], max_iterations=10)
        print(iris_km)

        print("Importing covtype.data from HDFS")
        url = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_covtype_file)
        covtype_h2o = h2o.import_file(url)
        n = covtype_h2o.nrow
        print("rows: {0}".format(n))
        assert n == 581012, "Wrong number of rows. Got {0}. Should have got {1}".format(n, 581012)

        print("Running KMeans on covtype")
        covtype_km = h2o.kmeans(training_frame=covtype_h2o, x=covtype_h2o[0:55], k=8, max_iterations=10)
        print(covtype_km)
    else:
        raise EnvironmentError
def import_folder_orc():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/orc_parser/prostate_NA.orc"
            hdfs_csv_file = "/datasets/orc_parser/prostate_NA.csv"
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)

            csv = h2o.import_file(url_csv, na_strings=['\\N'])
            multi_file_orc1 = h2o.import_file(url_orc)
            pyunit_utils.compare_frames_local(csv, multi_file_orc1, prob=1)  # should be the same here.

            path = url_orc
            skip_all = list(range(csv.ncol))
            skip_even = list(range(0, csv.ncol, 2))
            skip_odd = list(range(1, csv.ncol, 2))
            skip_start_end = [0, csv.ncol - 1]
            skip_except_last = list(range(0, csv.ncol - 2))
            skip_except_first = list(range(1, csv.ncol))
            temp = list(range(0, csv.ncol))
            random.shuffle(temp)
            skip_random = []
            for index in range(0, csv.ncol // 2):  # integer division so range() receives an int
                skip_random.append(temp[index])
            skip_random.sort()

            # skipping every column must fail for both upload_file and import_file;
            # catch Exception (not a bare except) so the sys.exit(1) failure propagates
            try:
                loadFileSkipAll = h2o.upload_file(path, skipped_columns=skip_all)
                sys.exit(1)  # should have failed here
            except Exception:
                pass

            try:
                importFileSkipAll = h2o.import_file(path, skipped_columns=skip_all)
                sys.exit(1)  # should have failed here
            except Exception:
                pass

            # skip even columns
            pyunit_utils.checkCorrectSkips(csv, path, skip_even)

            # skip odd columns
            pyunit_utils.checkCorrectSkips(csv, path, skip_odd)

            # skip the very beginning and the very end.
            pyunit_utils.checkCorrectSkips(csv, path, skip_start_end)

            # skip all except the last column
            pyunit_utils.checkCorrectSkips(csv, path, skip_except_last)

            # skip all except the very first column
            pyunit_utils.checkCorrectSkips(csv, path, skip_except_first)

            # randomly skipped half the columns
            pyunit_utils.checkCorrectSkips(csv, path, skip_random)
    else:
        raise EnvironmentError
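# pyunit_utils.checkCorrectSkips is part of the test harness. The sketch below only
# illustrates the idea: re-import the file with skipped_columns and verify that the
# surviving columns line up with the corresponding columns of the full CSV frame.
# The helper name and the shape-only check are assumptions, not the harness code.
import h2o


def check_correct_skips_sketch(full_csv_frame, path, skipped_columns):
    skipped_frame = h2o.import_file(path, skipped_columns=skipped_columns)
    kept = [c for c in range(full_csv_frame.ncol) if c not in skipped_columns]
    assert skipped_frame.ncol == len(kept), "Wrong number of columns after skipping!"
    expected = full_csv_frame[:, kept]
    df_expected = expected.as_data_frame(use_pandas=True)
    df_actual = skipped_frame.as_data_frame(use_pandas=True)
    assert df_expected.shape == df_actual.shape, "Frame shapes differ after skipping!"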
def hdfs_orc_parser():
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        numElements2Compare = 10
        tol_time = 200
        tol_numeric = 1e-5
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old. Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_airlines_orc.py"))
            pass
        else:
            hdfs_orc_file = "/datasets/airlines_all_orc_parts"
            hdfs_csv_file = "/datasets/air_csv_part"

            col_types = ['real', 'real', 'real', 'real', 'real', 'real', 'real', 'real', 'enum', 'real',
                         'enum', 'real', 'real', 'enum', 'real', 'real', 'enum', 'enum', 'real', 'enum',
                         'enum', 'real', 'real', 'real', 'enum', 'enum', 'enum', 'enum', 'enum', 'enum',
                         'enum']

            # import CSV file, with and without forced column types
            print("Import airlines 116M dataset in original csv format from HDFS")
            url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)
            startcsv = time.time()
            multi_file_csv = h2o.import_file(url_csv, na_strings=['\\N'], col_types=col_types)
            endcsv = time.time()

            startcsv1 = time.time()
            multi_file_csv1 = h2o.import_file(url_csv)
            endcsv1 = time.time()
            h2o.remove(multi_file_csv1)

            multi_file_csv.summary()
            csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]

            # import ORC file with same column types as CSV file
            print("Import airlines 116M dataset in ORC format from HDFS")
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            startorc1 = time.time()
            multi_file_orc1 = h2o.import_file(url_orc)
            endorc1 = time.time()
            h2o.remove(multi_file_orc1)

            startorc = time.time()
            multi_file_orc = h2o.import_file(url_orc, col_types=col_types)
            endorc = time.time()

            multi_file_orc.summary()
            orc_summary = h2o.frame(multi_file_orc.frame_id)["frames"][0]["columns"]

            print("************** CSV (without column type forcing) parse time is {0}".format(endcsv1 - startcsv1))
            print("************** CSV (with column type forcing) parse time is {0}".format(endcsv - startcsv))
            print("************** ORC (without column type forcing) parse time is {0}".format(endorc1 - startorc1))
            print("************** ORC (with column type forcing) parse time is {0}".format(endorc - startorc))

            # compare column summaries of the frames parsed from ORC and CSV with forced column types
            pyunit_utils.compare_frame_summary(csv_summary, orc_summary)
    else:
        raise EnvironmentError
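# The parse-time measurements above repeat the same start/stop pattern. A small
# reusable helper like the one sketched below (hypothetical, not part of the test
# suite) keeps the timing and logging in one place.
import time

import h2o


def timed_import(url, **kwargs):
    """Import a file, print the elapsed parse time, and return (frame, seconds)."""
    start = time.time()
    frame = h2o.import_file(url, **kwargs)
    elapsed = time.time() - start
    print("************** Parsed {0} in {1:.2f} seconds".format(url, elapsed))
    return frame, elapsed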