def test_deprecated_noise_level_param_is_alias_for_noise():
    """Check that the deprecated ``noise_level`` argument is an alias for ``noise``.

    Constructing the estimator with ``noise_level=0`` must emit exactly one
    ``H2ODeprecationWarning``. The resulting encodings are expected to match
    those of an estimator built with ``noise=0`` and to differ from those of
    an estimator built with default parameters.
    """
    data = load_dataset(incl_test=True)
    sentinel = "should have raised"

    def fit_and_encode(estimator):
        # Train on the training split, then encode the test split.
        estimator.train(y=data.target, training_frame=data.train)
        return estimator.predict(data.test)

    default_encoded = fit_and_encode(H2OTargetEncoderEstimator())

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        legacy_encoder = H2OTargetEncoderEstimator(noise_level=0)
        assert len(caught) == 1
        only_warning = caught[0]
        assert issubclass(only_warning.category, H2ODeprecationWarning)
        expected_text = "``noise_level`` param of ``{}`` is deprecated".format(
            te_init_name)
        assert expected_text in str(only_warning.message)

    legacy_encoded = fit_and_encode(legacy_encoder)
    noise_encoded = fit_and_encode(H2OTargetEncoderEstimator(noise=0))

    # The comparison against the default estimator must fail on its own; the
    # sentinel assertion only fires if compare_frames did not raise.
    try:
        pu.compare_frames(legacy_encoded, default_encoded, 0, tol_numeric=1e-5)
        assert False, sentinel
    except AssertionError as err:
        assert sentinel not in str(err)

    assert pu.compare_frames(legacy_encoded, noise_encoded, 0, tol_numeric=1e-5)
def test_transform_can_override_blending_parameters():
    """Check that blending arguments given to ``transform`` change the output.

    The same trained estimator is applied three times to the test split:
    without extra arguments, with ``blending=True``, and with
    ``blending=True`` plus custom ``inflection_point``/``smoothing``.
    Each consecutive pair of results is expected to differ.
    """
    data = load_dataset(incl_test=True)
    encoder = H2OTargetEncoderEstimator(noise=0)
    encoder.train(y=data.target, training_frame=data.train)
    sentinel = "should have raised"

    plain = encoder.transform(data.test)
    blended = encoder.transform(data.test, blending=True)
    try:
        assert pu.compare_frames(plain, blended, 0, tol_numeric=1e-5)
        assert False, sentinel
    except AssertionError as err:
        # Only an AssertionError that is not our own sentinel is acceptable.
        assert sentinel not in str(err)

    blended_custom = encoder.transform(data.test,
                                       blending=True,
                                       inflection_point=3,
                                       smoothing=17)
    try:
        assert pu.compare_frames(blended_custom, blended, 0, tol_numeric=1e-5)
        assert False, sentinel
    except AssertionError as err:
        assert sentinel not in str(err)
def gbm_on_hive():
    """Import the airlines table from Hive over JDBC and train a GBM on it.

    The Hive import is first compared against the same dataset loaded from
    S3, then selected columns are converted to factors, the frame is split
    into train/valid/test parts and a GBM is trained and used for prediction.
    """
    from h2o.estimators.gbm import H2OGradientBoostingEstimator

    jdbc_url = "jdbc:hive2://localhost:10000/default"
    if os.getenv('KRB_ENABLED', 'false').lower() == 'true':
        # Kerberos is enabled: the Hive principal goes into the JDBC URL.
        jdbc_url += ";principal=%s" % os.getenv('HIVE_PRINCIPAL', 'hive/[email protected]')

    query = "select * from airlinestest"
    user = "******"
    secret = ""

    reference = h2o.import_file(path="https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/AirlinesTest.csv.zip")
    airlines = h2o.import_sql_select(jdbc_url, query, user, secret)
    pyunit_utils.compare_frames(reference, airlines, 100, tol_numeric=0)

    # Columns converted to categorical, in the same order as before.
    factor_columns = (
        "table_for_h2o_import.origin",
        "table_for_h2o_import.fdayofweek",
        "table_for_h2o_import.uniquecarrier",
        "table_for_h2o_import.dest",
        "table_for_h2o_import.fyear",
        "table_for_h2o_import.fdayofmonth",
        "table_for_h2o_import.isdepdelayed",
        "table_for_h2o_import.fmonth",
    )
    for column in factor_columns:
        airlines[column] = airlines[column].asfactor()

    predictors = airlines.col_names[:-2]
    response = airlines.col_names[-2]
    train, valid, test = airlines.split_frame([0.6, 0.2], seed=1234)

    model = H2OGradientBoostingEstimator(model_id="gbm_airlines_v1", seed=2000000)
    model.train(predictors, response, training_frame=train, validation_frame=valid)
    model.predict(test)
def h2o_H2OFrame_rep_len():
    """
    Python API test: h2o.frame.H2OFrame.rep_len(length_out)

    Builds a random integer frame, extends its first column with rep_len and
    checks the type, the shape and that the appended rows repeat the leading
    rows of the original column.
    """
    n_rows = randrange(1, 10)
    n_cols = randrange(1, 10)
    n_extra = math.ceil(0.78 * n_rows)
    n_total = n_extra + n_rows

    values = np.random.randint(-5, 5, (n_rows, n_cols))
    frame = h2o.H2OFrame(python_obj=values)

    # one column, with rows duplicated up to the requested length
    repeated = frame[0].rep_len(length_out=n_total)
    assert_is_type(repeated, H2OFrame)
    assert repeated.shape == (n_total, 1), "h2o.H2OFrame.rep_len() command is not working."

    # the rows past the original length must equal the first n_extra rows
    original_head = frame[0:n_extra, 0]
    repeated_tail = repeated[n_rows:n_rows + n_extra, 0]
    pyunit_utils.compare_frames(original_head,
                                repeated_tail,
                                n_extra,
                                tol_time=0,
                                tol_numeric=1e-6,
                                strict=False,
                                compare_NA=True)
def h2o_H2OFrame_top_bottomN():
    """
    PUBDEV-3624 Top or Bottom N test for h2o.frame.H2OFrame.topN() and
    h2o.frame.H2OFrame.bottomN().

    Given an H2O frame, a column index or column name, and a double denoting
    the percentage of top/bottom rows to return, topN (bottomN) returns an
    H2OFrame containing two columns: one holds the topN (bottomN) values of
    the specified column, the other records the row indices into the original
    frame where those values come from. This lets users grab the
    corresponding rows to do whatever they want with them.

    The test randomly picks a percentage, a column and one of the two
    functions, checks that addressing the column by name and by index gives
    the same result, and compares the result against a stored answer frame.
    """
    dataFrame = h2o.import_file(
        pyunit_utils.locate("bigdata/laptop/jira/TopBottomNRep4.csv.zip"))
    topAnswer = h2o.import_file(
        pyunit_utils.locate("smalldata/jira/Top20Per.csv.zip"))
    bottomAnswer = h2o.import_file(
        pyunit_utils.locate("smalldata/jira/Bottom20Per.csv.zip"))

    nPercentages = [1, 2, 3, 4]  # multiples of 4 since dataset is repeated 4 times.
    frameNames = dataFrame.names  # get data column names
    tolerance = 1e-12
    nsample = 100  # number of rows sampled when comparing the two calls

    nP = nPercentages[randint(0, len(nPercentages) - 1)]  # pick a random percentage
    colIndex = randint(0, len(frameNames) - 1)  # pick a random column

    # NOTE(review): if randint is random.randint (inclusive bounds), topN is
    # exercised only about one run in three - confirm this is intended.
    if randint(0, 2) == 0:
        print("For topN: Percentage chosen is {0}. Column index chosen is {1}".format(nP, colIndex))
        newTopFrame = dataFrame.topN(frameNames[colIndex], nP)  # call topN with column name
        newTopFrameC = dataFrame.topN(colIndex, nP)  # call topN with same column index

        # the two returned frames should be the same, compare nsample rows
        pyunit_utils.compare_frames(newTopFrame, newTopFrameC, nsample, tol_numeric=tolerance)

        # compare one of the returned frames with the known answer
        compare_rep_frames(topAnswer, newTopFrame, tolerance, colIndex, 0)
    else:  # test bottomN here
        print("For bottomN: Percentage chosen is {0}. Column index chosen is {1}".format(nP, colIndex))
        newBottomFrame = dataFrame.bottomN(frameNames[colIndex], nP)  # call bottomN with column name
        newBottomFrameC = dataFrame.bottomN(colIndex, nP)  # call bottomN with same column index

        # the two returned frames should be the same, compare nsample rows
        pyunit_utils.compare_frames(newBottomFrame, newBottomFrameC, nsample, tol_numeric=tolerance)

        # compare one of the returned frames with the known answer
        compare_rep_frames(bottomAnswer, newBottomFrame, tolerance, colIndex, 1)
def test_deprecated_k_param_is_alias_for_inflection_point():
    """Check that the deprecated ``k`` argument is an alias for ``inflection_point``.

    Constructing the estimator with ``k=5`` must emit exactly one
    ``H2ODeprecationWarning``. The resulting encodings are expected to match
    those of an estimator built with ``inflection_point=5`` and to differ
    from those of an estimator built with ``noise=0`` only.
    """
    data = load_dataset(incl_test=True)
    sentinel = "should have raised"

    def fit_and_encode(estimator):
        # Train on the training split, then encode the test split.
        estimator.train(y=data.target, training_frame=data.train)
        return estimator.predict(data.test)

    baseline_encoded = fit_and_encode(H2OTargetEncoderEstimator(noise=0))

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        legacy_encoder = H2OTargetEncoderEstimator(noise=0, k=5, blending=True)
        assert len(caught) == 1
        only_warning = caught[0]
        assert issubclass(only_warning.category, H2ODeprecationWarning)
        expected_text = "``k`` param of ``{}`` is deprecated".format(te_init_name)
        assert expected_text in str(only_warning.message)

    legacy_encoded = fit_and_encode(legacy_encoder)
    inflection_encoded = fit_and_encode(
        H2OTargetEncoderEstimator(noise=0, inflection_point=5, blending=True))

    # The comparison against the baseline must fail on its own; the sentinel
    # assertion only fires if compare_frames did not raise.
    try:
        pu.compare_frames(legacy_encoded, baseline_encoded, 0, tol_numeric=1e-5)
        assert False, sentinel
    except AssertionError as err:
        assert sentinel not in str(err)

    assert pu.compare_frames(legacy_encoded, inflection_encoded, 0, tol_numeric=1e-5)
def test_target_encoding_transform_none_blending():
    """Check that the ``none`` holdout strategy gives different encodings with and without blending.

    Two ``TargetEncoder`` instances that differ only in ``blended_avg`` are
    fitted on the titanic data and used to transform it. The encoded columns
    of both results are compared and the comparison is expected to fail.

    Raises:
        AssertionError: if the two sets of encodings compare as equal.
    """
    print("Check none strategy with and without blending")
    targetColumnName = "survived"

    teColumns = ["home.dest", "cabin", "embarked"]
    teColumnsEncoded = [name + "_te" for name in teColumns]
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)
    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()

    targetEncoderWithBlending = TargetEncoder(x=teColumns, y=targetColumnName,
                                              blended_avg=True, inflection_point=3, smoothing=1)
    targetEncoderWithBlending.fit(frame=trainingFrame)
    encodedFrameWithBlending = targetEncoderWithBlending.transform(frame=trainingFrame,
                                                                   holdout_type="none", seed=1234)
    frameWithBlendedEncodingsOnly = encodedFrameWithBlending[teColumnsEncoded]

    targetEncoderWithoutBlending = TargetEncoder(x=teColumns, y=targetColumnName,
                                                 blended_avg=False, inflection_point=3, smoothing=1)
    targetEncoderWithoutBlending.fit(frame=trainingFrame)
    encodedFrameWithoutBlending = targetEncoderWithoutBlending.transform(frame=trainingFrame,
                                                                         holdout_type="none", seed=1234)
    encodedFrameWithoutBlendingOnly = encodedFrameWithoutBlending[teColumnsEncoded]

    # The failure check lives in the ``else`` clause: an ``assert False``
    # inside the ``try`` would be swallowed by the handler below and the test
    # could never fail.
    try:
        pyunit_utils.compare_frames(frameWithBlendedEncodingsOnly, encodedFrameWithoutBlendingOnly,
                                    10, tol_time=0, tol_numeric=1e-6)
    except AssertionError:
        print('Good, encodings are different as expected. Hopefully because of the blending.')
    else:
        assert False, "Encodings with and without blending should be different but compared as equal."
def gbm_on_hive():
    """Import the airlines table from Hive in SINGLE fetch mode and train a GBM on it.

    The streamed Hive import is adapted with ``adapt_airlines`` and compared
    against the same dataset loaded from S3. A GBM is then trained with the
    same frame used for training and validation, and both AUC values are
    required to be close.
    """
    from h2o.estimators.gbm import H2OGradientBoostingEstimator

    kerberos_on = os.getenv('KRB_ENABLED', 'false').lower() == 'true'
    token_auth = os.getenv('KRB_USE_TOKEN', 'false').lower() == 'true'

    jdbc_url = "jdbc:hive2://localhost:10000/default"
    if kerberos_on and token_auth:
        jdbc_url += ";auth=delegationToken"
    elif kerberos_on:
        jdbc_url += ";principal=%s" % os.getenv('HIVE_PRINCIPAL', 'hive/[email protected]')

    query = "select * from airlinestest"
    user = "******"
    secret = ""

    # read from S3
    reference = h2o.import_file(path="https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/AirlinesTest.csv.zip")

    # read from Hive Streaming
    streamed = h2o.import_sql_select(jdbc_url, query, user, secret, fetch_mode="SINGLE")
    streamed = adapt_airlines(streamed)

    # datasets should be identical from user's point of view
    pyunit_utils.compare_frames(reference, streamed, 100, tol_numeric=0)

    predictors = streamed.col_names[:-2]
    response = streamed.col_names[-2]
    model = H2OGradientBoostingEstimator(model_id="gbm_airlines_v1", seed=2000000)
    model.train(predictors, response, training_frame=streamed, validation_frame=streamed)
    print(model)

    # demonstrates that metrics can be slightly different due to different chunking on the backend
    train_auc = model.auc(train=True)
    valid_auc = model.auc(valid=True)
    assert isclose(train_auc, valid_auc, rtol=1e-4)
def test_transform_can_be_applied_to_training_frame_with_special_flag():
    """Check the effect of ``as_training=True`` when transforming the training frame.

    With default estimator parameters both transform variants are expected to
    produce the same frame; with leave-one-out handling, blending and noise
    they are expected to differ.
    """
    data = load_dataset()
    sentinel = "should have raised"

    default_encoder = H2OTargetEncoderEstimator()
    default_encoder.train(y=data.target, training_frame=data.train)
    as_training = default_encoder.transform(data.train, as_training=True)
    regular = default_encoder.transform(data.train)
    assert pu.compare_frames(regular, as_training, 0, tol_numeric=1e-5)

    # now with non default params
    custom_params = dict(data_leakage_handling="leave_one_out",
                         blending=True,
                         inflection_point=5,
                         smoothing=17,
                         seed=seed,
                         noise=0.01)
    custom_encoder = H2OTargetEncoderEstimator(**custom_params)
    custom_encoder.train(y=data.target, training_frame=data.train)
    as_training = custom_encoder.transform(data.train, as_training=True)
    regular = custom_encoder.transform(data.train)
    try:
        assert pu.compare_frames(regular, as_training, 0, tol_numeric=1e-5)
        assert False, sentinel
    except AssertionError as err:
        # Only an AssertionError that is not our own sentinel is acceptable.
        assert sentinel not in str(err)
def sortOrMerge():
    """PUBDEV-5266: sort/merge frames that contain string columns, but not on those columns.

    One of three scenarios is picked at random: sorting the first frame on
    two integer columns, or merging the two frames with ``all_x`` False or
    True. The result is compared with a stored answer file.
    """
    base_dir = "bigdata/laptop/jira/PUBDEV_5266_merge_with_string_columns/"
    left_path = base_dir + "PUBDEV_5266_f1.csv"
    right_path = base_dir + "PUBDEV_5266_f2.csv"
    left_names = ["stringf1-1", "stringf1-2", "int1", "intf1-1"]
    right_names = ["stringf2-1", "intf2-1", "iintf2-2", "stringf2-2", "intf2-3",
                   "stringf2-3", "stringf2-4", "int1"]
    answer_paths = [base_dir + "sortedF1_R_C3_C4.csv",
                    base_dir + "mergedf1_f2unique.csv",
                    base_dir + "mergedf1_f2unique_x_T.csv"]
    all_x_flags = [False, False, True]
    all_y_flags = [False, False, False]

    # index 0 runs the sort scenario, indices 1 and 2 run a merge scenario
    scenario = random.randint(0, len(all_x_flags) - 1)
    answer_path = pyunit_utils.locate(answer_paths[scenario])

    if scenario == 0:
        left = h2o.import_file(pyunit_utils.locate(left_path))
        produced = left.sort([2, 3])
    else:
        left = h2o.import_file(pyunit_utils.locate(left_path), header=1)
        left.set_names(left_names)
        right = h2o.import_file(pyunit_utils.locate(right_path), header=1)
        right.set_names(right_names)
        produced = left.merge(right,
                              all_x=all_x_flags[scenario],
                              all_y=all_y_flags[scenario],
                              method='auto')

    # the answer file is parsed with the column types of the produced frame
    expected = h2o.import_file(answer_path, col_types=getTypes(produced), header=1)
    assert pyunit_utils.compare_frames(expected, produced, 100, tol_numeric=0)
def sort():
    """Check that sorting a frame with string columns and comparing it to an int-only frame errors out.

    A frame with two string columns and one integer column is sorted on the
    integer column and compared with a one-column frame holding the sorted
    integers. An exception from any of these steps is the expected outcome.

    Raises:
        AssertionError: if every step completes without raising.
    """
    try:
        df = h2o.H2OFrame({"A": ["another", "set", "of", "bad", "string"],
                           "B": [10, 1, 2, 5, 7],
                           "C": ["what", "is", "this", "thing", "doing"]})
        dfIntSorted = h2o.H2OFrame({"B": [1, 2, 5, 7, 10]})
        dfSortedIntCN = df.sort("B")
        pyunit_utils.compare_frames(dfIntSorted, dfSortedIntCN, df.nrow)
    except Exception:
        # expected error here as sort will not work with String columns in the frame
        pass
    else:
        # Kept outside the try block: a bare ``except`` around this assertion
        # used to swallow it, so the test could never fail.
        assert False, "Sort could not work with String columns and an error should have been thrown but not..."
def sort():
    """Check that sorting a frame with string columns and comparing it to an int-only frame errors out.

    A frame with two string columns and one integer column is sorted on the
    integer column and compared with a one-column frame holding the sorted
    integers. An exception from any of these steps is the expected outcome.

    Raises:
        AssertionError: if every step completes without raising.
    """
    try:
        df = h2o.H2OFrame({
            "A": ["another", "set", "of", "bad", "string"],
            "B": [10, 1, 2, 5, 7],
            "C": ["what", "is", "this", "thing", "doing"]
        })
        dfIntSorted = h2o.H2OFrame({"B": [1, 2, 5, 7, 10]})
        dfSortedIntCN = df.sort("B")
        pyunit_utils.compare_frames(dfIntSorted, dfSortedIntCN, df.nrow)
    except Exception:
        # expected error here as sort will not work with String columns in the frame
        pass
    else:
        # Kept outside the try block: a bare ``except`` around this assertion
        # used to swallow it, so the test could never fail.
        assert False, "Sort could not work with String columns and an error should have been thrown but not..."
def tf_idf_small_data(preprocess, case_sens, cols=None):
    """Run ``tf_idf`` on a small test frame and compare it with the expected output.

    :param preprocess: selects the input frame and is forwarded to ``tf_idf``.
    :param case_sens: selects the expected frame and is forwarded to ``tf_idf``.
    :param cols: two column specifiers passed to ``tf_idf``; defaults to ``[0, 1]``.
    """
    if cols is None:
        cols = [0, 1]

    if preprocess:
        source_frame = get_simple_input_test_frame()
    else:
        source_frame = get_simple_preprocessed_input_test_frame()

    if case_sens:
        wanted_frame = get_expected_output_frame_case_sens()
    else:
        wanted_frame = get_expected_output_frame_case_insens()

    first_col, second_col = cols[0], cols[1]
    produced = tf_idf(source_frame, first_col, second_col, preprocess, case_sens)
    pyunit_utils.compare_frames(wanted_frame,
                                produced,
                                len(produced),
                                tol_numeric=1e-5,
                                compare_NA=False)
def parquet_parse_dates():
    """Check that the date column of a parquet file is parsed as a time column.

    The type reported for the third column must be ``"time"``, and the
    ``date_string`` and ``date_converted`` columns are compared with each other.
    """
    source = pyunit_utils.locate(
        "smalldata/parser/parquet/parquet-file-with-date-column.snappy.parquet")
    frame = h2o.import_file(path=source)
    frame.summary()

    columns_meta = h2o.frame(frame.frame_id)["frames"][0]["columns"]
    assert columns_meta[2]['type'] == "time"

    as_string = frame[:, "date_string"]
    as_time = frame[:, "date_converted"]
    pyunit_utils.compare_frames(as_string, as_time, 1)
def hdfs_orc_parser():
    """Parse ORC files and their CSV counterparts from HDFS and compare the frames.

    Raises EnvironmentError when the Hadoop namenode is not accessible.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError

    name_node = pyunit_utils.hadoop_namenode()

    time_tolerance = 200  # comparing in ms or ns
    numeric_tolerance = 1e-5  # tolerance for comparing other numeric fields
    sample_size = 100  # number of elements per column to compare. Save test time.

    orc_paths = ["/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                 "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                 "/datasets/orc_parser/orc/orc_split_elim.orc"]
    csv_paths = ["/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                 "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                 "/datasets/orc_parser/csv/orc_split_elim.csv"]

    for orc_path, csv_path in zip(orc_paths, csv_paths):
        orc_url = "hdfs://{0}{1}".format(name_node, orc_path)
        csv_url = "hdfs://{0}{1}".format(name_node, csv_path)
        orc_frame = h2o.import_file(orc_url)
        csv_frame = h2o.import_file(csv_url)

        # compare the two frames
        assert pyunit_utils.compare_frames(orc_frame, csv_frame, sample_size,
                                           time_tolerance, numeric_tolerance), \
            "H2O frame parsed from orc and csv files are different!"
def orc_parser_timestamp_date():
    """
    Parse orc files containing timestamp and date information into H2O frames,
    parse the csv files generated from the same orc files by Hive, and check
    that each pair of frames is equal. The aim is to make sure that date and
    timestamp data are parsed correctly from an orc file.

    Thanks to Nidhi who has imported an orc file containing timestamp/date
    into spark and later into Hive and written it out as csv.

    :return: None
    """
    time_tolerance = 200  # comparing in ms or ns
    numeric_tolerance = 1e-5  # tolerance for comparing other numeric fields
    sample_size = 100  # number of elements per column to compare. Save test time.

    orc_paths = ["smalldata/parser/orc/TestOrcFile.testDate1900.orc",
                 "smalldata/parser/orc/TestOrcFile.testDate2038.orc",
                 "smalldata/parser/orc/orc_split_elim.orc"]
    csv_paths = ["smalldata/parser/orc/orc2csv/TestOrcFile.testDate1900.csv",
                 "smalldata/parser/orc/orc2csv/TestOrcFile.testDate2038.csv",
                 "smalldata/parser/orc/orc2csv/orc_split_elim.csv"]

    for orc_path, csv_path in zip(orc_paths, csv_paths):
        orc_frame = h2o.import_file(path=pyunit_utils.locate(orc_path))
        csv_frame = h2o.import_file(path=pyunit_utils.locate(csv_path))

        # compare the two frames
        assert pyunit_utils.compare_frames(orc_frame, csv_frame, sample_size,
                                           time_tolerance, numeric_tolerance), \
            "H2O frame parsed from orc and csv files are different!"
def hdfs_orc_parser():
    """Import an ORC folder and a CSV folder from HDFS and check that the frames match.

    Raises:
        EnvironmentError: if the Hadoop namenode is not accessible.
        AssertionError: if the two frames differ.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        tol_time = 200  # comparing in ms or ns
        tol_numeric = 1e-5  # tolerance for comparing other numeric fields
        numElements2Compare = 0  # choose number of elements per column to compare. Save test time.

        hdfs_csv_file = "/datasets/orc_parser/synthetic_perfect_separation_csv"
        hdfs_orc_file = "/datasets/orc_parser/synthetic_perfect_separation_orc"

        url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
        url_csv = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file)

        multi_file_csv = h2o.import_file(url_csv)
        multi_file_orc = h2o.import_file(url_orc)

        # make sure the orc folder and the csv folder create the same H2O frame;
        # the failure message names the inputs that are actually compared
        assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv, numElements2Compare,
                                           tol_time, tol_numeric, True), \
            "H2O frame parsed from multiple orc and multiple csv files are different!"
    else:
        raise EnvironmentError
def hdfs_orc_parser():
    """Parse prostate_NA from HDFS as ORC and as CSV and compare the two frames.

    Raises EnvironmentError when the Hadoop namenode is not accessible.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError

    name_node = pyunit_utils.hadoop_namenode()

    time_tolerance = 200  # comparing in ms or ns
    numeric_tolerance = 1e-5  # tolerance for comparing other numeric fields
    sample_size = 10  # number of elements per column to compare. Save test time.

    orc_url = "hdfs://{0}{1}".format(name_node, "/datasets/orc_parser/orc/prostate_NA.orc")
    csv_url = "hdfs://{0}{1}".format(name_node, "/datasets/orc_parser/csv/prostate_NA.csv")

    orc_frame = h2o.import_file(orc_url)
    csv_frame = h2o.import_file(csv_url)

    # compare the two frames
    assert pyunit_utils.compare_frames(orc_frame, csv_frame, sample_size,
                                       time_tolerance, numeric_tolerance), \
        "H2O frame parsed from orc and csv files are different!"
def import_folder():
    """Import a folder of csv files and its zipped version and check that both frames agree.

    The frames are first compared directly. If that comparison fails for any
    reason, the column summaries of the two frames are compared instead.
    """
    tol_time = 200  # comparing in ms or ns for timestamp columns
    tol_numeric = 1e-5  # tolerance for comparing other numeric fields
    numElements2Compare = 0  # choose number of elements per column to compare. Save test time.

    multi_file_csv = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/parser/hexdev_497/airlines_first_header"))
    multi_file_gzip_comp = \
        h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

    try:
        # make sure the two agrees
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare,
                                           tol_time, tol_numeric, True), \
            "H2O frame parsed from multiple orc and single orc files are different!"
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt and
        # SystemExit are not swallowed by the fallback.
        # in case the files are listed differently, we can always just check to see if the summary agrees.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]
        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
def hdfs_orc_parser():
    """Parse iris from HDFS as ORC with explicit column types and compare it with the CSV parse.

    Raises EnvironmentError when the Hadoop namenode is not accessible.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError

    name_node = pyunit_utils.hadoop_namenode()
    sample_size = 100
    time_tolerance = 200
    numeric_tolerance = 1e-5

    orc_url = "hdfs://{0}{1}".format(name_node, "/datasets/orc_parser/orc/iris.orc")
    csv_url = "hdfs://{0}{1}".format(name_node, "/datasets/orc_parser/csv/iris.csv")

    csv_frame = h2o.import_file(csv_url)
    # column types forced on the ORC import
    forced_types = ['real', 'real', 'real', 'real', 'enum']
    orc_frame = h2o.import_file(orc_url, col_types=forced_types)

    # compare the two frames
    assert pyunit_utils.compare_frames(orc_frame, csv_frame, sample_size,
                                       time_tolerance, numeric_tolerance, True), \
        "H2O frame parsed from orc and csv files are different!"
def import_folder():
    """Import a zipped folder and a folder of gzipped csv files and check that both frames agree.

    The frames are first compared directly. If that comparison fails for any
    reason, the column summaries of the two frames are compared instead.
    """
    tol_time = 200  # comparing in ms or ns
    tol_numeric = 1e-5  # tolerance for comparing other numeric fields
    numElements2Compare = 100  # choose number of elements per column to compare. Save test time.

    # compressed the whole directory of files.
    multi_file_gzip_comp = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/hexdev_497/milsongs_csv.zip"))

    # directory containing the gzip version of csv files here.
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/parser/hexdev_497/milsongs_csv_gzip"))

    try:
        # make sure the two agrees
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare,
                                           tol_time, tol_numeric, True), \
            "H2O frame parsed from multiple orc and single orc files are different!"
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt and
        # SystemExit are not swallowed by the fallback.
        # in case the files are listed differently, we can always just check to see if the summary agrees.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]
        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
def titanic():
    """Train a RuleFit model on titanic and check that its MOJO predicts the same as the model.

    Also checks that validation metrics and a non-empty model summary are
    present on the trained model.
    """
    import shutil
    import tempfile

    data = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"),
                           col_types={'pclass': "enum", 'survived': "enum"})
    predictors = ["age", "sibsp", "parch", "fare", "sex", "pclass"]

    # Split the dataset into train and test
    train, test = data.split_frame(ratios=[.8], seed=1234)

    model = H2ORuleFitEstimator(min_rule_length=4, max_rule_length=5, max_num_rules=3,
                                seed=1234, model_type="rules")
    model.train(training_frame=train, x=predictors, y="survived", validation_frame=test)

    assert model.rmse(valid=True) is not None, "validation metrics should be present"

    print(model.rule_importance())
    summary = model._model_json["output"]["model_summary"]
    assert summary is not None, "model_summary should be present"
    assert len(model._model_json["output"]["model_summary"]._cell_values) > 0, \
        "model_summary's content should be present"

    model_predictions = model.predict(test)

    # The MOJO only has to live on disk until it has been uploaded.
    scratch_dir = tempfile.mkdtemp()
    try:
        mojo_model = h2o.upload_mojo(model.save_mojo(scratch_dir))
    finally:
        shutil.rmtree(scratch_dir)

    mojo_predictions = mojo_model.predict(test)
    assert pyunit_utils.compare_frames(model_predictions, mojo_predictions, 0)
def hdfs_orc_parser():
    """Parse prostate_NA from HDFS as ORC and as CSV and compare the two frames.

    The comparison is skipped, with a printed notice, when
    ``cannaryHDFSTest`` returns a true value. Raises EnvironmentError when
    the Hadoop namenode is not accessible.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError

    name_node = pyunit_utils.hadoop_namenode()

    # run a quick test to determine if the hive-exec is too old.
    if pyunit_utils.cannaryHDFSTest(name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
        print("Your hive-exec version is too old.  Orc parser test {0} is "
              "skipped.".format("pyunit_INTERNAL_HDFS_prostate_orc.py"))
        return

    time_tolerance = 200  # comparing in ms or ns
    numeric_tolerance = 1e-5  # tolerance for comparing other numeric fields
    sample_size = 10  # number of elements per column to compare. Save test time.

    orc_url = "hdfs://{0}{1}".format(name_node, "/datasets/orc_parser/orc/prostate_NA.orc")
    csv_url = "hdfs://{0}{1}".format(name_node, "/datasets/orc_parser/csv/prostate_NA.csv")

    orc_frame = h2o.import_file(orc_url)
    csv_frame = h2o.import_file(csv_url)

    # compare the two frames
    assert pyunit_utils.compare_frames(orc_frame, csv_frame, sample_size,
                                       time_tolerance, numeric_tolerance), \
        "H2O frame parsed from orc and csv files are different!"
def hdfs_orc_parser():
    """Parse iris from HDFS as ORC with explicit column types and compare it with the CSV parse.

    The comparison is skipped, with a printed notice, when
    ``cannaryHDFSTest`` returns a true value. Raises EnvironmentError when
    the Hadoop namenode is not accessible.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError

    name_node = pyunit_utils.hadoop_namenode()

    if pyunit_utils.cannaryHDFSTest(name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
        print("Your hive-exec version is too old.  Orc parser test {0} is "
              "skipped.".format("pyunit_INTERNAL_HDFS_iris_import_types_orc.py"))
        return

    sample_size = 100
    time_tolerance = 200
    numeric_tolerance = 1e-5

    orc_url = "hdfs://{0}{1}".format(name_node, "/datasets/orc_parser/orc/iris.orc")
    csv_url = "hdfs://{0}{1}".format(name_node, "/datasets/orc_parser/csv/iris.csv")

    csv_frame = h2o.import_file(csv_url)
    # column types forced on the ORC import
    forced_types = ['real', 'real', 'real', 'real', 'enum']
    orc_frame = h2o.import_file(orc_url, col_types=forced_types)

    # compare the two frames
    assert pyunit_utils.compare_frames(orc_frame, csv_frame, sample_size,
                                       time_tolerance, numeric_tolerance, True), \
        "H2O frame parsed from orc and csv files are different!"
def test_transform_seed_param_raise_warning():
    """Check that passing ``seed`` to ``transform`` only triggers a deprecation warning.

    Exactly one ``H2ODeprecationWarning`` must be recorded, and the
    transformed frame must match the output of ``predict`` whether or not a
    seed is passed.
    """
    data = load_dataset(incl_test=True)
    encoder = H2OTargetEncoderEstimator(seed=42)
    encoder.train(y=data.target, training_frame=data.train)

    predicted = encoder.predict(data.test)
    without_seed = encoder.transform(data.test)

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        with_seed = encoder.transform(data.test, seed=24)
        assert len(caught) == 1
        only_warning = caught[0]
        assert issubclass(only_warning.category, H2ODeprecationWarning)
        expected_text = "`seed` is deprecated in `transform` method and will be ignored"
        assert expected_text in str(only_warning.message)

    for candidate in (without_seed, with_seed):
        assert pu.compare_frames(predicted, candidate, 0, tol_numeric=1e-5)
def import_folder():
    """Import an ORC folder and check that it matches one of two CSV files.

    The ORC frame is compared with ``balunbal.csv`` first; if that comparison
    fails for any reason it is compared with ``unbalbal.csv`` instead.

    Raises:
        AssertionError: if the ORC frame matches neither CSV frame.
    """
    tol_time = 200  # comparing in ms or ns
    tol_numeric = 1e-5  # tolerance for comparing other numeric fields
    numElements2Compare = 0  # choose number of elements per column to compare. Save test time.

    multi_file_csv1 = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/orc/synthetic_perfect_seperation_csv/balunbal.csv"))
    multi_file_csv2 = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/orc/synthetic_perfect_seperation_csv/unbalbal.csv"))
    multi_file_orc = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/orc/synthetic_perfect_separation"))

    # make sure orc multi-file and single big file create same H2O frame
    try:
        assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv1, numElements2Compare,
                                           tol_time, tol_numeric, True), \
            "H2O frame parsed from multiple orc and single orc files are different!"
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt and
        # SystemExit are not swallowed before the second comparison.
        assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv2, numElements2Compare,
                                           tol_time, tol_numeric, True), \
            "H2O frame parsed from multiple orc and single orc files are different!"
def hdfs_orc_parser():
    """Import an ORC folder from HDFS and check that it matches one of two CSV files.

    The comparison is skipped, with a printed notice, when
    ``cannaryHDFSTest`` returns a true value. Otherwise the ORC frame is
    compared with ``balunbal.csv`` first and, if that fails for any reason,
    with ``unbalbal.csv``.

    Raises:
        EnvironmentError: if the Hadoop namenode is not accessible.
        AssertionError: if the ORC frame matches neither CSV frame.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()

    if hadoop_namenode_is_accessible:
        hdfs_name_node = pyunit_utils.hadoop_namenode()

        if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
            print("Your hive-exec version is too old.  Orc parser test {0} is "
                  "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
        else:
            tol_time = 200  # comparing in ms or ns
            tol_numeric = 1e-5  # tolerance for comparing other numeric fields
            numElements2Compare = 0  # choose number of elements per column to compare. Save test time.

            hdfs_csv_file1 = "/datasets/orc_parser/csv/balunbal.csv"
            url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file1)
            multi_file_csv1 = h2o.import_file(url_csv1)

            hdfs_csv_file2 = "/datasets/orc_parser/csv/unbalbal.csv"
            url_csv2 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file2)
            multi_file_csv2 = h2o.import_file(url_csv2)

            hdfs_orc_file = "/datasets/orc_parser/synthetic_perfect_separation_orc"
            url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
            multi_file_orc = h2o.import_file(url_orc)

            # make sure orc multi-file and single big file create same H2O frame
            try:
                assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv1, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from multiple orc and single orc files are different!"
            except Exception:
                # Exception rather than a bare except, so that KeyboardInterrupt
                # and SystemExit are not swallowed before the second comparison.
                assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv2, numElements2Compare,
                                                   tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
                    "H2O frame parsed from multiple orc and single orc files are different!"
    else:
        raise EnvironmentError
def iris():
    """Train RuleFit models on iris and check predictions, rule predictions and the MOJO.

    A "rules" model is checked for validation metrics, a non-empty model
    summary, a fixed ``predict_rules`` sum and MOJO prediction parity. A
    "rules_and_linear" model is then checked with ``predict_rules`` on linear
    inputs.
    """
    import shutil
    import tempfile

    data = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris_train.csv"),
                           col_types={'species': "enum"})
    response = "species"
    predictors = data.columns
    predictors.remove(response)

    # Split the dataset into train and test
    train, test = data.split_frame(ratios=[.8], seed=1234)

    rules_model = H2ORuleFitEstimator(min_rule_length=4, max_rule_length=5, max_num_rules=3,
                                      seed=1234, model_type="rules")
    rules_model.train(training_frame=train, x=predictors, y=response, validation_frame=test)

    assert rules_model.rmse(valid=True) is not None, "validation metrics should be present"

    print(rules_model.rule_importance())
    summary = rules_model._model_json["output"]["model_summary"]
    assert summary is not None, "model_summary should be present"
    assert len(rules_model._model_json["output"]["model_summary"]._cell_values) > 0, \
        "model_summary's content should be present"

    model_predictions = rules_model.predict(test)

    rule_hits = rules_model.predict_rules(train, ['M0T38N5_Iris-virginica'])
    assert rule_hits.sum().getrow()[0] == 49.0

    # The MOJO only has to live on disk until it has been uploaded.
    scratch_dir = tempfile.mkdtemp()
    try:
        mojo_model = h2o.upload_mojo(rules_model.save_mojo(scratch_dir))
    finally:
        shutil.rmtree(scratch_dir)

    mojo_predictions = mojo_model.predict(test)
    assert pyunit_utils.compare_frames(model_predictions, mojo_predictions, 0)

    # test predict_rules also on linear variable input
    linear_model = H2ORuleFitEstimator(min_rule_length=4, max_rule_length=5, max_num_rules=3,
                                       seed=1234, model_type="rules_and_linear")
    linear_model.train(training_frame=train, x=predictors, y=response, validation_frame=test)
    print(linear_model.rule_importance())

    linear_hits = linear_model.predict_rules(
        train, ['linear.petal_len_Iris-setosa', 'linear.petal_wid_Iris-virginica'])
    assert linear_hits.sum().getrow()[0] == train.nrows
def test_transform_produces_the_same_result_as_predict_by_default():
    """transform() without overrides must agree with predict(), for default and customised encoders."""
    ds = load_dataset(incl_test=True)

    default_encoder = H2OTargetEncoderEstimator()
    default_encoder.train(y=ds.target, training_frame=ds.train)
    predicted = default_encoder.predict(ds.test)
    transformed = default_encoder.transform(ds.test)
    assert pu.compare_frames(predicted, transformed, 0, tol_numeric=1e-5)

    # now with non default params
    custom_params = dict(data_leakage_handling="leave_one_out",
                         blending=True,
                         inflection_point=5,
                         smoothing=17,
                         seed=seed,
                         noise=0.01)
    custom_encoder = H2OTargetEncoderEstimator(**custom_params)
    custom_encoder.train(y=ds.target, training_frame=ds.train)
    predicted = custom_encoder.predict(ds.test)
    transformed = custom_encoder.transform(ds.test)
    assert pu.compare_frames(predicted, transformed, 0, tol_numeric=1e-5)
def hdfs_orc_parser():
    """Check that an ORC folder imported from HDFS matches a reference CSV frame.

    The ORC frame has to agree with at least one of the two reference CSV
    files; the second one is only consulted when the first comparison fails.

    Raises:
        EnvironmentError: if the Hadoop namenode is not accessible.
        AssertionError: if the ORC frame matches neither CSV frame.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    hadoop_namenode_is_accessible = pyunit_utils.hadoop_namenode_is_accessible()
    if not hadoop_namenode_is_accessible:
        raise EnvironmentError

    hdfs_name_node = pyunit_utils.hadoop_namenode()

    if pyunit_utils.cannaryHDFSTest(hdfs_name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
        print("Your hive-exec version is too old. Orc parser test {0} is "
              "skipped.".format("pyunit_INTERNAL_HDFS_import_folder_orc.py"))
        return

    tol_time = 200  # comparing in ms or ns
    tol_numeric = 1e-5  # tolerance for comparing other numeric fields
    numElements2Compare = 0  # choose number of elements per column to compare. Save test time.

    hdfs_csv_file1 = "/datasets/orc_parser/csv/balunbal.csv"
    url_csv1 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file1)
    multi_file_csv1 = h2o.import_file(url_csv1)

    hdfs_csv_file2 = "/datasets/orc_parser/csv/unbalbal.csv"
    url_csv2 = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_csv_file2)
    multi_file_csv2 = h2o.import_file(url_csv2)

    hdfs_orc_file = "/datasets/orc_parser/synthetic_perfect_separation_orc"
    url_orc = "hdfs://{0}{1}".format(hdfs_name_node, hdfs_orc_file)
    multi_file_orc = h2o.import_file(url_orc)

    # make sure orc multi-file and single big file create same H2O frame
    try:
        assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv1, numElements2Compare,
                                           tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
            "H2O frame parsed from multiple orc and single orc files are different!"
    except AssertionError:
        # Fall back to the second reference only when the comparison itself
        # failed; unrelated errors are no longer silently swallowed.
        assert pyunit_utils.compare_frames(multi_file_orc, multi_file_csv2, numElements2Compare,
                                           tol_time=tol_time, tol_numeric=tol_numeric, strict=True), \
            "H2O frame parsed from multiple orc and single orc files are different!"
def hive_jdbc_import():
    """Import the airlines table from Hive over JDBC and compare it with the copy hosted on S3.

    The distributed import is exercised only when HIVE_DIST_ENABLED is 'true'
    (the default); the streaming import (fetch_mode="SINGLE") always runs.
    """
    connection_url = "jdbc:hive2://localhost:10000/default"
    if os.getenv('KRB_ENABLED', 'false').lower() == 'true':
        connection_url += ";principal=%s" % os.getenv('HIVE_PRINCIPAL', 'hive/[email protected]')
    use_distributed = os.getenv('HIVE_DIST_ENABLED', 'true').lower() == 'true'

    select_query = "select * from airlinestest"
    username = "******"
    password = ""

    # read from S3
    reference = h2o.import_file(
        path="https://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/AirlinesTest.csv.zip"
    )

    def check_against_reference(imported):
        # adapt the Hive frame before comparing 100 elements per column with zero tolerance
        pyunit_utils.compare_frames(reference, adapt_airlines(imported), 100, tol_numeric=0)

    # read from Hive Distributed
    if use_distributed:
        check_against_reference(
            h2o.import_sql_select(connection_url, select_query, username, password)
        )

    # read from Hive Streaming
    check_against_reference(
        h2o.import_sql_select(connection_url, select_query, username, password, fetch_mode="SINGLE")
    )
def test_default_strategy_is_none():
    """An encoder built without data_leakage_handling must encode like one built with "none"."""
    ds = load_dataset(incl_test=True)

    implicit_encoder = H2OTargetEncoderEstimator(noise=0)
    implicit_encoder.train(y=ds.target, training_frame=ds.train)
    implicit_result = implicit_encoder.predict(ds.test)

    explicit_encoder = H2OTargetEncoderEstimator(data_leakage_handling="none", noise=0)
    explicit_encoder.train(y=ds.target, training_frame=ds.train)
    explicit_result = explicit_encoder.predict(ds.test)

    assert pu.compare_frames(implicit_result, explicit_result, 0, tol_numeric=1e-5)
def h2o_H2OFrame_rep_len():
    """
    Python API test: h2o.frame.H2OFrame.rep_len(length_out)
    """
    n_rows = randrange(1, 10)
    n_cols = randrange(1, 10)
    extra_rows = math.ceil(0.78 * n_rows)
    total_rows = extra_rows + n_rows

    values = np.random.randint(-5, 5, (n_rows, n_cols))
    source = h2o.H2OFrame(python_obj=values)

    # one column, duplicate row
    repeated = source[0].rep_len(length_out=total_rows)

    # check return type
    assert_is_type(repeated, H2OFrame)

    # check shape
    assert repeated.shape == (total_rows, 1), "h2o.H2OFrame.rep_len() command is not working."

    # check values: rows past the original length are compared with the leading rows of the source column
    leading_rows = source[0:extra_rows, 0]
    appended_rows = repeated[n_rows:total_rows, 0]
    pyunit_utils.compare_frames(leading_rows, appended_rows, extra_rows,
                                tol_time=0, tol_numeric=1e-6, strict=False, compare_NA=True)
def test_transform_can_override_noise():
    """A noise value passed to transform() replaces the one the encoder was constructed with."""
    ds = load_dataset(incl_test=True)
    noise = 1e-3

    encoder = H2OTargetEncoderEstimator(noise=noise, seed=seed)
    encoder.train(y=ds.target, training_frame=ds.train)

    noisy = encoder.transform(ds.test)
    noiseless = encoder.transform(ds.test, noise=0)

    # With a tolerance well below the noise amplitude the frames must differ.
    sentinel = "should have raised"
    try:
        assert pu.compare_frames(noisy, noiseless, 0, tol_numeric=noise / 10)
        assert False, sentinel
    except AssertionError as ae:
        assert sentinel not in str(ae)

    # With a tolerance equal to the noise amplitude they must agree.
    assert pu.compare_frames(noisy, noiseless, 0, tol_numeric=noise)
def continuous_or_categorical():
    """Parse hexdev_29 from CSV and from ORC with all three columns forced to enum; frames must match."""
    forced_types = ["enum"] * 3

    csv_frame = h2o.import_file(pyunit_utils.locate("smalldata/jira/hexdev_29.csv"),
                                col_types=forced_types)
    orc_frame = h2o.import_file(pyunit_utils.locate("smalldata/parser/orc/hexdev_29.orc"),
                                col_types=forced_types)

    # compare the two frames
    frames_match = pyunit_utils.compare_frames(orc_frame, csv_frame, 0,
                                               tol_time=200, tol_numeric=1e-5, strict=True)
    assert frames_match, "H2O frame parsed from orc and csv files are different!"
def h2o_H2OFrame_transpose():
    """
    Python API test: h2o.frame.H2OFrame.transpose()
    """
    n_rows = randrange(1, 10)
    n_cols = randrange(1, 10)
    original = h2o.H2OFrame(python_obj=np.random.randint(-5, 5, (n_rows, n_cols)))

    transposed = original.transpose()

    # check return type
    assert_is_type(transposed, H2OFrame)

    # check shape
    expected_shape = (original.ncol, original.nrow)
    assert transposed.shape == expected_shape, "h2o.H2OFrame.transpose() command is not working."

    # check content: transposing back must reproduce the original frame
    round_trip = transposed.transpose()
    pyunit_utils.compare_frames(original, round_trip, original.nrow, tol_time=0, tol_numeric=1e-6)
def continuous_or_categorical_orc():
    """Parse iris from ORC with explicit column types and compare it with the CSV parse."""
    csv_frame = h2o.import_file(pyunit_utils.locate("smalldata/iris/iris.csv"))

    orc_col_types = ['real'] * 4 + ['enum']
    orc_frame = h2o.import_file(pyunit_utils.locate("smalldata/parser/orc/iris.orc"),
                                col_types=orc_col_types)

    # compare the two frames
    frames_match = pyunit_utils.compare_frames(orc_frame, csv_frame, 100,
                                               tol_time=200, tol_numeric=1e-5, strict=True)
    assert frames_match, "H2O frame parsed from orc and csv files are different!"
def h2o_H2OFrame_top_bottomN():
    """
    PUBDEV-3624 Top or Bottom N
    Test the h2o.frame.H2OFrame.topN() and h2o.frame.H2OFrame.bottomN() functions.

    Given a column (by index or by name) and a percentage, topN/bottomN return an H2OFrame
    with two columns: the top (or bottom) values of the chosen column, and the row indices
    in the original frame those values came from.

    A percentage and a column are picked at random, then either topN or bottomN is
    exercised. The by-name and by-index results are compared with each other and the
    by-name result is checked against a stored answer frame.
    """
    data = h2o.import_file(pyunit_utils.locate("bigdata/laptop/jira/TopBottomNRep4.csv.zip"))
    top_expected = h2o.import_file(pyunit_utils.locate("smalldata/jira/Top20Per.csv.zip"))
    bottom_expected = h2o.import_file(pyunit_utils.locate("smalldata/jira/Bottom20Per.csv.zip"))

    percentages = [1, 2, 3, 4]  # multiples of 4 since dataset is repeated 4 times.
    column_names = data.names  # get data column names
    tolerance = 1e-12
    nsample = 100

    pct = percentages[randint(0, len(percentages) - 1)]  # pick a random percentage
    col_index = randint(0, len(column_names) - 1)  # pick a random column
    col_name = column_names[col_index]

    if randint(0, 2) == 0:
        print("For topN: Percentage chosen is {0}. Column index chosen is {1}".format(pct, col_index))
        by_name = data.topN(col_name, pct)  # select the column by name
        by_index = data.topN(col_index, pct)  # select the same column by index
        expected, direction = top_expected, 1
    else:  # test bottomN here
        print("For bottomN: Percentage chosen is {0}. Column index chosen is {1}".format(pct, col_index))
        by_name = data.bottomN(col_name, pct)  # select the column by name
        by_index = data.bottomN(col_index, pct)  # select the same column by index
        expected, direction = bottom_expected, -1

    # both ways of addressing the column must give the same frame (nsample elements per column)
    pyunit_utils.compare_frames(by_name, by_index, nsample, tol_numeric=tolerance)
    # compare one of the return frames with known answer
    compare_rep_frames(expected, by_name, tolerance, col_index, direction)
def import_folder():
    """Import airlines_small both as a zipped folder and as a single CSV; the frames must match."""
    zipped_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_small_csv.zip"))
    csv_frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_small_csv/all_airlines.csv"))

    # make sure H2O frames built from a zip file of a directory and the original files are the same.
    frames_match = pyunit_utils.compare_frames(
        csv_frame,
        zipped_frame,
        0,                  # number of elements per column to compare
        tol_time=200,       # comparing in ms or ns for timestamp columns
        tol_numeric=1e-5,   # tolerance for comparing other numeric fields
        strict=True,
    )
    assert frames_match, "H2O frame parsed from zip directory and unzipped directory are different!"
def import_folder():
    """Check that a folder of ORC files parses to the same frame as a reference CSV file.

    The ORC frame has to agree with at least one of the two reference CSV
    files; the second one is only consulted when the first comparison fails.

    Raises:
        AssertionError: if the ORC frame matches neither CSV frame.
    """
    tol_time = 200  # comparing in ms or ns
    tol_numeric = 1e-5  # tolerance for comparing other numeric fields
    numElements2Compare = 0  # choose number of elements per column to compare. Save test time.

    multi_file_csv1 = h2o.import_file(
        path=pyunit_utils.locate("smalldata/parser/orc/synthetic_perfect_seperation_csv/balunbal.csv")
    )
    multi_file_csv2 = h2o.import_file(
        path=pyunit_utils.locate("smalldata/parser/orc/synthetic_perfect_seperation_csv/unbalbal.csv")
    )
    multi_file_orc = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/orc/synthetic_perfect_separation"))

    # make sure orc multi-file and single big file create same H2O frame
    try:
        assert pyunit_utils.compare_frames(
            multi_file_orc, multi_file_csv1, numElements2Compare, tol_time, tol_numeric, True
        ), "H2O frame parsed from multiple orc and single orc files are different!"
    except AssertionError:
        # Only a failed comparison triggers the fallback; any other error
        # (including interrupts) now propagates instead of being swallowed.
        assert pyunit_utils.compare_frames(
            multi_file_orc, multi_file_csv2, numElements2Compare, tol_time, tol_numeric, True
        ), "H2O frame parsed from multiple orc and single orc files are different!"
def test_target_encoding_default_noise_is_applied():
    """Check that transform() applies a non-zero default noise when `noise` is omitted.

    Three transformations share the same seed: two with noise=0.0 (which must
    agree with each other) and one without a noise argument (which must differ
    from the zero-noise result).

    Raises:
        AssertionError: if the zero-noise runs disagree, or if the default-noise
            run is indistinguishable from the zero-noise run.
    """
    print("Check that seed is applied when we use noise. Noise is set to the same values. Only seed is different.")

    targetColumnName = "survived"
    teColumns = ["home.dest", "cabin", "embarked"]
    teColumnsEncoded = [column + "_te" for column in teColumns]
    trainingFrame = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/titanic.csv"), header=1)

    trainingFrame[targetColumnName] = trainingFrame[targetColumnName].asfactor()

    targetEncoder = TargetEncoder(x=teColumns, y=targetColumnName,
                                  blended_avg=True, inflection_point=3, smoothing=1)

    targetEncoder.fit(frame=trainingFrame)

    seedTest = 1234
    encodedFrame = targetEncoder.transform(frame=trainingFrame, holdout_type="none", noise=0.0, seed=seedTest)
    encodingsOnly = encodedFrame[teColumnsEncoded]

    # Second transformation without specifying noise. Default will be applied.
    encodedFrame2 = targetEncoder.transform(frame=trainingFrame, holdout_type="none", seed=seedTest)
    encodingsOnly2 = encodedFrame2[teColumnsEncoded]

    # Third transformation with zero noise
    encodedFrame3 = targetEncoder.transform(frame=trainingFrame, holdout_type="none", noise=0.0, seed=seedTest)
    encodingsOnly3 = encodedFrame3[teColumnsEncoded]

    # Comparing results
    # Third encoding should be equal to the first one since no noise is applied in both cases
    assert pyunit_utils.compare_frames(encodingsOnly, encodingsOnly3, 10, tol_time=0, tol_numeric=1e-6)

    # First two encodings should be different since default noise will be applied to the second transformation.
    # The failure is raised from the `else` clause: an `assert False` placed inside the `try`
    # would be caught by the handler below and the check could never fail.
    try:
        pyunit_utils.compare_frames(encodingsOnly, encodingsOnly2, 10, tol_time=0, tol_numeric=1e-6)
    except AssertionError:
        print('Good, encodings are different as expected. Default noise is working')
    else:
        raise AssertionError("Encodings with default noise should differ from encodings with zero noise")
def orc_parser_timestamp_date():
    """
    Verify the orc parser by parsing a known file (prostate_NA.csv) and its Orc conversion
    (prostate_NA.orc) into two H2O frames and asserting that the frames agree.

    :return: None
    """
    orc_frame = h2o.import_file(path=pyunit_utils.locate('smalldata/parser/orc/prostate_NA.orc'))
    csv_frame = h2o.import_file(path=pyunit_utils.locate('smalldata/parser/csv2orc/prostate_NA.csv'))

    # compare the two frames
    frames_match = pyunit_utils.compare_frames(
        orc_frame,
        csv_frame,
        10,                 # number of elements per column to compare, keeps the test fast
        tol_time=200,       # comparing in ms or ns
        tol_numeric=1e-5,   # tolerance for comparing other numeric fields
    )
    assert frames_match, "H2O frame parsed from orc and csv files are different!"
def import_folder():
    """Import airlines_first_header as a folder and as a zip archive and check that they agree.

    The frames are first compared element-wise. If that comparison fails, the
    test falls back to comparing the per-column summaries of the two frames.
    """
    tol_time = 200  # comparing in ms or ns for timestamp columns
    tol_numeric = 1e-5  # tolerance for comparing other numeric fields
    numElements2Compare = 0  # choose number of elements per column to compare. Save test time.

    multi_file_csv = h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header"))
    multi_file_gzip_comp = \
        h2o.import_file(path=pyunit_utils.locate("smalldata/parser/hexdev_497/airlines_first_header.zip"))

    try:
        # make sure the two agrees
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare, tol_time,
                                           tol_numeric, True), "H2O frame parsed from multiple orc and single orc " \
                                                               "files are different!"
    except AssertionError:
        # Only a failed comparison triggers the fallback; other errors propagate.
        # in case the files are listed differently, we can always just check to see if the summary agrees.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]
        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
def hdfs_orc_parser():
    """Parse each ORC file on HDFS together with its CSV counterpart and assert the frames agree.

    Raises EnvironmentError when the Hadoop namenode is not accessible.
    """
    # Check if we are running inside the H2O network by seeing if we can touch
    # the namenode.
    if not pyunit_utils.hadoop_namenode_is_accessible():
        raise EnvironmentError

    name_node = pyunit_utils.hadoop_namenode()

    if pyunit_utils.cannaryHDFSTest(name_node, "/datasets/orc_parser/orc/orc_split_elim.orc"):
        print("Your hive-exec version is too old. Orc parser test {0} is "
              "skipped.".format("pyunit_INTERNAL_HDFS_timestamp_date_orc.py"))
        return

    tol_time = 200  # comparing in ms or ns
    tol_numeric = 1e-5  # tolerance for comparing other numeric fields
    elements_to_compare = 100  # number of elements per column to compare, keeps the test fast

    orc_paths = ["/datasets/orc_parser/orc/TestOrcFile.testDate1900.orc",
                 "/datasets/orc_parser/orc/TestOrcFile.testDate2038.orc",
                 "/datasets/orc_parser/orc/orc_split_elim.orc"]
    csv_paths = ["/datasets/orc_parser/csv/TestOrcFile.testDate1900.csv",
                 "/datasets/orc_parser/csv/TestOrcFile.testDate2038.csv",
                 "/datasets/orc_parser/csv/orc_split_elim.csv"]

    for orc_path, csv_path in zip(orc_paths, csv_paths):
        orc_url = "hdfs://{0}{1}".format(name_node, orc_path)
        csv_url = "hdfs://{0}{1}".format(name_node, csv_path)
        orc_frame = h2o.import_file(orc_url)
        csv_frame = h2o.import_file(csv_url)

        # compare the two frames
        assert pyunit_utils.compare_frames(orc_frame, csv_frame, elements_to_compare, tol_time, tol_numeric), \
            "H2O frame parsed from orc and csv files are different!"
def import_folder():
    """Import milsongs as a zipped folder and as a folder of gzip files and check that they agree.

    The frames are first compared element-wise. If that comparison fails, the
    test falls back to comparing the per-column summaries of the two frames.
    """
    tol_time = 200  # comparing in ms or ns
    tol_numeric = 1e-5  # tolerance for comparing other numeric fields
    numElements2Compare = 100  # choose number of elements per column to compare. Save test time.

    # compressed the whole directory of files.
    multi_file_gzip_comp = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/hexdev_497/milsongs_csv.zip"))

    # directory containing the gzip version of csv files here.
    multi_file_csv = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/parser/hexdev_497/milsongs_csv_gzip"))

    try:
        # make sure the two agrees
        assert pyunit_utils.compare_frames(multi_file_csv, multi_file_gzip_comp, numElements2Compare, tol_time,
                                           tol_numeric, True), "H2O frame parsed from multiple orc and single orc " \
                                                               "files are different!"
    except AssertionError:
        # Only a failed comparison triggers the fallback; other errors propagate.
        # in case the files are listed differently, we can always just check to see if the summary agrees.
        multi_file_gzip_comp.summary()
        zip_summary = h2o.frame(multi_file_gzip_comp.frame_id)["frames"][0]["columns"]
        multi_file_csv.summary()
        csv_summary = h2o.frame(multi_file_csv.frame_id)["frames"][0]["columns"]
        pyunit_utils.compare_frame_summary(zip_summary, csv_summary)
def group_by_all():
    """
    Comprehensive test of the aggregations in the groupBy class under each NA handling
    mode ('all', 'ignore', 'rm'), first on a dataset without NAs, then on one with NAs.
    """
    generate_dict_answers()  # generate answer dictionary

    na_modes = ('all', 'ignore', 'rm')
    failure_msg = "H2O group_by() command is not working."

    # perform group-by with datasets containing no NAs. All three na mode should produce same results
    clean_iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    by_all, by_ignore, by_rm = [perform_group_by(clean_iris, mode) for mode in na_modes]

    # make sure return type of get_frame() is H2OFrame
    for grouped in (by_all, by_ignore, by_rm):
        assert_is_type(grouped, H2OFrame)

    # make sure the result frame contains the correct number of rows and columns
    assert by_all.shape == by_ignore.shape == by_rm.shape == (3, 30), failure_msg

    # check all group by results are the same
    assert pyunit_utils.compare_frames(by_all, by_ignore, 0, 0, 1e-6, strict=True, compare_NA=False), \
        failure_msg
    assert pyunit_utils.compare_frames(by_ignore, by_rm, 0, 0, 1e-6, strict=True, compare_NA=False), \
        failure_msg

    # check group by result with known correct result
    assert_group_by_result(by_all, g_iris_setosa_sepal_len, "Iris-setosa")
    assert_group_by_result(by_rm, g_iris_versicolor_sepal_wid, "Iris-versicolor")
    assert_group_by_result(by_ignore, g_iris_virginica_petal_wid, "Iris-virginica")

    # perform group-by with datasets contain NAs.
    na_iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader_NA_2.csv"))
    na_all, na_ignore, na_rm = [perform_group_by(na_iris, mode) for mode in na_modes]

    # make sure return type of get_frame() is H2OFrame
    for grouped in (na_all, na_ignore, na_rm):
        assert_is_type(grouped, H2OFrame)

    # make sure the result frame contains the correct number of rows and columns
    assert na_all.shape == na_ignore.shape == na_rm.shape == (3, 30), failure_msg

    # column petal_wid contains no NA and hence should provide same result as before independent of NA treatment
    petal_wid_cols = list(g_iris_virginica_petal_wid.keys())
    assert pyunit_utils.compare_frames(na_all[petal_wid_cols], na_rm[petal_wid_cols],
                                       0, 0, 1e-6, strict=False, compare_NA=False), failure_msg
    assert pyunit_utils.compare_frames(na_all[petal_wid_cols], na_ignore[petal_wid_cols],
                                       0, 0, 1e-6, strict=False, compare_NA=False), failure_msg
    assert_group_by_result(na_all, g_iris_virginica_petal_wid, "Iris-virginica")

    # check to make sure result_all_NA columns for sepal_len, sepal_wid, petal_len are all NAs for na='all'
    all_na_answers = (
        g_iris_setosa_sepal_len,                # check sepal_len
        g_iris_versicolor_sepal_wid,            # check sepal_wid
        g_iris_versicolor_petal_len_NA_ignore,  # check petal_len
    )
    for answers in all_na_answers:
        assert_all_NAs(na_all, list(answers.keys()))

    # check to make sure na="ignore", and na="rm" are calculated correctly against known answers
    assert_group_by_result(na_ignore, g_iris_versicolor_petal_len_NA_ignore, "Iris-versicolor")
    assert_group_by_result(na_rm, g_iris_versicolor_petal_len_NA_rm, "Iris-versicolor")