def test_transform():
    """Check handling of the ``transform`` parameter of the Aggregator.

    Every value in ``valid_values`` must be assignable to an existing
    estimator and must allow training to finish.  Constructing an estimator
    with an unknown ``transform`` value is expected to raise.
    """
    valid_values = ["none", "standardize", "normalize", "demean", "descale"]
    df = h2o.create_frame(rows=100, cols=4, categorical_fraction=0.4,
                          integer_fraction=0, binary_fraction=0,
                          real_range=100, integer_range=100,
                          missing_fraction=0, seed=1234)
    model = H2OAggregatorEstimator(target_num_exemplars=5)

    for val in valid_values:
        try:
            model.transform = val
            model.train(training_frame=df)
        except Exception as err:
            # Report which value failed and why instead of hiding the cause.
            raise AssertionError(
                "Aggregator model should be able to process all valid transform values "
                "(failed on %r: %s)" % (val, err))

    # Try with invalid value
    try:
        H2OAggregatorEstimator(target_num_exemplars=5, transform="some_invalid_value")
    except Exception:
        # Expected path: the invalid value was rejected.
        pass
    else:
        # Raised outside the ``try`` so the failure cannot be swallowed by the
        # handler above (the original bare ``except`` caught its own assert).
        raise AssertionError("Passing invalid value of transform should throw an error")
def test_binary():
    """Train an Aggregator with binary categorical encoding on a synthetic frame.

    Verifies that an aggregated frame is produced, that ``is_consistent``
    accepts it for the training row count, and that ``correct_num_exemplars``
    accepts it for the parameters used.
    """
    frame = h2o.create_frame(rows=1000,
                             cols=10,
                             categorical_fraction=0.6,
                             integer_fraction=0,
                             binary_fraction=0,
                             real_range=100,
                             integer_range=100,
                             missing_fraction=0.1,
                             factors=5,
                             seed=1234)
    params = dict(
        target_num_exemplars=100,
        rel_tol_num_exemplars=0.5,
        categorical_encoding="binary",
        transform="normalize",
    )

    agg = H2OAggregatorEstimator(**params)
    agg.train(training_frame=frame)

    assert agg.aggregated_frame is not None, (
        "Trained model should produce not empty aggregated frame")
    assert is_consistent(frame.nrows, agg.aggregated_frame), (
        "Exemplar counts should sum up to number of training rows")
    assert correct_num_exemplars(agg.aggregated_frame, **params), (
        "Generated number of exemplars should match target value")
def test_4995():
    """Train an Aggregator on ``DATASET`` and fail if ``H2OResponseError`` is raised."""
    frame = h2o.import_file(DATASET)
    estimator = H2OAggregatorEstimator(model_id="aggregator")

    got_response_error = False
    try:
        estimator.train(training_frame=frame)
    except H2OResponseError:
        # H2OAggregatorEstimator.train raises error, when requesting wrong API version
        got_response_error = True

    assert not got_response_error, "Local and Server versions of AggregatorEstimator should match!"
def test_aggregator_effective_parameters():
    """Compare reported ``categorical_encoding`` of a default and an explicit model.

    One model is trained with ``categorical_encoding="eigen"`` and one without
    the parameter.  The default model must report ``"AUTO"`` as its input value
    and the same actual value as the explicitly configured model.
    """
    shared_args = dict(target_num_exemplars=1000, rel_tol_num_exemplars=0.5)
    data = h2o.create_frame(rows=10000, cols=10, categorical_fraction=0.6,
                            integer_fraction=0, binary_fraction=0,
                            real_range=100, integer_range=100,
                            missing_fraction=0, factors=100, seed=1234)

    explicit_model = H2OAggregatorEstimator(categorical_encoding="eigen", **shared_args)
    explicit_model.train(training_frame=data)

    default_model = H2OAggregatorEstimator(**shared_args)
    default_model.train(training_frame=data)

    assert default_model.parms['categorical_encoding']['input_value'] == "AUTO"

    default_actual = default_model.parms['categorical_encoding']['actual_value']
    explicit_actual = explicit_model.parms['categorical_encoding']['actual_value']
    assert default_actual == explicit_actual
def test_cat_encoding():
    """Check handling of the ``categorical_encoding`` parameter of the Aggregator.

    Every value in ``valid_values`` must be assignable to an existing
    estimator and must allow training to finish.  Constructing an estimator
    with an unknown ``categorical_encoding`` value is expected to raise.
    """
    valid_values = [
        "auto", "enum", "one_hot_internal", "one_hot_explicit", "binary",
        "eigen", "label_encoder", "enum_limited",
        # "sort_by_response" TODO: This is invalid parameter, remove it
    ]
    df = h2o.create_frame(rows=100, cols=4, categorical_fraction=0.4,
                          integer_fraction=0, binary_fraction=0,
                          real_range=100, integer_range=100,
                          missing_fraction=0, seed=1234)
    model = H2OAggregatorEstimator(target_num_exemplars=5)

    for val in valid_values:
        try:
            model.categorical_encoding = val
            model.train(training_frame=df)
        except Exception as err:
            # Report which value failed and why instead of hiding the cause.
            raise AssertionError(
                "Aggregator model should be able to process all valid categorical_encoding values "
                "(failed on %r: %s)" % (val, err))

    # Try with invalid value
    try:
        H2OAggregatorEstimator(target_num_exemplars=5,
                               categorical_encoding="some_invalid_value")
    except Exception:
        # Expected path: the invalid value was rejected.
        pass
    else:
        # Raised outside the ``try`` so the failure cannot be swallowed by the
        # handler above (the original bare ``except`` caught its own assert).
        raise AssertionError(
            "Passing invalid value of categorical_encoding should throw an error")
def test_aggregator_get_mapping_frame():
    """Train with ``save_mapping_frame=True`` and inspect the resulting mapping frame.

    The mapping frame must consist of the single column
    ``exemplar_assignment`` and have as many rows as the training frame.
    """
    wine_path = pyunit_utils.locate("smalldata/wine/winequality-redwhite.csv")
    winequality_df = h2o.import_file(wine_path)

    params = dict(target_num_exemplars=650,
                  rel_tol_num_exemplars=0.25,
                  save_mapping_frame=True)
    agg = H2OAggregatorEstimator(ignored_columns=["quality", "type"], **params)
    agg.train(training_frame=winequality_df)

    mapping = agg.mapping_frame
    assert mapping.names == ["exemplar_assignment"]
    assert mapping.nrows == winequality_df.nrows
def test_num_of_exemplars(target_exemplars, tol):
    """Check that the exemplar count lies within the tolerated band around the target.

    :param target_exemplars: value passed as ``target_num_exemplars``.
    :param tol: value passed as ``rel_tol_num_exemplars``; the accepted range is
        ``(1 - tol) * target_exemplars`` to ``(1 + tol) * target_exemplars``, inclusive.
    """
    frame = h2o.create_frame(rows=10000,
                             cols=2,
                             categorical_fraction=0.1,
                             integer_fraction=0.3,
                             real_range=100,
                             seed=1234)

    estimator = H2OAggregatorEstimator(target_num_exemplars=target_exemplars,
                                       rel_tol_num_exemplars=tol)
    estimator.train(training_frame=frame)

    assert estimator.aggregated_frame is not None, (
        "Trained model should produce not empty aggregated frame")

    lower_bound = (1 - tol) * target_exemplars
    upper_bound = (1 + tol) * target_exemplars
    assert lower_bound <= estimator.aggregated_frame.nrows <= upper_bound, (
        "Final number of aggregated exemplars should be in equal to target number +/- tolerance")
def test_all_params():
    """Construct and train an Aggregator with a full set of valid parameters.

    The test fails if either construction or training raises; the failure
    message includes the underlying error.
    """
    data_path = pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip")
    df = h2o.import_file(data_path)
    params = {
        "model_id": "agg",
        "training_frame": df,
        "response_column": "IsDepDelayed",
        "ignored_columns": ["UniqueCarrier"],
        "ignore_const_cols": False,
        "target_num_exemplars": 500,
        "rel_tol_num_exemplars": 0.3,
        "transform": "standardize",
        "categorical_encoding": "eigen",
    }
    try:
        model = H2OAggregatorEstimator(**params)
        model.train(training_frame=df)
    except Exception as err:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still propagate; the cause is kept in the message.
        raise AssertionError("Should not throw error on valid parameters: %s" % err)
def test_low_cardinality_enum_limited():
    """Run the Aggregator with ``enum_limited`` encoding on a small hand-written frame.

    The frame is built from 27 pipe-separated records of four fields each.
    The aggregated frame must exist, pass ``is_consistent`` for the training
    row count, and contain exactly 7 exemplars.
    """
    records = (
        "1|2|A|A", "1|2|A|A", "1|2|A|A", "1|2|A|A", "1|2|A|A",
        "2|2|A|B", "2|2|A|A", "1|4|A|A", "1|2|B|A", "1|2|B|A",
        "1|2|A|A", "1|2|A|A", "4|5|C|A", "4|5|D|A", "2|5|D|A",
        "3|5|E|A", "4|5|F|A", "4|5|G|A", "4|5|H|A", "4|5|I|A",
        "4|5|J|A", "4|5|K|A", "4|5|L|A", "4|5|M|A", "4|5|N|A",
        "4|5|O|A", "4|5|P|A",
    )
    rows = [record.split("|") for record in records]
    df = H2OFrame(rows)

    agg = H2OAggregatorEstimator(target_num_exemplars=5,
                                 categorical_encoding="enum_limited")
    agg.train(training_frame=df)

    assert agg.aggregated_frame is not None, (
        "Trained model should produce not empty aggregated frame")
    assert is_consistent(df.nrows, agg.aggregated_frame), (
        "Exemplar counts should sum up to number of training rows")
    # from AggregatorTest.java
    assert agg.aggregated_frame.nrows == 7, (
        "Number of exemplars of this test case should be 7")