コード例 #1
0
def test_transform():
    """Check that the Aggregator accepts valid ``transform`` values and rejects an invalid one.

    One estimator is trained repeatedly on a small generated frame, with
    ``transform`` switched before each run. Afterwards, constructing an
    estimator with an unknown ``transform`` value is expected to raise.
    """
    valid_values = ["none", "standardize", "normalize", "demean", "descale"]
    df = h2o.create_frame(rows=100,
                          cols=4,
                          categorical_fraction=0.4,
                          integer_fraction=0,
                          binary_fraction=0,
                          real_range=100,
                          integer_range=100,
                          missing_fraction=0,
                          seed=1234)
    model = H2OAggregatorEstimator(target_num_exemplars=5)
    for val in valid_values:
        try:
            model.transform = val
            model.train(training_frame=df)
        except Exception as err:
            # Report which value failed and why instead of hiding the cause.
            raise AssertionError(
                "Aggregator model should be able to process all valid transform values "
                "(failed for {!r}: {})".format(val, err))

    # Try with invalid value. The assertion lives outside the ``try`` block so
    # that the handler below cannot swallow its AssertionError.
    rejected = False
    try:
        H2OAggregatorEstimator(target_num_exemplars=5,
                               transform="some_invalid_value")
    except Exception:
        rejected = True
    assert rejected, "Passing invalid value of transform should throw an error"
コード例 #2
0
def test_binary():
    """Train an Aggregator with binary categorical encoding and validate its exemplars.

    The aggregated frame must exist, pass the ``is_consistent`` check against
    the number of training rows, and pass ``correct_num_exemplars`` for the
    parameters the model was built with.
    """
    params = dict(
        target_num_exemplars=100,
        rel_tol_num_exemplars=0.5,
        categorical_encoding="binary",
        transform="normalize",
    )
    train_frame = h2o.create_frame(rows=1000,
                                   cols=10,
                                   categorical_fraction=0.6,
                                   integer_fraction=0,
                                   binary_fraction=0,
                                   real_range=100,
                                   integer_range=100,
                                   missing_fraction=0.1,
                                   factors=5,
                                   seed=1234)

    aggregator = H2OAggregatorEstimator(**params)
    aggregator.train(training_frame=train_frame)

    exemplars = aggregator.aggregated_frame
    assert exemplars is not None, "Trained model should produce not empty aggregated frame"

    counts_match = is_consistent(train_frame.nrows, exemplars)
    assert counts_match, "Exemplar counts should sum up to number of training rows"

    size_ok = correct_num_exemplars(exemplars, **params)
    assert size_ok, "Generated number of exemplars should match target value"
コード例 #3
0
def test_4995():
    """Check that training an Aggregator on ``DATASET`` does not raise ``H2OResponseError``."""
    dataset = h2o.import_file(DATASET)
    estimator = H2OAggregatorEstimator(model_id="aggregator")

    try:
        estimator.train(training_frame=dataset)
    except H2OResponseError:
        raised = True
    else:
        raised = False

    # H2OAggregatorEstimator.train raises an error when a wrong API version is requested
    assert raised is False, "Local and Server versions of AggregatorEstimator should match!"
コード例 #4
0
def test_aggregator_effective_parameters():
    """Compare the effective ``categorical_encoding`` of a default model with an explicit one.

    Two models are trained on the same generated frame: one with
    ``categorical_encoding="eigen"`` and one without the parameter. The second
    must report ``"AUTO"`` as its input value and the same actual value as the
    first.
    """
    frame = h2o.create_frame(
        rows=10000,
        cols=10,
        categorical_fraction=0.6,
        integer_fraction=0,
        binary_fraction=0,
        real_range=100,
        integer_range=100,
        missing_fraction=0,
        factors=100,
        seed=1234,
    )

    # Settings shared by both models; only the encoding differs.
    common = dict(target_num_exemplars=1000, rel_tol_num_exemplars=0.5)

    explicit_model = H2OAggregatorEstimator(categorical_encoding="eigen", **common)
    explicit_model.train(training_frame=frame)

    default_model = H2OAggregatorEstimator(**common)
    default_model.train(training_frame=frame)

    default_encoding = default_model.parms['categorical_encoding']
    explicit_encoding = explicit_model.parms['categorical_encoding']

    assert default_encoding['input_value'] == "AUTO"
    assert default_encoding['actual_value'] == explicit_encoding['actual_value']
コード例 #5
0
def test_cat_encoding():
    """Check that the Aggregator accepts valid ``categorical_encoding`` values and rejects an invalid one.

    One estimator is trained repeatedly on a small generated frame, with
    ``categorical_encoding`` switched before each run. Afterwards, constructing
    an estimator with an unknown encoding is expected to raise.
    """
    valid_values = [
        "auto",
        "enum",
        "one_hot_internal",
        "one_hot_explicit",
        "binary",
        "eigen",
        "label_encoder",
        "enum_limited",
        # TODO: "sort_by_response" is left out because it is an invalid parameter; remove it
    ]
    df = h2o.create_frame(rows=100,
                          cols=4,
                          categorical_fraction=0.4,
                          integer_fraction=0,
                          binary_fraction=0,
                          real_range=100,
                          integer_range=100,
                          missing_fraction=0,
                          seed=1234)
    model = H2OAggregatorEstimator(target_num_exemplars=5)
    for val in valid_values:
        try:
            model.categorical_encoding = val
            model.train(training_frame=df)
        except Exception as err:
            # Report which value failed and why instead of hiding the cause.
            raise AssertionError(
                "Aggregator model should be able to process all valid categorical_encoding values "
                "(failed for {!r}: {})".format(val, err))

    # Try with invalid value. The assertion lives outside the ``try`` block so
    # that the handler below cannot swallow its AssertionError.
    rejected = False
    try:
        H2OAggregatorEstimator(
            target_num_exemplars=5, categorical_encoding="some_invalid_value")
    except Exception:
        rejected = True
    assert rejected, "Passing invalid value of categorical_encoding should throw an error"
コード例 #6
0
def test_aggregator_get_mapping_frame():
    """Check the mapping frame produced when ``save_mapping_frame`` is enabled.

    The mapping frame must have a single ``exemplar_assignment`` column and as
    many rows as the training frame.
    """
    dataset_path = pyunit_utils.locate("smalldata/wine/winequality-redwhite.csv")
    wine = h2o.import_file(dataset_path)

    aggregator = H2OAggregatorEstimator(
        ignored_columns=["quality", "type"],
        target_num_exemplars=650,
        rel_tol_num_exemplars=0.25,
        save_mapping_frame=True,
    )
    aggregator.train(training_frame=wine)

    mapping = aggregator.mapping_frame
    expected_columns = ["exemplar_assignment"]

    assert mapping.names == expected_columns
    assert mapping.nrows == wine.nrows
コード例 #7
0
def test_num_of_exemplars(target_exemplars, tol):
    """Verify that the exemplar count lies within the relative tolerance of the target.

    :param target_exemplars: requested number of exemplars, passed as ``target_num_exemplars``.
    :param tol: relative tolerance, passed as ``rel_tol_num_exemplars`` and used for the bounds.
    """
    frame_options = dict(rows=10000,
                         cols=2,
                         categorical_fraction=0.1,
                         integer_fraction=0.3,
                         real_range=100,
                         seed=1234)
    train_frame = h2o.create_frame(**frame_options)

    aggregator = H2OAggregatorEstimator(target_num_exemplars=target_exemplars,
                                        rel_tol_num_exemplars=tol)
    aggregator.train(training_frame=train_frame)

    exemplars = aggregator.aggregated_frame
    assert exemplars is not None, "Trained model should produce not empty aggregated frame"

    # Inclusive bounds around the target implied by the relative tolerance.
    lower_bound = (1-tol)*target_exemplars
    upper_bound = (1+tol)*target_exemplars
    assert lower_bound <= exemplars.nrows <= upper_bound, \
        "Final number of aggregated exemplars should be in equal to target number +/- tolerance"
コード例 #8
0
def test_all_params():
    """Build and train an Aggregator with the parameters in ``params`` set explicitly.

    Uses the airlines sample dataset. If construction or training raises, the
    test fails with the underlying error included in the message.
    """
    data_path = pyunit_utils.locate(
        "smalldata/airlines/allyears2k_headers.zip")
    df = h2o.import_file(data_path)
    params = {
        "model_id": "agg",
        "training_frame": df,
        "response_column": "IsDepDelayed",
        "ignored_columns": ["UniqueCarrier"],
        "ignore_const_cols": False,
        "target_num_exemplars": 500,
        "rel_tol_num_exemplars": 0.3,
        "transform": "standardize",
        "categorical_encoding": "eigen"
    }
    try:
        model = H2OAggregatorEstimator(**params)
        model.train(training_frame=df)
    except Exception as err:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate, and keep the cause in the failure message.
        raise AssertionError(
            "Should not throw error on valid parameters: {}".format(err))
コード例 #9
0
def test_low_cardinality_enum_limited():
    """Run the Aggregator with ``enum_limited`` encoding on a small hand-written dataset.

    Each record is a ``|``-separated string of four fields. The aggregated
    frame must exist, pass the ``is_consistent`` check, and contain exactly
    7 exemplars.
    """
    encoded_rows = [
        "1|2|A|A", "1|2|A|A", "1|2|A|A", "1|2|A|A", "1|2|A|A",
        "2|2|A|B", "2|2|A|A", "1|4|A|A", "1|2|B|A", "1|2|B|A",
        "1|2|A|A", "1|2|A|A", "4|5|C|A", "4|5|D|A", "2|5|D|A",
        "3|5|E|A", "4|5|F|A", "4|5|G|A", "4|5|H|A", "4|5|I|A",
        "4|5|J|A", "4|5|K|A", "4|5|L|A", "4|5|M|A", "4|5|N|A",
        "4|5|O|A", "4|5|P|A",
    ]
    # Split every record into its four fields before building the frame.
    table = [record.split("|") for record in encoded_rows]
    df = H2OFrame(table)

    aggregator = H2OAggregatorEstimator(target_num_exemplars=5,
                                        categorical_encoding="enum_limited")
    aggregator.train(training_frame=df)

    exemplars = aggregator.aggregated_frame
    assert exemplars is not None, "Trained model should produce not empty aggregated frame"
    assert is_consistent(df.nrows, exemplars), "Exemplar counts should sum up to number of training rows"
    # Expected count taken from AggregatorTest.java
    assert aggregator.aggregated_frame.nrows == 7, "Number of exemplars of this test case should be 7"