Exemple #1
0
def test_id_col_check():
    """Expect ``fit_transform`` to emit a warning for this small string frame.

    Column "x" holds a different value in every row while "y" repeats a
    level; presumably the all-distinct "x" is what triggers the warning
    (id-like column) -- confirm against vtreat.
    """
    frame = pandas.DataFrame(
        {
            "x": ["a", "b", "c"],
            "y": ["a", "b", "b"],
        }
    )
    treatment = vtreat.UnsupervisedTreatment(var_list=["x", "y"])

    # Any Warning subclass raised during the fit satisfies the check.
    with pytest.warns(Warning):
        treatment.fit_transform(frame)
Exemple #2
0
def test_unsupervised():
    """Exercise UnsupervisedTreatment on one categorical and three constant columns.

    Checks that:
      * every treated column converts to numeric and has no bad values,
      * the score frame only reports "zip" as an original variable,
      * re-applying ``transform`` emits no warnings and reproduces the
        ``fit_transform`` result exactly,
      * ``get_feature_names`` agrees with the score frame's "variable" column.
    """
    # Local import so this block does not rely on a module-level import.
    import warnings

    numpy.random.seed(235)
    # Named zip_codes rather than `zip` to avoid shadowing the builtin.
    zip_codes = ["z" + str(i + 1).zfill(5) for i in range(15)]
    d = pandas.DataFrame({"zip": numpy.random.choice(zip_codes, size=1000)})
    d["const"] = 1
    d["const2"] = "b"
    d["const3"] = None

    transform = vtreat.UnsupervisedTreatment(
        params=vtreat.unsupervised_parameters({"indicator_min_fraction": 0.01})
    )

    d_treated = transform.fit_transform(d)

    for c in d_treated.columns:
        assert vtreat.util.can_convert_v_to_numeric(d_treated[c])
        assert numpy.sum(vtreat.util.is_bad(d_treated[c])) == 0

    sf = transform.score_frame_
    assert set(sf["orig_variable"]) == {"zip"}

    # `pytest.warns(None)` (https://stackoverflow.com/a/45671804/6901725) is
    # deprecated in pytest 7 and raises TypeError in pytest 8, so record
    # warnings with the standard library instead.  "always" disables the
    # once-per-location filter so nothing is silently dropped.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        d_treated_2 = transform.transform(d)
    assert len(record) == 0, [str(w.message) for w in record]
    assert d_treated.equals(d_treated_2)
    fn = transform.get_feature_names()
    assert set(sf["variable"]) == set(fn)
Exemple #3
0
def test_vtreat_onehot():
    """Check indicator (one-hot) coding with only clean_copy and indicator_code coders.

    The numeric column "xn" is expected to pass through unchanged, and each
    string column is expected to become one ``<col>_lev_<level>`` indicator
    per level.
    """
    source_columns = {
        "xc": ["a", "b", "b"],
        # vtreat picks columns to convert by type, so the digits stay strings
        "xd": ["1", "1", "2"],
        "xn": [1.0, 2.0, 3.0],
    }
    frame = pd.DataFrame(source_columns)

    coder_params = vtreat.unsupervised_parameters(
        {"coders": {"clean_copy", "indicator_code"}}
    )
    coder = vtreat.UnsupervisedTreatment(params=coder_params)
    coder.fit(frame)
    observed = coder.transform(frame)

    expected_columns = {
        "xn": [1.0, 2.0, 3.0],
        "xd_lev_1": [1.0, 1.0, 0.0],
        "xd_lev_2": [0.0, 0.0, 1.0],
        "xc_lev_b": [0.0, 1.0, 1.0],
        "xc_lev_a": [1.0, 0.0, 0.0],
    }
    expected = pd.DataFrame(expected_columns)

    assert data_algebra.test_util.equivalent_frames(
        observed, expected, check_row_order=True
    )
Exemple #4
0
def test_unsupervised():
    """Run UnsupervisedTreatment on a wider frame (10,000 rows, 22 columns).

    The frame has two constant columns, ten categorical "zip_<i>" columns
    drawn from ten levels, and ten uniform-random "num_<i>" columns.  Every
    treated column must convert to numeric and contain no bad values.
    """
    row_count = 10000
    level_count = 10
    categorical_count = 10
    numeric_count = 10
    numpy.random.seed(235)
    levels = [f"z{k + 1:05d}" for k in range(level_count)]

    frame = pandas.DataFrame({"const": numpy.zeros(row_count) + 1})
    frame["const2"] = "b"
    for k in range(categorical_count):
        frame[f"zip_{k}"] = numpy.random.choice(levels, size=row_count)
    for k in range(numeric_count):
        frame[f"num_{k}"] = numpy.random.uniform(size=row_count)

    treatment_params = vtreat.unsupervised_parameters(
        {"indicator_min_fraction": 0.01}
    )
    treatment = vtreat.UnsupervisedTreatment(params=treatment_params)

    # To profile the fit (https://docs.python.org/3/library/profile.html),
    # wrap the call below with cProfile.run(...).
    treated = treatment.fit_transform(frame)

    for column_name in treated.columns:
        column = treated[column_name]
        assert vtreat.util.can_convert_v_to_numeric(column)
        assert numpy.sum(vtreat.util.is_bad(column)) == 0

    # Read the score frame after fitting; no assertion is made on its content.
    score_frame = treatment.score_frame_
Exemple #5
0
def test_col_dups_1():
    """Expect ValueError from ``fit_transform`` on a frame with a repeated "x" label."""
    frame = pandas.DataFrame({"x": [1], "x2": [2], "y": [3]})
    # Relabel after construction so two columns share the name "x".
    duplicated_labels = ["x", "x", "y"]
    frame.columns = duplicated_labels

    treatment = vtreat.UnsupervisedTreatment(
        var_list=["x"],
        cols_to_copy=["y"],
    )

    with pytest.raises(ValueError):
        treatment.fit_transform(frame, frame["y"])
def test_col_dups_1():
    """A frame whose column labels are ['x', 'x', 'y'] must make fit_transform raise ValueError."""
    data = pandas.DataFrame({'x': [1], 'x2': [2], 'y': [3]})
    # Overwrite the labels so that 'x' appears twice.
    data.columns = ['x', 'x', 'y']

    treatment_kwargs = {'var_list': ['x'], 'cols_to_copy': ['y']}
    unsupervised = vtreat.UnsupervisedTreatment(**treatment_kwargs)

    with pytest.raises(ValueError):
        unsupervised.fit_transform(data, data["y"])
def test_id_col_check():
    """``fit_transform`` on this frame is expected to issue some Warning.

    'x' never repeats a value and 'y' does; presumably the all-distinct
    column is what gets flagged -- verify against vtreat.
    """
    columns = {'x': ['a', 'b', 'c'], 'y': ['a', 'b', 'b']}
    data = pandas.DataFrame(columns)

    unsupervised = vtreat.UnsupervisedTreatment(var_list=['x', 'y'])

    # pytest.warns fails the test if no warning of the given class is raised.
    with pytest.warns(Warning):
        unsupervised.fit_transform(data)
Exemple #8
0
def test_xgboost_col_name_issue_2():
    """Treated column names must be unique and free of the characters [ ] < >.

    Background: xgboost raises
    ValueError('feature_names may not contain [, ] or <'); see
    https://stackoverflow.com/questions/48645846/pythons-xgoost-valueerrorfeature-names-may-not-contain-or
    The input levels include those characters plus a repeated "_lt_" level.
    """
    frame = pandas.DataFrame({"x": ["[", "]", "<", "_lt_", "_lt_"]})

    treatment = vtreat.UnsupervisedTreatment(var_list=["x"])
    treated = treatment.fit_transform(frame, None)

    produced_names = treated.columns
    for name in produced_names:
        assert all(ch not in name for ch in "[]<>")
    # No two derived columns may share a name.
    assert len(set(produced_names)) == len(produced_names)
def test_unsupervised():
    """Fit UnsupervisedTreatment on one categorical and two constant columns.

    Every treated column must convert to numeric and contain no bad values.
    """
    numpy.random.seed(235)
    levels = [f"z{k + 1:05d}" for k in range(15)]
    frame = pandas.DataFrame({"zip": numpy.random.choice(levels, size=1000)})
    frame["const"] = 1
    frame["const2"] = "b"

    treatment_params = vtreat.unsupervised_parameters(
        {"indicator_min_fraction": 0.01}
    )
    treatment = vtreat.UnsupervisedTreatment(params=treatment_params)

    treated = treatment.fit_transform(frame)

    for column_name in treated.columns:
        column = treated[column_name]
        assert vtreat.util.can_convert_v_to_numeric(column)
        assert sum(vtreat.util.is_bad(column)) == 0

    # Read the score frame after fitting; no assertion is made on its content.
    score_frame = treatment.score_frame_
def test_homes_example():
    """Treat the pickled homes data set and check the resulting column names.

    Loads 'homes_76.pkl' from the directory containing this file, applies an
    UnsupervisedTreatment that copies 'Price' through, and verifies that the
    row count is preserved and the set of output columns matches.

    Origin: AI200 day_01/02_Regression/Part2_LRPractice/LRExample.ipynb
    Documentation:
    https://github.com/WinVector/pyvtreat/blob/main/Examples/Unsupervised/Unsupervised.md
    """
    here = os.path.dirname(os.path.realpath(__file__))
    pickle_path = os.path.join(here, 'homes_76.pkl')
    homes = pandas.read_pickle(pickle_path)
    assert homes.shape[0] == 38
    assert homes.shape[1] == 8

    treatment_params = vtreat.unsupervised_parameters({
        'sparse_indicators': False,
        'coders': {'clean_copy', 'indicator_code', 'missing_indicator'},
    })
    treatment = vtreat.UnsupervisedTreatment(
        cols_to_copy=['Price'],
        params=treatment_params,
    )
    treated = treatment.fit_transform(homes)

    assert treated.shape[0] == homes.shape[0]

    # Only the set of names is compared; column order is not checked.
    expected_names = {
        'Price', 'Size', 'Bath', 'Bed', 'Year', 'Garage',
        'Lot_lev_4', 'Lot_lev_5', 'Lot_lev_3',
        'Lot_lev_1', 'Lot_lev_2', 'Lot_lev_11',
        'Elem_lev_edge', 'Elem_lev_edison', 'Elem_lev_parker',
        'Elem_lev_harris', 'Elem_lev_adams', 'Elem_lev_crest',
    }
    assert set(treated.columns) == expected_names
def test_imputation_controls():
    """Check how the value imputed for a missing "x" is controlled.

    Each scenario fits an UnsupervisedTreatment (copying "y" through) on the
    same four-row frame, whose last "x" is missing, and compares the treated
    frame with an expectation that differs only in the imputed value:

      * no settings                           -> 333.6666666667
      * params missingness_imputation=median  -> 1.0
      * params missingness_imputation=min     -> 0.0
      * params constant 7, map on "y" only    -> 7.0
      * params constant 7, map "x" -> median  -> 1.0
      * params mean, map "x" -> 12            -> 12.0
    """
    d = pandas.DataFrame({
        "x": [0, 1, 1000, None],
        "y": [0, 0, 1, 1],
    })

    # (user parameter overrides or None, imputation_map or None, imputed x)
    scenarios = [
        (None, None, 333.6666666667),
        ({"missingness_imputation": numpy.median}, None, 1.0),
        ({"missingness_imputation": numpy.min}, None, 0.0),
        ({"missingness_imputation": 7}, {"y": numpy.median}, 7.0),
        ({"missingness_imputation": 7}, {"x": numpy.median}, 1.0),
        ({"missingness_imputation": numpy.mean}, {"x": 12}, 12.0),
    ]

    for user_params, imputation_map, imputed_x in scenarios:
        # Only pass the keyword arguments the scenario actually specifies.
        treatment_kwargs = {"cols_to_copy": ["y"]}
        if user_params is not None:
            treatment_kwargs["params"] = vtreat.unsupervised_parameters(
                user_params
            )
        if imputation_map is not None:
            treatment_kwargs["imputation_map"] = imputation_map

        transform = vtreat.UnsupervisedTreatment(**treatment_kwargs)
        d_treated = transform.fit_transform(d)

        expect = pandas.DataFrame({
            "y": [0, 0, 1, 1],
            "x_is_bad": [0.0, 0.0, 0.0, 1.0],
            "x": [0.0, 1.0, 1000.0, imputed_x],
        })
        vtreat.util.check_matching_numeric_frames(res=d_treated, expect=expect)