# The snippets below are collected from the pyvtreat test suite; their import
# blocks are not part of the excerpts. A plausible shared preamble
# (reconstructed here, not taken from the original files) would be:

import os
import warnings

import numpy
import numpy as np
import pandas
import pandas as pd
import pytest
import sklearn.linear_model
import sklearn.metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import data_algebra.BigQuery
import data_algebra.SQLite
import data_algebra.test_util
from data_algebra.data_ops import TableDescription, descr

import vtreat
import vtreat as vt
import vtreat.cross_plan
import vtreat.stats_utils
import vtreat.transform
import vtreat.util
from vtreat.vtreat_db_adapter import as_data_algebra_pipeline

import xgboost as xg

Example 1

# Calling fit_transform without supplying the outcome should raise for the
# Numeric, Binomial, and Multinomial treatments when no outcome_name is set.
def test_outcome_name_required():

    numpy.random.seed(235)
    d = pandas.DataFrame({"x": ["1", "1", "1", "2", "2", "2"]})
    y = [1, 2, 3, 4, 5, 6]

    transform = vtreat.NumericOutcomeTreatment(
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}))
    transform.fit_transform(d, y)
    with pytest.raises(Exception):
        transform.fit_transform(d)

    transform = vtreat.BinomialOutcomeTreatment(
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
        outcome_target=3,
    )
    transform.fit_transform(d, y)
    with pytest.raises(Exception):
        transform.fit_transform(d)

    transform = vtreat.vtreat_api.MultinomialOutcomeTreatment(
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}))
    transform.fit_transform(d, y)
    with pytest.raises(Exception):
        transform.fit_transform(d)
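
# A minimal contrasting sketch (not part of the original test): when
# outcome_name is given and the outcome column is present in the frame,
# fit_transform may be called with the frame alone, e.g.:
#
#     transform = vtreat.NumericOutcomeTreatment(
#         outcome_name="y",
#         params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
#     )
#     d2 = d.copy()
#     d2["y"] = y
#     transform.fit_transform(d2)  # no separate outcome argument needed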

Example 2
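# Exercises BinomialOutcomeTreatment: with filter_to_recommended=False the
# noise column x2 survives; with filter_to_recommended=True it is filtered
# out. Also demonstrates get_result_restriction() / set_result_restriction().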
def test_classification():
    numpy.random.seed(46546)

    def make_data(nrows):
        d = pandas.DataFrame({"x": [0.1 * i for i in range(nrows)]})
        d["y"] = d["x"] + numpy.sin(
            d["x"]) + 0.1 * numpy.random.normal(size=d.shape[0])
        d["xc"] = ["level_" + str(5 * numpy.round(yi / 5, 1)) for yi in d["y"]]
        d["x2"] = numpy.random.normal(size=d.shape[0])
        d.loc[d["xc"] == "level_-1.0",
              "xc"] = numpy.nan  # introduce a nan level
        d["yc"] = d["y"] > 0.5
        return d

    d = make_data(500)
    vars = [c for c in d.columns if c not in set(["y", "yc"])]
    d_test = make_data(100)

    transform = vtreat.BinomialOutcomeTreatment(
        outcome_name="yc",  # outcome variable
        outcome_target=True,  # outcome of interest
        cols_to_copy=[
            "y"
        ],  # columns to "carry along" but not treat as input variables
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
    )
    d_prepared = transform.fit_transform(d[vars], d["yc"])

    # show vars are under control
    assert transform.get_result_restriction() is None
    assert "x2" in set(d_prepared.columns)

    transform.set_result_restriction(["xc_logit_code", "x2"])
    dt_prepared = transform.transform(d_test)
    assert set(dt_prepared.columns) == set(["y", "yc", "x2", "xc_logit_code"])

    transform = vtreat.BinomialOutcomeTreatment(
        outcome_name="yc",  # outcome variable
        outcome_target=True,  # outcome of interest
        cols_to_copy=[
            "y"
        ],  # columns to "carry along" but not treat as input variables
        params=vtreat.vtreat_parameters({"filter_to_recommended": True}),
    )
    d_prepared = transform.fit_transform(d[vars], d["yc"])

    assert transform.get_result_restriction() is not None
    assert "x2" not in transform.get_result_restriction()
    assert "x2" not in set(d_prepared.columns)

    transform.set_result_restriction(["xc_logit_code", "x2"])
    dt_prepared = transform.transform(d_test)
    assert set(dt_prepared.columns) == set(["y", "yc", "x2", "xc_logit_code"])

Example 3
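# Checks numeric cleaning: NaN, +/-inf, and None are all treated as "bad"
# values, imputed with the mean of the valid entries (0.5 here), and flagged
# in the derived x_is_bad column.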
def test_nan_inf():
    numpy.random.seed(235)
    d = pandas.DataFrame({
        "x": [1.0, numpy.nan, numpy.inf, -numpy.inf, None, 0],
        "y": [1, 2, 3, 4, 5, 6]
    })

    transform = vtreat.NumericOutcomeTreatment(
        outcome_name="y",
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
    )

    d_treated = transform.fit_transform(d, d["y"])

    for c in d_treated.columns:
        assert vtreat.util.can_convert_v_to_numeric(d_treated[c])
        assert numpy.sum(vtreat.util.is_bad(d_treated[c])) == 0

    expect = pandas.DataFrame({
        "x": [1.0, 0.5, 0.5, 0.5, 0.5, 0],
        "x_is_bad": [0, 1, 1, 1, 1, 0],
        "y": [1, 2, 3, 4, 5, 6],
    })

    for c in expect.columns:
        ec = numpy.asarray(expect[c])
        ed = numpy.asarray(d_treated[c])
        assert numpy.max(numpy.abs(ec - ed)) < 1.0e-6

Example 4
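# Regression test for a reported issue: fit_transform on a frame mixing
# numeric, string, boolean, and constant columns (with missing rows inserted
# by reindex) used to raise an exception.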
def test_r1_issue():
    plan = vtreat.NumericOutcomeTreatment(
        outcome_name="y",
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
    )

    # from https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
    df = pandas.DataFrame(
        numpy.random.randn(5, 3),
        index=["a", "c", "e", "f", "h"],
        columns=["one", "two", "three"],
    )
    df["four"] = "foo"
    df["five"] = df["one"] > 0
    df2 = df.reindex(["a", "b", "c", "d", "e", "f", "g", "h"])
    df2.reset_index(inplace=True, drop=True)
    df2["y"] = range(df2.shape[0])
    df2.loc[3, "four"] = "blog"
    df2["const"] = 1

    vtreat.util.is_bad(df2["five"])  # smoke-check is_bad on a boolean column (result unused)
    prepped = plan.fit_transform(df2, df2["y"])  # used to raise an exception

    for c in prepped.columns:
        assert vtreat.util.can_convert_v_to_numeric(prepped[c])
        assert numpy.sum(vtreat.util.is_bad(prepped[c])) == 0

Example 5
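# Round-trips a NumericOutcomeTreatment through description_matrix() and
# as_data_algebra_pipeline(), checking that the exported pipeline reproduces
# the in-memory transform().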
def test_db_adapter_monster():
    outcome_name = "y"
    row_id_name = 'row_id'
    n_vars = 5

    def mk_data(n_rows: int = 100):
        step = 1 / np.sqrt(n_vars)
        cols = dict()
        y = np.random.normal(size=n_rows)
        for i in range(n_vars):
            vname = f"v_{i}"
            v = np.random.choice(["a", "b"], replace=True, size=n_rows)
            y = y + np.where(v == "a", step, -step)
            cols[vname] = v
        vars = list(cols.keys())
        vars.sort()
        cols[outcome_name] = y
        cols[row_id_name] = range(n_rows)
        d = pd.DataFrame(cols)
        return d, vars

    d, vars = mk_data(100)
    d_app, _ = mk_data(10)
    cols_to_copy = [outcome_name, row_id_name]
    columns = vars + cols_to_copy

    treatment = vtreat.NumericOutcomeTreatment(
        cols_to_copy=cols_to_copy,
        outcome_name=outcome_name,
        params=vtreat.vtreat_parameters({
            "sparse_indicators": False,
            "filter_to_recommended": False,
        }),
    )
    d_train_treated = treatment.fit_transform(d)
    assert isinstance(d_train_treated, pd.DataFrame)
    d_app_treated = treatment.transform(d_app)

    transform_as_data = treatment.description_matrix()
    # transform_as_data.to_csv('example_transform.csv', index=False)

    ops = as_data_algebra_pipeline(
        source=descr(d_app=d),
        vtreat_descr=transform_as_data,
        treatment_table_name="transform_as_data",
        row_keys=[row_id_name],
    )

    ops_source = str(ops)
    assert isinstance(ops_source, str)

    d_app_res = ops.eval({
        "d_app": d_app,
        "transform_as_data": transform_as_data
    })
    assert data_algebra.test_util.equivalent_frames(d_app_treated, d_app_res)
    assert numpy.all([c in d_app_res.columns for c in cols_to_copy])

Example 6
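# Uses the treatment as an sklearn Pipeline step and checks that get_params()
# and set_params() expose vtreat parameters under the preprocessor__ prefix.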
def test_pipeparams():
    numpy.random.seed(2019)

    def make_data(nrows):
        d = pd.DataFrame({"x": 5 * numpy.random.normal(size=nrows)})
        d["y"] = numpy.sin(d["x"]) + 0.1 * numpy.random.normal(size=nrows)
        d.loc[numpy.arange(3, 10), "x"] = numpy.nan  # introduce missing numeric values
        d["xc"] = ["level_" + str(5 * numpy.round(yi / 5, 1)) for yi in d["y"]]
        d["x2"] = np.random.normal(size=nrows)
        d.loc[d["xc"] == "level_-1.0",
              "xc"] = numpy.nan  # introduce a nan level
        d["yc"] = d["y"] > 0.5
        return d

    df = make_data(500)

    df = df.drop(columns=["y"])

    transform = vtreat.BinomialOutcomeTreatment(
        outcome_target=True,
        params=vtreat.vtreat_parameters({"sparse_indicators": False}),
    )

    clf = Pipeline(steps=[
        ("preprocessor", transform),
        ("classifier", LogisticRegression(solver="lbfgs")),
    ])

    X, y = df, df.pop("yc")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    clf.fit(X_train, y_train)

    #%%

    t_params = transform.get_params()
    assert t_params["indicator_min_fraction"] is not None
    assert transform.get_params()["indicator_min_fraction"] != 0

    #%%

    p_params = clf.get_params()
    assert p_params["preprocessor__indicator_min_fraction"] is not None

    #%%

    clf.set_params(preprocessor__indicator_min_fraction=0)
    assert transform.get_params()["indicator_min_fraction"] == 0

    # no warning or error (pytest.warns(None) is no longer supported in
    # pytest >= 7; escalate any warning to an error instead)
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        clf.fit(X_train, y_train)

Example 7
def test_unexpected_nan():
    # confirm NaN processing is correct, even when none seen in training data
    numpy.random.seed(235)
    d = pandas.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 2, 3, 4, 5, 6]})

    transform = vtreat.NumericOutcomeTreatment(
        outcome_name="y",
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
    )

    d_treated = transform.fit_transform(d, d["y"])
    assert transform.score_frame_.shape[0] == 1
    assert "x" in set(transform.score_frame_["variable"])

    d_app = pandas.DataFrame({"x": [1, 2, numpy.nan, 4, None, 6]})
    assert numpy.any(numpy.isnan(d_app["x"]))
    d_app_treated = transform.transform(d_app)
    assert not numpy.any(numpy.isnan(d_app_treated["x"]))

Example 8
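# End-to-end database example: the fitted treatment is exported via
# description_matrix(), replayed as a data_algebra pipeline, and executed
# both in memory and in SQLite; all three results must agree.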
def test_db_adapter_1_cdata():
    # Example from:
    # https://github.com/WinVector/pyvtreat/blob/main/Examples/Database/vtreat_db_adapter.ipynb
    # Data from:
    # https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008

    # data_all = pd.read_csv("diabetes_head.csv")
    dir_path = os.path.dirname(os.path.realpath(__file__))
    data_all = pd.read_csv(os.path.join(dir_path, "diabetes_head.csv"))
    n = data_all.shape[0]
    data_all["orig_index"] = range(n)
    d_train = data_all.loc[range(n - 5), :].reset_index(inplace=False,
                                                        drop=True)
    d_app = data_all.loc[range(n - 5, n)].reset_index(inplace=False, drop=True)

    #%%

    outcome_name = "readmitted"
    cols_to_copy = ["orig_index", "encounter_id", "patient_nbr"
                    ] + [outcome_name]
    vars = ["time_in_hospital", "weight"]
    columns = vars + cols_to_copy

    # d_train.loc[:, columns]

    #%%

    treatment = vtreat.BinomialOutcomeTreatment(
        cols_to_copy=cols_to_copy,
        outcome_name=outcome_name,
        outcome_target=True,
        params=vtreat.vtreat_parameters({
            "sparse_indicators": False,
            "filter_to_recommended": False,
        }),
    )
    d_train_treated = treatment.fit_transform(d_train.loc[:, columns])

    d_app_treated = treatment.transform(d_app.loc[:, columns])

    # d_app_treated

    #%%

    transform_as_data = treatment.description_matrix()

    # transform_as_data

    #%%

    ops = as_data_algebra_pipeline(
        source=descr(d_app=d_app.loc[:, columns]),
        vtreat_descr=transform_as_data,
        treatment_table_name="transform_as_data",
        row_keys=['orig_index'],
    )

    # print(ops)

    #%%

    transformed = ops.eval({
        "d_app": d_app.loc[:, columns],
        "transform_as_data": transform_as_data
    })

    # transformed

    #%%

    assert data_algebra.test_util.equivalent_frames(transformed, d_app_treated)

    #%%

    db_handle = data_algebra.SQLite.example_handle()

    sql = db_handle.to_sql(ops)
    assert isinstance(sql, str)
    # print(sql)

    #%%

    db_handle.insert_table(d_app.loc[:, columns], table_name="d_app")
    db_handle.insert_table(transform_as_data, table_name="transform_as_data")

    db_handle.execute("CREATE TABLE res AS " + sql)

    res_db = db_handle.read_query(
        "SELECT * FROM res ORDER BY orig_index LIMIT 10")

    # res_db

    #%%

    assert data_algebra.test_util.equivalent_frames(res_db, d_app_treated)

    #%%

    db_handle.close()

Example 9
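# Broader adapter test: categorical and numeric variables with missing values
# and levels unseen at fit time; verifies that the pandas and SQLite execution
# paths match the direct transform().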
def test_db_adapter_general():

    # set up example data
    def mk_data(
        n_rows: int = 100,
        *,
        outcome_name: str = "y",
        n_cat_vars: int = 5,
        n_num_vars: int = 5,
        add_unknowns: bool = False,
    ):
        step = 1 / np.sqrt(n_cat_vars + n_num_vars)
        cols = dict()
        y = np.random.normal(size=n_rows)
        for i in range(n_cat_vars):
            vname = f"vc_{i}"
            levels = ["a", "b", "c", "none"]
            if add_unknowns:
                levels = levels + ["d"]
            level_values = {
                v: step * np.random.normal(size=1)[0]
                for v in levels
            }
            v = np.random.choice(levels, replace=True, size=n_rows)
            y = y + np.array([level_values[vi] for vi in v])
            v = np.array([vi if vi != "none" else None for vi in v])
            cols[vname] = v
        for i in range(n_num_vars):
            vname = f"vn_{i}"
            v = np.random.normal(size=n_rows)
            y = y + step * v
            v[np.random.uniform(size=n_rows) < 0.24] = None
            cols[vname] = v

        vars = list(cols.keys())
        vars.sort()
        cols[outcome_name] = y
        d = pd.DataFrame(cols)
        d["orig_index"] = range(d.shape[0])
        return d, outcome_name, vars

    d, outcome_name, vars = mk_data(100)
    d_app, _, _ = mk_data(50, add_unknowns=True)
    cols_to_copy = [outcome_name, "orig_index"]
    columns = vars + cols_to_copy

    # get reference result
    treatment = vtreat.NumericOutcomeTreatment(
        cols_to_copy=cols_to_copy,
        outcome_name=outcome_name,
        params=vtreat.vtreat_parameters({
            "sparse_indicators": False,
            "filter_to_recommended": False,
        }),
    )
    d_train_treated = treatment.fit_transform(d)
    assert isinstance(d_train_treated, pd.DataFrame)
    d_app_treated = treatment.transform(d_app)

    # test ops path
    transform_as_data = treatment.description_matrix()
    ops = as_data_algebra_pipeline(
        source=descr(d_app=d),
        vtreat_descr=transform_as_data,
        treatment_table_name="transform_as_data",
        row_keys=["orig_index"],
    )
    ops_source = str(ops)
    assert isinstance(ops_source, str)
    d_app_res = ops.eval({
        "d_app": d_app,
        "transform_as_data": transform_as_data
    })
    assert data_algebra.test_util.equivalent_frames(d_app_treated, d_app_res)

    # test ops db path
    source_descr = TableDescription(  # schema of d_app (defined but not used below)
        table_name="d_app",
        column_names=columns,
    )
    db_handle = data_algebra.SQLite.example_handle()
    db_handle.insert_table(d_app.loc[:, columns], table_name="d_app")
    db_handle.insert_table(transform_as_data, table_name="transform_as_data")
    db_handle.execute("CREATE TABLE res AS " + db_handle.to_sql(ops))
    res_db = db_handle.read_query("SELECT * FROM res ORDER BY orig_index")
    assert data_algebra.test_util.equivalent_frames(res_db, d_app_treated)
    db_handle.close()

Example 10
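# Full KDD2009 churn workflow: y-stratified cross plan, treatment fit,
# optional xicor variable screening, a logistic-regression sanity check,
# comparison against a saved result, and export of the plan as a
# data_algebra / SQL pipeline.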
def test_KDD2009_vtreat_1():
    data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            'KDD2009')
    test_on_BigQuery = False
    test_xicor = True
    # data from https://github.com/WinVector/PDSwR2/tree/master/KDD2009
    expect_test = pandas.read_csv(os.path.join(data_dir,
                                               'test_processed.csv.gz'),
                                  compression='gzip')
    d = pandas.read_csv(os.path.join(data_dir, 'orange_small_train.data.gz'),
                        sep='\t',
                        header=0)
    orig_vars = list(d.columns)
    # Read in dependent variable we are trying to predict.
    churn = pandas.read_csv(os.path.join(
        data_dir, 'orange_small_train_churn.labels.txt'),
                            header=None)
    churn.columns = ["churn"]
    churn['churn'] = churn['churn'] == 1  # replace with True / False
    # Arrange test/train split.
    numpy.random.seed(2020)
    n = d.shape[0]
    # https://github.com/WinVector/pyvtreat/blob/master/Examples/CustomizedCrossPlan/CustomizedCrossPlan.md
    split1 = vtreat.cross_plan.KWayCrossPlanYStratified().split_plan(
        n_rows=n, k_folds=10, y=churn.iloc[:, 0])
    train_idx = set(split1[0]['train'])
    is_train = [i in train_idx for i in range(n)]
    is_test = numpy.logical_not(is_train)
    d['orig_index'] = range(d.shape[0])
    d_train = d.loc[is_train, :].reset_index(drop=True, inplace=False)
    churn_train = numpy.asarray(churn.loc[is_train, :]["churn"])
    d_test = d.loc[is_test, :].reset_index(drop=True, inplace=False)
    churn_test = numpy.asarray(churn.loc[is_test, :]["churn"])
    # build treatment plan
    plan = vtreat.BinomialOutcomeTreatment(outcome_target=True,
                                           outcome_name='churn',
                                           cols_to_copy=['orig_index'],
                                           params=vtreat.vtreat_parameters({
                                               'filter_to_recommended': True,
                                               'sparse_indicators': True,
                                           }))
    cross_frame = plan.fit_transform(d_train, churn_train)
    test_processed = plan.transform(d_test)
    # check we got lots of variables, as seen in worksheet
    rec = plan.score_frame_.loc[plan.score_frame_.recommended, :]
    vc = rec.treatment.value_counts()
    treatments_seen = set(vc.index)
    assert numpy.all([
        t in treatments_seen for t in [
            'missing_indicator', 'indicator_code', 'logit_code',
            'prevalence_code', 'clean_copy'
        ]
    ])
    assert numpy.min(vc) >= 10
    model_vars = list(rec['variable'])
    if test_xicor:
        ## xicor
        # all_vars = list(set(plan.score_frame_["variable"]))
        all_vars = [
            c for c in cross_frame.columns if c not in ['churn', 'orig_index']
        ]
        xicor_scores = vtreat.stats_utils.xicor_for_frame(
            cross_frame.loc[:, all_vars],
            numpy.asarray(churn_train, dtype=float),
            n_reps=5)
        xicor_picked = list(xicor_scores.loc[xicor_scores['xicor'] > 0.0,
                                             'variable'])
        model_vars = xicor_picked
    # try a simple model
    model = sklearn.linear_model.LogisticRegression(max_iter=1000)
    with pytest.warns(UserWarning):  # densifying warns
        model.fit(cross_frame.loc[:, model_vars], churn_train)
    with pytest.warns(UserWarning):  # densifying warns
        preds_test = model.predict_proba(test_processed.loc[:, model_vars])
    with pytest.warns(UserWarning):  # densifying warns
        preds_train = model.predict_proba(cross_frame.loc[:, model_vars])
    fpr, tpr, _ = sklearn.metrics.roc_curve(churn_test, preds_test[:, 1])
    auc_test = sklearn.metrics.auc(fpr, tpr)
    fpr, tpr, _ = sklearn.metrics.roc_curve(churn_train, preds_train[:, 1])
    auc_train = sklearn.metrics.auc(fpr, tpr)
    assert auc_test > 0.6  # not good!
    assert abs(auc_test - auc_train) < 0.05  # at least not overfit!
    # check against previous result
    assert test_processed.shape == expect_test.shape
    assert set(test_processed.columns) == set(expect_test.columns)
    assert numpy.max(numpy.max(numpy.abs(test_processed - expect_test))) < 1e-3
    # test transform conversion
    transform_as_data = plan.description_matrix()
    incoming_vars = list(set(transform_as_data['orig_var']))
    ops = vtreat.vtreat_db_adapter.as_data_algebra_pipeline(
        source=TableDescription(table_name='d_test',
                                column_names=incoming_vars + ['orig_index']),
        vtreat_descr=transform_as_data,
        treatment_table_name='transform_as_data',
        row_keys=['orig_index'],
    )
    test_by_pipeline = ops.eval({
        'd_test': d_test.loc[:, incoming_vars + ['orig_index']],
        'transform_as_data': transform_as_data,
    })
    assert test_by_pipeline.shape[0] == test_processed.shape[0]
    assert test_by_pipeline.shape[1] >= test_processed.shape[1]
    assert not numpy.any(numpy.isnan(test_by_pipeline))
    test_pipeline_cols = set(test_by_pipeline.columns)
    assert numpy.all([c in test_pipeline_cols for c in test_processed.columns])
    test_cols_sorted = list(test_processed.columns)
    test_cols_sorted.sort()
    assert numpy.max(
        numpy.max(
            numpy.abs(test_processed[test_cols_sorted] -
                      test_by_pipeline[test_cols_sorted]))) < 1e-5
    # data algebra pipeline in database
    sql = data_algebra.BigQuery.BigQueryModel().to_sql(ops)
    assert isinstance(sql, str)
    if test_on_BigQuery:
        db_handle = data_algebra.BigQuery.example_handle()
        db_handle.drop_table('d_test_processed')
        db_handle.insert_table(d_test.loc[:, incoming_vars + ['orig_index']],
                               table_name='d_test',
                               allow_overwrite=True)
        db_handle.insert_table(transform_as_data,
                               table_name='transform_as_data',
                               allow_overwrite=True)
        db_handle.execute(
            f"CREATE TABLE {db_handle.db_model.table_prefix}.d_test_processed AS {db_handle.to_sql(ops)}"
        )
        db_res = db_handle.read_query(
            f"SELECT * FROM {db_handle.db_model.table_prefix}.d_test_processed ORDER BY orig_index"
        )
        assert db_res.shape[0] == test_processed.shape[0]
        assert numpy.max(
            numpy.max(
                numpy.abs(test_processed[test_cols_sorted] -
                          db_res[test_cols_sorted]))) < 1e-5
        db_handle.drop_table('d_test')
        db_handle.drop_table('transform_as_data')
        db_handle.drop_table('d_test_processed')
        db_handle.close()

Example 11
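# Fits BinomialOutcomeTreatment on a diabetes data sample and pins the exact
# set of derived column names, then re-fits with filter_to_recommended=True.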
def test_diabetes_example():
    dir_path = os.path.dirname(os.path.realpath(__file__))
    data = pandas.read_pickle(os.path.join(dir_path, 'diabetes_head.pkl'))
    assert data.shape[0] == 1000

    # from AI200: day_04/ZZ_homework/soln_dont_peek/diabetes_soln.ipynb

    # sklearn.preprocessing.OneHotEncoder could
    # also perform this task well.

    # documentation:
    #  https://github.com/WinVector/pyvtreat/blob/main/Examples/Classification/Classification.md
    treatment = vtreat.BinomialOutcomeTreatment(
        cols_to_copy=['encounter_id', 'patient_nbr', 'readmitted'],
        outcome_name='readmitted',
        outcome_target=True,
        params=vtreat.vtreat_parameters({
            'sparse_indicators': False,
            'filter_to_recommended': False,
        }),
    )
    data_treated = treatment.fit_transform(data)

    assert data_treated.shape[0] == data.shape[0]

    expect = {
        'A1Cresult_lev_None', 'A1Cresult_lev__gt_8', 'A1Cresult_logit_code',
        'A1Cresult_prevalence_code', 'acarbose_lev_No', 'acarbose_logit_code',
        'acarbose_prevalence_code', 'admission_source_id_lev_1',
        'admission_source_id_lev_7', 'admission_source_id_logit_code',
        'admission_source_id_prevalence_code', 'admission_type_id_lev_1',
        'admission_type_id_lev_2', 'admission_type_id_lev_6',
        'admission_type_id_logit_code', 'admission_type_id_prevalence_code',
        'age_lev__osq_40-50_cp_', 'age_lev__osq_50-60_cp_',
        'age_lev__osq_60-70_cp_', 'age_lev__osq_70-80_cp_',
        'age_lev__osq_80-90_cp_', 'age_logit_code', 'age_prevalence_code',
        'change_lev_Ch', 'change_lev_No', 'change_logit_code',
        'change_prevalence_code', 'chlorpropamide_lev_No',
        'chlorpropamide_logit_code', 'chlorpropamide_prevalence_code',
        'diabetesMed_lev_No', 'diabetesMed_lev_Yes', 'diabetesMed_logit_code',
        'diabetesMed_prevalence_code', 'diag_1_is_bad', 'diag_1_lev_414',
        'diag_1_logit_code', 'diag_1_prevalence_code', 'diag_2_is_bad',
        'diag_2_logit_code', 'diag_2_prevalence_code', 'diag_3_is_bad',
        'diag_3_lev_250', 'diag_3_logit_code', 'diag_3_prevalence_code',
        'discharge_disposition_id_lev_1', 'discharge_disposition_id_lev_25',
        'discharge_disposition_id_logit_code',
        'discharge_disposition_id_prevalence_code', 'encounter_id',
        'gender_lev_Female', 'gender_lev_Male', 'gender_logit_code',
        'gender_prevalence_code', 'glimepiride_lev_No',
        'glimepiride_logit_code', 'glimepiride_prevalence_code',
        'glipizide_lev_No', 'glipizide_lev_Steady', 'glipizide_logit_code',
        'glipizide_prevalence_code', 'glyburide_lev_No',
        'glyburide_logit_code', 'glyburide_prevalence_code',
        'insulin_lev_Down', 'insulin_lev_No', 'insulin_lev_Steady',
        'insulin_logit_code', 'insulin_prevalence_code',
        'max_glu_serum_lev_None', 'max_glu_serum_logit_code',
        'max_glu_serum_prevalence_code', 'medical_specialty_is_bad',
        'medical_specialty_lev_Cardiology',
        'medical_specialty_lev_Family/GeneralPractice',
        'medical_specialty_lev_InternalMedicine', 'medical_specialty_lev__NA_',
        'medical_specialty_logit_code', 'medical_specialty_prevalence_code',
        'metformin_lev_No', 'metformin_lev_Steady', 'metformin_logit_code',
        'metformin_prevalence_code', 'num_lab_procedures', 'num_medications',
        'num_procedures', 'number_diagnoses', 'number_emergency',
        'number_inpatient', 'number_outpatient', 'patient_nbr',
        'pioglitazone_lev_No', 'pioglitazone_logit_code',
        'pioglitazone_prevalence_code', 'race_is_bad',
        'race_lev_AfricanAmerican', 'race_lev_Caucasian', 'race_logit_code',
        'race_prevalence_code', 'readmitted', 'repaglinide_lev_No',
        'repaglinide_logit_code', 'repaglinide_prevalence_code', 'revisit',
        'rosiglitazone_lev_No', 'rosiglitazone_logit_code',
        'rosiglitazone_prevalence_code', 'time_in_hospital',
        'tolazamide_lev_No', 'tolazamide_logit_code',
        'tolazamide_prevalence_code', 'tolbutamide_lev_No',
        'tolbutamide_logit_code', 'tolbutamide_prevalence_code',
        'troglitazone_lev_No', 'troglitazone_logit_code',
        'troglitazone_prevalence_code', 'visit_number', 'weight_is_bad',
        'weight_lev__NA_', 'weight_logit_code', 'weight_prevalence_code'
    }
    assert set(data_treated.columns) == expect

    treatment = vtreat.BinomialOutcomeTreatment(
        cols_to_copy=['encounter_id', 'patient_nbr', 'readmitted'],
        outcome_name='readmitted',
        outcome_target=True,
        params=vtreat.vtreat_parameters({
            'sparse_indicators': False,
            'filter_to_recommended': True,
        }),
    )
    data_treated = treatment.fit_transform(data)

    assert data_treated.shape[0] == data.shape[0]
    assert data_treated.shape[1] >= 10

Example 12
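# Defines a custom vtreat.transform.UserTransform (per-variable ridge-fit
# polynomial features) and plugs it into NumericOutcomeTreatment via the
# user_transforms parameter.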
def test_user_coders():
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        # avoid depending on sklearn.metrics.r2_score
        def r_squared(*, y_true, y_pred):
            y_true = numpy.asarray(y_true)
            y_pred = numpy.asarray(y_pred)
            return 1 - numpy.sum((y_true - y_pred) ** 2) / numpy.sum(
                (y_true - numpy.mean(y_true)) ** 2
            )

        # %%

        class PolyTransform(vtreat.transform.UserTransform):
            """a polynomial model"""

            def __init__(self, *, deg=5, alpha=0.1):
                vtreat.transform.UserTransform.__init__(self, treatment="poly")
                self.models_ = None
                self.deg = deg
                self.alpha = alpha

            def poly_terms(self, vname, vec):
                vec = numpy.asarray(vec)
                r = pandas.DataFrame({"x": vec})
                for d in range(1, self.deg + 1):
                    r[vname + "_" + str(d)] = vec ** d
                return r

            def fit(self, X, y):
                self.models_ = {}
                self.incoming_vars_ = []
                self.derived_vars_ = []
                for v in X.columns:
                    if vtreat.util.can_convert_v_to_numeric(X[v]):
                        X_v = self.poly_terms(v, X[v])
                        model_v = sklearn.linear_model.Ridge(alpha=self.alpha).fit(
                            X_v, y
                        )
                        new_var = v + "_poly"
                        self.models_[v] = (model_v, [c for c in X_v.columns], new_var)
                        self.incoming_vars_.append(v)
                        self.derived_vars_.append(new_var)
                return self

            def transform(self, X):
                r = pandas.DataFrame()
                for k, v in self.models_.items():
                    model_k = v[0]
                    cols_k = v[1]
                    new_var = v[2]
                    X_k = self.poly_terms(k, X[k])
                    xform_k = model_k.predict(X_k)
                    r[new_var] = xform_k
                return r

        # %%

        d = pandas.DataFrame({"x": [i for i in range(100)]})
        d["y"] = numpy.sin(0.2 * d["x"]) + 0.2 * numpy.random.normal(size=d.shape[0])
        d.head()

        # %%

        step = PolyTransform(deg=10)

        # %%

        fit = step.fit_transform(d[["x"]], d["y"])
        fit["x"] = d["x"]
        fit.head()

        # %%

        # seaborn.scatterplot(x='x', y='y', data=d)
        # seaborn.lineplot(x='x', y='x_poly', data=fit, color='red', alpha=0.5)

        # %%

        transform = vtreat.NumericOutcomeTreatment(
            outcome_name="y",
            params=vtreat.vtreat_parameters(
                {
                    "filter_to_recommended": False,
                    "user_transforms": [PolyTransform(deg=10)],
                }
            ),
        )

        # %%

        transform.fit(d, d["y"])

        # %%

        transform.score_frame_

        # %%

        x2_overfit = transform.transform(d)

        # %%
        # seaborn.scatterplot(x='x', y='y', data=x2_overfit)
        # seaborn.lineplot(x='x', y='x_poly', data=x2_overfit, color='red', alpha=0.5)

        # %%

        x2 = transform.fit_transform(d, d["y"])

        # %%

        transform.score_frame_

        # %%

        x2.head()

Example 13
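# Script fragment (not a test): maps labels, drops identifier columns, builds
# a vtreat cross-frame, splits train/test, and sets up an XGBoost classifier.
# Note: targets, dmap, and variables are defined earlier in the source script
# and are not shown here.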
labels = targets.map(dmap).fillna(1)
print(labels.value_counts())

variables.drop(columns=[
    'POST_PD_x', 'POST_PD_y', 'join', 'ICCE', 'PROVIDER_NAME', 'GENDER', 'NPI',
    'PROVIDER_REPORTING_SPECIALTY', 'PROVIDER_SPECIALTY'
], inplace=True)  # note: passing axis= together with columns= raises a TypeError
print(variables.columns.values)

## Data Prep for Train
plan = vt.BinomialOutcomeTreatment(outcome_target=True,
                                   params=vt.vtreat_parameters({
                                       'filter_to_recommended': False,
                                       'sparse_indicators': False,
                                   }))

cross_frame = plan.fit_transform(variables, labels)
print(cross_frame.dtypes)
print(cross_frame.shape)
print(cross_frame)
## Split into Test/Train
train_features, test_features, train_labels, test_labels = train_test_split(
    cross_frame, labels, test_size=0.2, random_state=42, shuffle=True)
model_vars = np.asarray(
    plan.score_frame_['variable'][plan.score_frame_['recommended']])

rf = xg.XGBClassifier(objective='binary:logistic')