def test_nan_inf():
    """NaN/inf/None inputs must be imputed and flagged by the numeric treatment."""
    numpy.random.seed(235)
    d = pandas.DataFrame({
        "x": [1.0, numpy.nan, numpy.inf, -numpy.inf, None, 0],
        "y": [1, 2, 3, 4, 5, 6],
    })
    transform = vtreat.NumericOutcomeTreatment(
        outcome_name="y",
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
    )
    d_treated = transform.fit_transform(d, d["y"])
    # every produced column must be numeric and free of bad (nan/inf) values
    for col in d_treated.columns:
        assert vtreat.util.can_convert_v_to_numeric(d_treated[col])
        assert numpy.sum(vtreat.util.is_bad(d_treated[col])) == 0
    expect = pandas.DataFrame({
        "x": [1.0, 0.5, 0.5, 0.5, 0.5, 0],
        "x_is_bad": [0, 1, 1, 1, 1, 0],
        "y": [1, 2, 3, 4, 5, 6],
    })
    # compare column by column within a small numeric tolerance
    for col in expect.columns:
        want = numpy.asarray(expect[col])
        got = numpy.asarray(d_treated[col])
        assert numpy.max(numpy.abs(want - got)) < 1.0e-6
def test_outcome_name_required():
    """Calling fit_transform without an outcome must raise for each treatment type."""
    numpy.random.seed(235)
    d = pandas.DataFrame({"x": ["1", "1", "1", "2", "2", "2"]})
    y = [1, 2, 3, 4, 5, 6]

    # numeric outcome treatment
    numeric_plan = vtreat.NumericOutcomeTreatment(
        params=vtreat.vtreat_parameters({"filter_to_recommended": False})
    )
    numeric_plan.fit_transform(d, y)
    with pytest.raises(Exception):
        numeric_plan.fit_transform(d)

    # binomial outcome treatment
    binomial_plan = vtreat.BinomialOutcomeTreatment(
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
        outcome_target=3,
    )
    binomial_plan.fit_transform(d, y)
    with pytest.raises(Exception):
        binomial_plan.fit_transform(d)

    # multinomial outcome treatment
    multinomial_plan = vtreat.vtreat_api.MultinomialOutcomeTreatment(
        params=vtreat.vtreat_parameters({"filter_to_recommended": False})
    )
    multinomial_plan.fit_transform(d, y)
    with pytest.raises(Exception):
        multinomial_plan.fit_transform(d)
def test_r1_issue():
    """Regression test: a mixed-type frame with fully-missing rows used to raise."""
    plan = vtreat.NumericOutcomeTreatment(
        outcome_name="y",
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
    )
    # example frame adapted from
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html
    df = pandas.DataFrame(
        numpy.random.randn(5, 3),
        index=["a", "c", "e", "f", "h"],
        columns=["one", "two", "three"],
    )
    df["four"] = "foo"
    df["five"] = df["one"] > 0
    # re-index to introduce rows that are entirely missing
    df2 = df.reindex(["a", "b", "c", "d", "e", "f", "g", "h"])
    df2.reset_index(inplace=True, drop=True)
    df2["y"] = range(df2.shape[0])
    df2.loc[3, "four"] = "blog"
    df2["const"] = 1
    vtreat.util.is_bad(df2["five"])
    prepped = plan.fit_transform(df2, df2["y"])  # used to raise an exception
    # all prepared columns must be numeric and free of bad values
    for col in prepped.columns:
        assert vtreat.util.can_convert_v_to_numeric(prepped[col])
        assert numpy.sum(vtreat.util.is_bad(prepped[col])) == 0
def test_db_adapter_monster():
    """Treated frame must match replaying the fitted treatment as a data-algebra pipeline."""
    outcome_name = "y"
    row_id_name = "row_id"
    n_vars = 5

    def mk_data(n_rows: int = 100):
        # y is a noisy sum of +/- step contributions from each categorical var
        step = 1 / np.sqrt(n_vars)
        cols = dict()
        y = np.random.normal(size=n_rows)
        for i in range(n_vars):
            vname = f"v_{i}"
            v = np.random.choice(["a", "b"], replace=True, size=n_rows)
            y = y + np.where(v == "a", step, -step)
            cols[vname] = v
        vars = list(cols.keys())
        vars.sort()
        cols[outcome_name] = y
        cols[row_id_name] = range(n_rows)
        d = pd.DataFrame(cols)
        return d, vars

    d, vars = mk_data(100)
    d_app, _ = mk_data(10)
    cols_to_copy = [outcome_name, row_id_name]
    # (removed unused local: columns = vars + cols_to_copy)
    treatment = vtreat.NumericOutcomeTreatment(
        cols_to_copy=cols_to_copy,
        outcome_name=outcome_name,
        params=vtreat.vtreat_parameters({
            "sparse_indicators": False,
            "filter_to_recommended": False,
        }),
    )
    d_train_treated = treatment.fit_transform(d)
    assert isinstance(d_train_treated, pd.DataFrame)
    d_app_treated = treatment.transform(d_app)
    # replay the fitted treatment as a data algebra pipeline
    transform_as_data = treatment.description_matrix()
    ops = as_data_algebra_pipeline(
        source=descr(d_app=d),
        vtreat_descr=transform_as_data,
        treatment_table_name="transform_as_data",
        row_keys=[row_id_name],
    )
    ops_source = str(ops)
    assert isinstance(ops_source, str)
    d_app_res = ops.eval({
        "d_app": d_app,
        "transform_as_data": transform_as_data,
    })
    assert data_algebra.test_util.equivalent_frames(d_app_treated, d_app_res)
    assert numpy.all([c in d_app_res.columns for c in cols_to_copy])
def test_unexpected_nan():
    """NaN handling must be correct even when no NaN was seen in training data."""
    numpy.random.seed(235)
    d = pandas.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 2, 3, 4, 5, 6]})
    transform = vtreat.NumericOutcomeTreatment(
        outcome_name="y",
        params=vtreat.vtreat_parameters({"filter_to_recommended": False}),
    )
    d_treated = transform.fit_transform(d, d["y"])
    # only the clean copy of x should appear in the score frame
    assert transform.score_frame_.shape[0] == 1
    assert "x" in set(transform.score_frame_["variable"])
    # application data has missing values the training data never showed;
    # use numpy.nan (the numpy.NAN alias was removed in NumPy 2.0)
    d_app = pandas.DataFrame({"x": [1, 2, numpy.nan, 4, None, 6]})
    assert numpy.any(numpy.isnan(d_app["x"]))
    d_app_treated = transform.transform(d_app)
    assert not numpy.any(numpy.isnan(d_app_treated["x"]))
def test_classification():
    """Numeric treatment over data with a nan-holding categorical variable."""
    numpy.random.seed(46546)

    def make_data(nrows):
        # BUG FIX: row count was hard-coded to range(500), silently ignoring
        # nrows, so make_data(5000) and make_data(450) produced identical
        # 500-row frames; now honor the requested size.
        d = pandas.DataFrame({"x": [0.1 * i for i in range(nrows)]})
        d["y"] = d["x"] + numpy.sin(d["x"]) + 0.1 * numpy.random.normal(size=d.shape[0])
        d["xc"] = ["level_" + str(5 * numpy.round(yi / 5, 1)) for yi in d["y"]]
        d["x2"] = numpy.random.normal(size=d.shape[0])
        d.loc[d["xc"] == "level_-1.0", "xc"] = numpy.nan  # introduce a nan level
        d["yc"] = d["y"] > 0.5
        return d

    d = make_data(5000)
    transform = vtreat.NumericOutcomeTreatment(
        outcome_name="y",  # outcome variable
        cols_to_copy=["yc"],  # carried along, not treated as an input variable
    )
    d_prepared = transform.fit_transform(d, d["y"])
    for c in d_prepared.columns:
        assert vtreat.util.can_convert_v_to_numeric(d_prepared[c])
        assert sum(vtreat.util.is_bad(d_prepared[c])) == 0
    dtest = make_data(450)
    dtest_prepared = transform.transform(dtest)
    for c in dtest_prepared.columns:
        assert vtreat.util.can_convert_v_to_numeric(dtest_prepared[c])
        assert sum(vtreat.util.is_bad(dtest_prepared[c])) == 0
    # the clean copy of x is perfectly informative, so it must be recommended
    sf = transform.score_frame_
    xrow = sf.loc[
        numpy.logical_and(sf.variable == "x", sf.treatment == "clean_copy"), :
    ]
    xrow.reset_index(inplace=True, drop=True)
    assert xrow.recommended[0]
def test_db_adapter_general():
    """Treatment transform must replay identically via data algebra ops and SQLite."""

    # set up example data
    def mk_data(
        n_rows: int = 100,
        *,
        outcome_name: str = "y",
        n_cat_vars: int = 5,
        n_num_vars: int = 5,
        add_unknowns: bool = False,
    ):
        # y is a noisy linear combination of per-level categorical effects and
        # numeric variables; both kinds carry missing values
        step = 1 / np.sqrt(n_cat_vars + n_num_vars)
        cols = dict()
        y = np.random.normal(size=n_rows)
        for i in range(n_cat_vars):
            vname = f"vc_{i}"
            levels = ["a", "b", "c", "none"]
            if add_unknowns:
                levels = levels + ["d"]  # a level unseen during training
            level_values = {v: step * np.random.normal(size=1)[0] for v in levels}
            v = np.random.choice(levels, replace=True, size=n_rows)
            y = y + np.array([level_values[vi] for vi in v])
            v = np.array([vi if vi != "none" else None for vi in v])
            cols[vname] = v
        for i in range(n_num_vars):
            vname = f"vn_{i}"
            v = np.random.normal(size=n_rows)
            y = y + step * v
            v[np.random.uniform(size=n_rows) < 0.24] = None
            cols[vname] = v
        vars = list(cols.keys())
        vars.sort()
        cols[outcome_name] = y
        d = pd.DataFrame(cols)
        d["orig_index"] = range(d.shape[0])
        return d, outcome_name, vars

    d, outcome_name, vars = mk_data(100)
    d_app, _, _ = mk_data(50, add_unknowns=True)
    cols_to_copy = [outcome_name, "orig_index"]
    columns = vars + cols_to_copy
    # get reference result
    treatment = vtreat.NumericOutcomeTreatment(
        cols_to_copy=cols_to_copy,
        outcome_name=outcome_name,
        params=vtreat.vtreat_parameters({
            "sparse_indicators": False,
            "filter_to_recommended": False,
        }),
    )
    d_train_treated = treatment.fit_transform(d)
    assert isinstance(d_train_treated, pd.DataFrame)
    d_app_treated = treatment.transform(d_app)
    # test ops path
    transform_as_data = treatment.description_matrix()
    ops = as_data_algebra_pipeline(
        source=descr(d_app=d),
        vtreat_descr=transform_as_data,
        treatment_table_name="transform_as_data",
        row_keys=["orig_index"],
    )
    ops_source = str(ops)
    assert isinstance(ops_source, str)
    d_app_res = ops.eval({
        "d_app": d_app,
        "transform_as_data": transform_as_data,
    })
    assert data_algebra.test_util.equivalent_frames(d_app_treated, d_app_res)
    # test ops db path
    # (removed unused local: source_descr = TableDescription(...) was never
    # referenced after construction)
    db_handle = data_algebra.SQLite.example_handle()
    db_handle.insert_table(d_app.loc[:, columns], table_name="d_app")
    db_handle.insert_table(transform_as_data, table_name="transform_as_data")
    db_handle.execute("CREATE TABLE res AS " + db_handle.to_sql(ops))
    res_db = db_handle.read_query("SELECT * FROM res ORDER BY orig_index")
    assert data_algebra.test_util.equivalent_frames(res_db, d_app_treated)
    db_handle.close()
def test_user_coders():
    """Exercise a user-supplied polynomial transform plugged into vtreat."""
    # NOTE(review): collapsed source loses indentation; body kept inside the
    # catch_warnings context — confirm against the original layout.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        # avoid depending on sklearn.metrics.r2_score
        def r_squared(*, y_true, y_pred):
            y_true = numpy.asarray(y_true)
            y_pred = numpy.asarray(y_pred)
            ss_res = numpy.sum((y_true - y_pred) ** 2)
            ss_tot = numpy.sum((y_true - numpy.mean(y_true)) ** 2)
            return 1 - ss_res / ss_tot

        # %%
        class PolyTransform(vtreat.transform.UserTransform):
            """a polynomial model"""

            def __init__(self, *, deg=5, alpha=0.1):
                vtreat.transform.UserTransform.__init__(self, treatment="poly")
                self.models_ = None
                self.deg = deg
                self.alpha = alpha

            def poly_terms(self, vname, vec):
                # frame with the raw values plus columns vname_1 .. vname_deg
                vec = numpy.asarray(vec)
                terms = pandas.DataFrame({"x": vec})
                for power in range(1, self.deg + 1):
                    terms[vname + "_" + str(power)] = vec ** power
                return terms

            def fit(self, X, y):
                # one ridge model per numeric incoming column
                self.models_ = {}
                self.incoming_vars_ = []
                self.derived_vars_ = []
                for v in X.columns:
                    if not vtreat.util.can_convert_v_to_numeric(X[v]):
                        continue
                    X_v = self.poly_terms(v, X[v])
                    model_v = sklearn.linear_model.Ridge(alpha=self.alpha).fit(X_v, y)
                    new_var = v + "_poly"
                    self.models_[v] = (model_v, [c for c in X_v.columns], new_var)
                    self.incoming_vars_.append(v)
                    self.derived_vars_.append(new_var)
                return self

            def transform(self, X):
                res = pandas.DataFrame()
                for var_name, (model_k, cols_k, new_var) in self.models_.items():
                    X_k = self.poly_terms(var_name, X[var_name])
                    res[new_var] = model_k.predict(X_k)
                return res

        # %%
        d = pandas.DataFrame({"x": [i for i in range(100)]})
        d["y"] = numpy.sin(0.2 * d["x"]) + 0.2 * numpy.random.normal(size=d.shape[0])
        d.head()
        # %%
        step = PolyTransform(deg=10)
        # %%
        fit = step.fit_transform(d[["x"]], d["y"])
        fit["x"] = d["x"]
        fit.head()
        # %%
        # seaborn.scatterplot(x='x', y='y', data=d)
        # seaborn.lineplot(x='x', y='x_poly', data=fit, color='red', alpha=0.5)
        # %%
        transform = vtreat.NumericOutcomeTreatment(
            outcome_name="y",
            params=vtreat.vtreat_parameters(
                {
                    "filter_to_recommended": False,
                    "user_transforms": [PolyTransform(deg=10)],
                }
            ),
        )
        # %%
        transform.fit(d, d["y"])
        # %%
        transform.score_frame_
        # %%
        x2_overfit = transform.transform(d)
        # %%
        # seaborn.scatterplot(x='x', y='y', data=x2_overfit)
        # seaborn.lineplot(x='x', y='x_poly', data=x2_overfit, color='red', alpha=0.5)
        # %%
        x2 = transform.fit_transform(d, d["y"])
        # %%
        transform.score_frame_
        # %%
        x2.head()