def test_onehot_task():
    """Check OneHot against scikit-learn's OneHotEncoder on two string columns.

    Verifies the generated column names (``<column>_<index>``), that every
    output column is typed ``VarType.NUM``, and that the encoded values match
    the reference encoder.
    """
    n_rows = 1000
    x1 = np.random.choice(["a", "b", "c"], size=n_rows)
    x2 = np.random.choice(["1", "2", "3", "4", "5", "6"], size=n_rows)
    data = TaskData(
        X=np.column_stack([x1, x2]),
        column_names=["x1", "x2"],
        column_types=[ColumnType(VarType.NUM), ColumnType(VarType.NUM)],
    )

    res = OneHot().fit_transform(data)

    # Three categories in x1 and six in x2 give nine indicator columns.
    expected_names = ["x1_{}".format(i) for i in range(3)]
    expected_names += ["x2_{}".format(i) for i in range(6)]
    assert res.column_names == expected_names
    assert all(ct == ColumnType(VarType.NUM) for ct in res.column_types)

    reference = OneHotEncoder().fit_transform(data.X)
    assert np.isclose(res.X.todense(), reference.todense()).all()
def test_ordcat_task(use_other):
    """Check OrdCat codes against scikit-learn's OrdinalEncoder.

    Args:
        use_other: forwarded to ``OrdCat``; when truthy the expected codes are
            the reference codes shifted by 2 (otherwise by 1) and the expected
            category levels grow by one.
    """
    n_rows = 1000
    x1 = np.random.choice(["a", "b", "c"], size=n_rows)
    x2 = np.random.choice(["1", "2", "3", "4", "5", "6"], size=n_rows)
    data = TaskData(
        X=np.column_stack([x1, x2]),
        column_names=["x1", "x2"],
        column_types=[ColumnType(VarType.NUM), ColumnType(VarType.NUM)],
    )

    encoder = OrdCat(min_support=0, use_other=use_other, handle_unknown="error")
    res = encoder.fit_transform(data)

    # Expected levels per column and the offset applied to the reference codes.
    if use_other:
        levels, shift = (5, 8), 2
    else:
        levels, shift = (4, 7), 1

    assert res.column_names == ["x1", "x2"]
    assert res.column_types == [
        ColumnType(VarType.CAT, level=lvl) for lvl in levels
    ]

    expected = OrdinalEncoder().fit_transform(data.X) + shift
    assert np.isclose(res.X, expected).all()
def test_pipeline_numeric():
    """Check an impute-then-scale Pipeline against the bare scikit-learn steps.

    The last 1000 rows of the generated dataset are held out; the pipeline is
    fitted on the rest and its train/test outputs are compared with
    ``SimpleImputer`` + ``StandardScaler`` applied by hand.
    """
    holdout = 1000
    df = create_dataset(num=10, cat=0, size=5000)

    train_df = df.iloc[:-holdout]
    y_train = train_df.pop("target").values
    test_df = df.iloc[-holdout:]
    y_test = test_df.pop("target").values

    steps = [
        Step(name="impute", task=Wrap(SimpleImputer(strategy="mean"))),
        Step(
            name="standatize",
            task=Wrap(StandardScaler(), type_override=VarType.NUM),
        ),
    ]
    num_pipeline = Pipeline(steps=steps)

    train = num_pipeline.fit_transform(to_task_data(train_df, y_train))
    test = num_pipeline.transform(to_task_data(test_df, y_test))

    numeric = ColumnType(VarType.NUM)
    assert all(ct == numeric for ct in train.column_types)
    assert all(ct == numeric for ct in test.column_types)

    # Reference result: the same two transformers applied directly.
    imputer = SimpleImputer(strategy="mean")
    scaler = StandardScaler()
    train_expected = scaler.fit_transform(imputer.fit_transform(train_df.values))
    test_expected = scaler.transform(imputer.transform(test_df.values))

    assert np.isclose(train_expected, train.X).all()
    assert np.isclose(test_expected, test.X).all()
def test_date_features_extractor_task():
    """Check every component DateFeatures derives from two datetime columns.

    For each source column and each component, the derived column must exist
    under the name ``"<column> - <component>"``, hold the values read off the
    timestamps directly, and be typed NUM for the year and CAT otherwise.
    """
    x1 = pd.date_range(start="2020-10-17", periods=5000, freq="5D")
    x2 = pd.date_range(start="2007-06-06", periods=5000, freq="21S")
    data = TaskData(
        X=np.hstack([np.reshape(col, (-1, 1)) for col in (x1, x2)]),
        column_names=["x1", "x2"],
        column_types=[ColumnType(VarType.NUM), ColumnType(VarType.NUM)],
    )

    task = DateFeatures()
    res = task.fit_transform(data)
    assert res.X.shape[1] == len(task.COMPONENTS) * 2

    x1_data = take_columns(
        res, [c for c in res.column_names if c.startswith("x1")])
    x2_data = take_columns(
        res, [c for c in res.column_names if c.startswith("x2")])

    # Reference accessor on a single timestamp for each component name.
    extractors = {
        "year": lambda ts: ts.year,
        "month": lambda ts: ts.month,
        "week": lambda ts: ts.week,
        "day_of_month": lambda ts: ts.day,
        "day_of_week": lambda ts: ts.dayofweek,
        "hour": lambda ts: ts.hour,
        "minute": lambda ts: ts.minute,
        "second": lambda ts: ts.second,
    }

    cases = [(x1_data, x1, "x1"), (x2_data, x2, "x2")]
    for actual, source, name in cases:
        for component, extract in extractors.items():
            feature_name = "{} - {}".format(name, component)
            assert feature_name in actual.column_names

            idx = actual.column_names.index(feature_name)
            expected_val = np.array([extract(ts) for ts in source])
            assert np.isclose(actual.X[:, idx], expected_val).all()

            expected_type = (
                VarType.NUM if "year" in feature_name else VarType.CAT)
            assert actual.column_types[idx].var_type == expected_type
def data():
    """Build a seeded random TaskData with five feature columns and a target.

    Returns:
        TaskData with a (1000, 5) feature matrix, a length-1000 target,
        columns named ``x1``..``x5`` and default ``ColumnType()`` for each.
    """
    rng = np.random.RandomState(RANDOM_SEED)
    # Draw features first, then the target, so the seeded stream is consumed
    # in a fixed order.
    features = rng.random((1000, 5))
    target = rng.random((1000, ))
    names = ["x{}".format(i) for i in range(1, 6)]
    return TaskData(
        X=features,
        column_names=names,
        column_types=[ColumnType() for _ in names],
        y=target,
    )
def test_target_lag(order):
    """Check that TargetLag appends the target shifted by ``order`` rows.

    Args:
        order: lag order passed to ``TargetLag`` and used to build the
            reference shift.
    """
    x = np.random.random((1000, ))
    y = np.random.random((1000, ))
    data = TaskData(
        X=x,
        y=y,
        column_names=["x"],
        column_types=[ColumnType(VarType.NUM)],
    )

    # Reference lag computed with pandas; the first `order` entries are NaN.
    lag = pd.Series(y).shift(order).values

    new_data = TargetLag(order=order, handle_nan="drop").fit_transform(data)

    lag_name = TargetLag.PATTERN.format(order)
    lag_index = new_data.column_names.index(lag_name)
    assert np.isclose(lag[order:], new_data.X[:, lag_index]).all()

    assert len(new_data.column_names) == 2
    assert len(new_data.column_types) == 2
    assert new_data.column_names[-1] == lag_name
    assert new_data.column_types[-1] == ColumnType(VarType.LAG)
def test_ordcat_task_handle_unknown():
    """Check that OrdCat with ``handle_unknown="missing"`` codes unseen values as 0.

    The task is fitted on one set of categories and then applied to data that
    contains categories absent during fitting ("d" and "7"); every such row
    must be encoded as 0.
    """
    n_rows = 1000
    x1 = np.random.choice(["a", "b", "c"], size=n_rows)
    x2 = np.random.choice(["1", "2", "3", "4", "5", "6"], size=n_rows)
    data = TaskData(
        X=np.column_stack([x1, x2]),
        column_names=["x1", "x2"],
        column_types=[ColumnType(VarType.NUM), ColumnType(VarType.NUM)],
    )

    task = OrdCat(min_support=0, use_other=False, handle_unknown="missing")
    res = task.fit_transform(data)

    assert res.column_names == ["x1", "x2"]
    assert res.column_types == [
        ColumnType(VarType.CAT, level=4),
        ColumnType(VarType.CAT, level=7),
    ]
    expected = OrdinalEncoder().fit_transform(data.X) + 1
    assert np.all(np.isclose(res.X, expected))

    # transform with new categories
    x1 = np.random.choice(["a", "c", "d"], size=n_rows)
    x2 = np.random.choice(["2", "3", "5", "6", "7"], size=n_rows)
    new_data = TaskData(
        X=np.column_stack([x1, x2]),
        column_names=["x1", "x2"],
        column_types=[ColumnType(VarType.NUM), ColumnType(VarType.NUM)],
    )
    res = task.transform(new_data)

    # np.array_equal yields a plain boolean, so a wrong encoding (several
    # distinct codes, or no matching rows) fails as an AssertionError instead
    # of tripping over the truth value of an array comparison.
    for column, unseen, values in [(0, "d", x1), (1, "7", x2)]:
        results = res.X[:, column][values == unseen]
        assert np.array_equal(np.unique(results), np.array([0]))
def test_ordcat_task_handle_unknown_error():
    """Check that OrdCat with ``handle_unknown="error"`` rejects unseen values.

    The task is fitted on one set of categories; transforming data that
    contains categories absent during fitting must raise ``ValueError`` with
    the message ``"Found unknown categories"``.
    """
    n_rows = 1000
    x1 = np.random.choice(["a", "b", "c"], size=n_rows)
    x2 = np.random.choice(["1", "2", "3", "4", "5", "6"], size=n_rows)
    data = TaskData(
        X=np.column_stack([x1, x2]),
        column_names=["x1", "x2"],
        column_types=[ColumnType(VarType.NUM), ColumnType(VarType.NUM)],
    )

    task = OrdCat(min_support=0, use_other=False, handle_unknown="error")
    res = task.fit_transform(data)

    assert res.column_names == ["x1", "x2"]
    assert res.column_types == [
        ColumnType(VarType.CAT, level=4),
        ColumnType(VarType.CAT, level=7),
    ]
    expected = OrdinalEncoder().fit_transform(data.X) + 1
    assert np.all(np.isclose(res.X, expected))

    # transform with new categories
    x1 = np.random.choice(["a", "c", "d"], size=n_rows)
    x2 = np.random.choice(["2", "3", "5", "6", "7"], size=n_rows)
    new_data = TaskData(
        X=np.column_stack([x1, x2]),
        column_names=["x1", "x2"],
        column_types=[ColumnType(VarType.NUM), ColumnType(VarType.NUM)],
    )

    with pytest.raises(ValueError) as excinfo:
        task.transform(new_data)
    # Checked after the block: statements following the raising call inside
    # the `with` body would never execute.
    assert "Found unknown categories" == str(excinfo.value)
def test_date_pipeline():
    """Check a DateFeatures pipeline with per-type post-processing.

    The pipeline derives date features, then scales the NUM columns and
    ordinal-encodes the CAT columns. Its train/test outputs are compared with
    the same steps performed by hand using scikit-learn.
    """
    holdout = 1000
    df = create_dataset(num=0, cat=0, date=5, target=False, size=5000)
    train_df = df.iloc[:-holdout]
    test_df = df.iloc[-holdout:]

    branches = [
        Step("num_derived", Wrap(StandardScaler()), types=[VarType.NUM]),
        Step(
            "cat_derived",
            OrdCat(min_support=0, use_other=False),
            types=[VarType.CAT],
        ),
    ]
    date_pipeline = Pipeline(steps=[
        Step("date", DateFeatures()),
        Step("derived_processing", ColumnsProcessor(branches=branches)),
    ])

    train = date_pipeline.fit_transform(to_task_data(train_df))
    test = date_pipeline.transform(to_task_data(test_df))

    # The first five output columns are NUM; all remaining ones are CAT with
    # a positive level.
    for transformed in (train, test):
        leading = transformed.column_types[:5]
        trailing = transformed.column_types[5:]
        assert leading == [ColumnType(VarType.NUM)] * 5
        assert {c.var_type for c in trailing} == {VarType.CAT}
        assert all(c.level > 0 for c in trailing)

    # Reference path: derive the date features, then process each type by hand.
    date_features = DateFeatures()
    dates_train = date_features.fit_transform(to_task_data(train_df))
    dates_test = date_features.transform(to_task_data(test_df))

    num_train = take_columns(dates_train, types=[VarType.NUM])
    cat_train = take_columns(dates_train, types=[VarType.CAT])
    scaler = StandardScaler()
    enc = OrdinalEncoder()
    num_train = scaler.fit_transform(num_train.X)
    cat_train = enc.fit_transform(cat_train.X) + 1

    assert np.isclose(
        num_train, take_columns(train, types=[VarType.NUM]).X).all()
    assert np.isclose(
        cat_train, take_columns(train, types=[VarType.CAT]).X).all()

    num_test = take_columns(dates_test, types=[VarType.NUM])
    cat_test = take_columns(dates_test, types=[VarType.CAT])
    num_test = scaler.transform(num_test.X)
    cat_test = enc.transform(cat_test.X) + 1

    assert np.isclose(
        num_test, take_columns(test, types=[VarType.NUM]).X).all()
    assert np.isclose(
        cat_test, take_columns(test, types=[VarType.CAT]).X).all()