def test_date_encoder_returns_only_day(self, dates: pd.DataFrame):
    """With only ``day`` enabled, the encoder emits just the day column.

    The output must be a DataFrame with one row per input row and a single
    column, ``date_a_day``; the year, month and week columns must be absent.
    """
    encoder = DateEncoder(day=True, month=False, week=False, year=False)
    encoded = encoder.fit_transform(dates)

    assert isinstance(encoded, pd.DataFrame)
    assert len(encoded.columns) == 1
    assert len(encoded) == len(dates)

    assert "date_a_day" in encoded.columns
    # Every disabled component must be missing from the output.
    for absent in ("date_a_year", "date_a_month", "date_a_week"):
        assert absent not in encoded.columns
def test_date_encoder_returns_only_week(dates):
    """With only ``week`` enabled, the encoder emits just the week column.

    The output must be a DataFrame with one row per input row and a single
    column, ``date_a_week``; the year, day and month columns must be absent.
    """
    encoder = DateEncoder(day=False, month=False, week=True, year=False)
    encoded = encoder.fit_transform(dates)

    assert isinstance(encoded, pd.DataFrame)
    assert len(encoded.columns) == 1
    assert len(encoded) == len(dates)

    # Every disabled component must be missing from the output.
    for absent in ("date_a_year", "date_a_day", "date_a_month"):
        assert absent not in encoded.columns
    assert "date_a_week" in encoded.columns
def test_date_encoder_returns_correctly(dates):
    """A default ``DateEncoder`` expands the date into four numeric columns.

    The output must be a DataFrame with one row per input row and exactly
    four columns, all numeric: ``date_a_year``, ``date_a_day``,
    ``date_a_month`` and ``date_a_week``. The ``date_a`` column itself must
    not be present in the result.
    """
    encoded = DateEncoder().fit_transform(dates)

    assert isinstance(encoded, pd.DataFrame)
    assert len(encoded.columns) == 4
    assert len(encoded) == len(dates)

    # Collect offenders so that a failure names the non-numeric columns.
    non_numeric = [
        name
        for name in encoded.columns
        if not pd.api.types.is_numeric_dtype(encoded[name])
    ]
    assert not non_numeric

    assert "date_a" not in encoded.columns
    for expected in ("date_a_year", "date_a_day", "date_a_month", "date_a_week"):
        assert expected in encoded.columns
def test_date_encoder_works_in_grid_search(self, dates: pd.DataFrame):
    """A pipeline containing ``DateEncoder`` can be fitted by ``GridSearchCV``.

    The search is considered successful when the fitted object exposes
    ``best_score_``.
    """
    # NOTE(review): "clf__strategy" presumably targets a step named "clf"
    # built by create_pipeline - confirm against that helper.
    search_space = {"clf__strategy": ["stratified", "most_frequent"]}
    labels = [0, 0, 1, 1]

    search = GridSearchCV(
        create_pipeline(DateEncoder()),
        param_grid=search_space,
        cv=2,
    )
    search.fit(dates, labels)

    assert hasattr(search, "best_score_")
def test_date_encoder_works_in_cv(self, dates: pd.DataFrame):
    """A pipeline containing ``DateEncoder`` runs under ``cross_val_score``.

    Two folds are requested with two parallel jobs, so exactly two scores
    are expected back.
    """
    labels = [0, 0, 1, 1]
    pipeline = create_pipeline(DateEncoder())

    fold_scores = cross_val_score(pipeline, dates, y=labels, n_jobs=2, cv=2)

    assert len(fold_scores) == 2
def test_works_without_args(self):
    """``DateEncoder`` can be built with no arguments and the instance is truthy."""
    encoder = DateEncoder()
    assert encoder
"""
host_since
==========
The date on which the host started hosting.

Hypothesis: how long someone has been a host affects the price - they might
be able to charge a different price.

For our solution, we can set it to 0 or ask.

The value is a date. Note that date is not a dtype in itself, but read_csv
can be set to parse the column automatically as a date.

dtype: datetime
"""
from ml_tooling.transformers import Select, DateEncoder
from sklearn.pipeline import Pipeline

# Select the raw column first, then encode the selected date.
host_since = Pipeline(
    steps=[
        ("select", Select("host_since")),
        ("date_encoder", DateEncoder()),
    ]
)