def test_freqfeature_handles_nans_correctly(categorical_na):
    """Check that ``FreqFeature.fit_transform`` produces a NaN-free frame.

    The ``categorical_na`` fixture is defined outside this file section;
    presumably it is a DataFrame of categorical columns containing a
    missing value (TODO confirm against the fixture definition).
    """
    freq_feature = FreqFeature()

    result = freq_feature.fit_transform(categorical_na)

    assert isinstance(result, pd.DataFrame)
    assert len(categorical_na) == len(result)
    # Compare column labels explicitly instead of relying on the fact that
    # iterating a DataFrame yields its column labels.
    assert set(categorical_na.columns) == set(result.columns)
    assert 0 == result.isna().sum().sum()

    # The expected frequency divides by one less than the row count.
    # NOTE(review): presumably one row of the fixture is NaN and is left out
    # of the denominator - confirm against the fixture.
    expected_frequency = 1 / (len(categorical_na) - 1)
    # Tolerance-based comparison: exact ``==`` on computed floats is fragile.
    assert np.isclose(expected_frequency, result.iloc[0, 0])
def test_freq_features_returns_0_when_unseen_value_is_given(categorical):
    """Check that a value not seen during ``fit`` is transformed to 0."""
    transformer = FreqFeature()
    transformer.fit(categorical)

    # The last entry ("c25") is the one asserted to come back as 0.
    # NOTE(review): presumably "c25" does not occur in the ``categorical``
    # fixture - confirm against the fixture definition.
    unseen_frame = pd.DataFrame({"category_a": ["a1", "a2", "c25"]})
    transformed = transformer.transform(unseen_frame)

    assert isinstance(transformed, pd.DataFrame)
    assert len(transformed) == len(unseen_frame)
    assert transformed.iloc[-1, 0] == 0
def test_freqfeature_returns_correctly(categorical):
    """Check the shape, dtypes and values of ``FreqFeature.fit_transform``.

    Asserts that the output is a DataFrame with the same length and column
    labels as the input, that every output column is numeric, that the first
    cell equals ``1 / len(categorical)`` and that every column sums to 1.
    """
    freq_feature = FreqFeature()

    result = freq_feature.fit_transform(categorical)

    assert isinstance(result, pd.DataFrame)
    assert len(categorical) == len(result)
    assert set(categorical.columns) == set(result.columns)
    for col in result.columns:
        assert pd.api.types.is_numeric_dtype(result[col])

    # Tolerance-based comparisons: the values are computed floats, so exact
    # ``==`` could fail on rounding error alone.
    assert np.isclose(1 / len(categorical), result.iloc[0, 0])
    # ``result.sum()`` yields one total per column; each must be 1.
    assert np.allclose(result.sum(), 1)
def test_freq_feature_can_be_used_in_cross_validation_string_data(
    self, categorical: pd.DataFrame
):
    """Run a ``FreqFeature`` pipeline through 2-fold cross-validation.

    The pipeline comes from ``create_pipeline`` (defined outside this view);
    the test only requires that every fold score is non-negative.
    """
    labels = np.array([1, 0, 1, 0])
    pipeline = create_pipeline(FreqFeature())

    fold_scores = cross_val_score(pipeline, categorical, labels, cv=2)

    assert np.all(fold_scores >= 0)
def test_freq_feature_can_be_used_in_grid_search(
    self, categorical: pd.DataFrame
):
    """Fit a ``GridSearchCV`` over a ``FreqFeature`` pipeline.

    Passes when the fitted search object exposes ``best_estimator_``.
    """
    # NOTE(review): the "clf__strategy" key presumes that ``create_pipeline``
    # names its final step "clf" and that the step accepts ``strategy`` -
    # confirm against the helper.
    search_space = {"clf__strategy": ["stratified", "most_frequent"]}
    pipeline = create_pipeline(FreqFeature())
    search = GridSearchCV(pipeline, param_grid=search_space, cv=2)

    search.fit(categorical, [1, 0, 1, 0])

    assert hasattr(search, "best_estimator_")
def test_works_without_args(self):
    """Check that ``FreqFeature`` can be built with no arguments."""
    transformer = FreqFeature()
    # Passes when construction succeeds and the instance is truthy.
    assert transformer