def test__CountThresholder__fit_transform__correct_response_new_categories(
        columns):
    """Check that an array holding only the value 'new' comes back as 'other'.

    The transformer is first fitted on the ``columns`` fixture and then
    ``fit_transform`` is called on a 2x1 object array of 'new' values.
    """
    thresholder = CountThresholder()
    thresholder.fit(columns)

    unseen = np.array(['new', 'new'], dtype='O').reshape(2, 1)
    # NOTE(review): this calls fit_transform (not transform) on the unseen
    # data, which presumably refits the transformer -- confirm this is the
    # intended call for a "new categories" scenario.
    result = thresholder.fit_transform(unseen)

    assert list(result) == ['other', 'other']
def test__CountThresholder__fit__transform__correct_response_max_categories(
        columns):
    """Check the transformed ``columns`` fixture when ``max_categories=1``.

    Per column, the expected output keeps a single original label
    ('five_1' / 'eight_2') and every other entry reads 'other'.
    """
    thresholder = CountThresholder(max_categories=1)
    thresholder.fit(columns)
    transformed = thresholder.transform(columns)

    # Expected values are spelled as run-lengths: 10 + 5 and 7 + 8 rows.
    expected_first = ['other'] * 10 + ['five_1'] * 5
    expected_second = ['other'] * 7 + ['eight_2'] * 8

    assert list(transformed[:, 0]) == expected_first
    assert list(transformed[:, 1]) == expected_second
def test__CountThresholder__fit__transform__correct_response_min_rel_freq_float(
        columns):
    """Check the transformed ``columns`` fixture when ``min_rel_freq=0.2``.

    In the first column only the leading three entries are expected to be
    replaced by 'other'; the second column is expected to keep both of its
    labels untouched.
    """
    thresholder = CountThresholder(min_rel_freq=0.2)
    thresholder.fit(columns)
    transformed = thresholder.transform(columns)

    # Expected values are spelled as run-lengths: 3 + 3 + 4 + 5 and 7 + 8.
    expected_first = (['other'] * 3 + ['three_1'] * 3 + ['four_1'] * 4 +
                      ['five_1'] * 5)
    expected_second = ['seven_2'] * 7 + ['eight_2'] * 8

    assert list(transformed[:, 0]) == expected_first
    assert list(transformed[:, 1]) == expected_second
def test__CountThresholder__fit__1dim_array_input_assertion():
    """Check that 1-D arrays are rejected by both ``fit`` and ``transform``.

    A valid 2x1 fit happens in between so that ``transform`` is exercised
    on an already fitted transformer.
    """
    expected_message = 'X must have dimension 2'
    thresholder = CountThresholder()

    with pytest.raises(AssertionError, match=expected_message):
        thresholder.fit(np.array(['two', 'two']))

    two_dim = np.array(['two', 'two']).reshape(2, 1)
    thresholder.fit(two_dim)

    with pytest.raises(AssertionError, match=expected_message):
        thresholder.transform(np.array(['two', 'two']))
def test__CountThresholder__init__correct_instantiation():
    """Check that constructor arguments land on the trailing-underscore attributes."""
    thresholder = CountThresholder(
        max_categories=5,
        min_rel_freq=0.01,
        category_name='test_name',
    )

    assert thresholder.max_categories_ == 5
    assert thresholder.min_rel_freq_ == 0.01
    assert thresholder.category_name_ == 'test_name'
def test__CountThresholder__fit__transform__no_array_input_assertion():
    """Check that non-ndarray inputs are rejected by ``fit`` and ``transform``.

    A plain list and an integer are tried against each method; a valid fit
    on a 2x1 array separates the two rounds.
    """
    expected_message = 'X must be a np.ndarray'
    invalid_inputs = (['two', 'two'], 1)
    thresholder = CountThresholder()

    for bad_input in invalid_inputs:
        with pytest.raises(AssertionError, match=expected_message):
            thresholder.fit(bad_input)

    thresholder.fit(np.array(['two', 'two']).reshape(2, 1))

    for bad_input in invalid_inputs:
        with pytest.raises(AssertionError, match=expected_message):
            thresholder.transform(bad_input)
def create_pipeline(search_space):
    """
    Assemble the encoding + logistic regression pipeline.

    Non-numeric/non-bool columns go through ``CountThresholder`` and
    ``CategoricalEncoder``; numeric/bool columns go through
    ``SimpleImputer``. Both branches are joined with ``make_union`` and
    fed to a ``LogisticRegression`` using the 'liblinear' solver.

    Parameters
    ----------
    search_space: dict
        Keyword arguments per step, read from the keys
        'countthresholder', 'categoricalencoder', 'simpleimputer' and
        'logisticregression'.

    Returns
    -------
    The object returned by ``make_pipeline`` for the union and the model.
    """
    categorical_branch = make_pipeline(
        SelectDtypeColumns(exclude=['number', 'bool']),
        CountThresholder(**search_space['countthresholder']),
        CategoricalEncoder(**search_space['categoricalencoder']),
    )
    numerical_branch = make_pipeline(
        SelectDtypeColumns(include=['number', 'bool']),
        SimpleImputer(**search_space['simpleimputer']),
    )
    features = make_union(categorical_branch, numerical_branch)
    classifier = LogisticRegression(solver='liblinear',
                                    **search_space['logisticregression'])
    return make_pipeline(features, classifier)
def test__CountThresholder__fit__transform__same_number_features():
    """Check that ``transform`` rejects input whose column count differs from fit.

    The transformer is fitted on a 2x1 array and then asked to transform
    a 2x2 array.
    """
    thresholder = CountThresholder()

    one_column = np.array(['two', 'two']).reshape(2, 1)
    thresholder.fit(one_column)

    two_columns = np.array([['two', 'two'], ['two', 'two']]).reshape(2, 2)
    expected_message = 'the number of columns is not correct'
    with pytest.raises(AssertionError, match=expected_message):
        thresholder.transform(two_columns)
def _create_pipeline(self, search_space):
    """
    Create the sklearn model `pipeline` object.

    The pipeline consists of two main steps: encoding and scoring. The
    former applies a transformation to the numerical and/or categorical
    features, depending on which kinds of columns are present. The latter
    is the model class that generates predictions.

    Parameters
    ----------
    search_space: dict
        Dictionary including the parameters space for optimization.
        The keys 'simpleimputer', 'countthresholder',
        'categoricalencoder' and 'logisticregression' are all read,
        regardless of which encoder ends up in the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline for feature encoding and model scoring
    """
    # TODO: Parametrize for multiple and different input model classes
    # Both encoders are always built, even if only one of them is used.
    numeric_steps = make_pipeline(
        SelectDtypeColumns(include=['number', 'bool']),
        SimpleImputer(**search_space['simpleimputer']),
    )
    categoric_steps = make_pipeline(
        SelectDtypeColumns(exclude=['number', 'bool']),
        CountThresholder(**search_space['countthresholder']),
        CategoricalEncoder(**search_space['categoricalencoder']),
    )

    if self.categorical_columns and self.numerical_columns:
        feature_encoder = make_union(numeric_steps, categoric_steps)
    elif self.numerical_columns:
        feature_encoder = numeric_steps
    else:
        # Reached whenever numerical_columns is falsy, whether or not
        # categorical_columns is set.
        feature_encoder = categoric_steps

    classifier = LogisticRegression(solver='liblinear',
                                    **search_space['logisticregression'])
    return make_pipeline(feature_encoder, classifier)
def test__CountThresholder__init__category_name_incorrect_instantiation():
    """Check that an integer ``category_name`` is rejected at construction."""
    expected_message = 'category_name should be str'
    with pytest.raises(AssertionError, match=expected_message):
        CountThresholder(category_name=5)
def test__CountThresholder__init__min_rel_freq_incorrect_instantiation_negative_freq():
    """Check that ``min_rel_freq=5.0`` is rejected as out of range.

    NOTE(review): the test name mentions a negative frequency, but the
    value exercised here is 5.0.
    """
    expected_message = 'min_rel_freq should be between 0.0 and 1.0'
    with pytest.raises(AssertionError, match=expected_message):
        CountThresholder(min_rel_freq=5.0)
def test__CountThresholder__init__min_rel_freq_incorrect_instantiation_invalid_type():
    """Check that a string ``min_rel_freq`` is rejected at construction."""
    expected_message = 'min_rel_freq must be float'
    with pytest.raises(AssertionError, match=expected_message):
        CountThresholder(min_rel_freq='3')
def test__CountThresholder__init__max_categories_incorrect_instantiation():
    """Check that ``max_categories=-3`` is rejected at construction."""
    expected_message = 'max_categories must be greater than 1'
    with pytest.raises(AssertionError, match=expected_message):
        CountThresholder(max_categories=-3)
def test__CountThresholder__fit_transform__method_correct_response():
    """Check that ``fit_transform`` leaves a single-valued 2x1 column unchanged.

    The transformer is built with ``min_rel_freq=0.1`` and the only value
    present is 'two'.
    """
    thresholder = CountThresholder(min_rel_freq=0.1)
    single_column = np.array(['two', 'two']).reshape(2, 1)

    result = thresholder.fit_transform(single_column)

    assert list(result) == ['two', 'two']