Exemple #1
0
def test__CountThresholder__fit_transform__correct_response_new_categories(
        columns):
    """Expect 'other' for a column holding only the unseen value 'new'.

    The transformer is first fitted on `columns` and then `fit_transform`
    is called on a 2x1 object array containing 'new' twice.

    NOTE(review): the second call is `fit_transform`, not `transform`, so it
    presumably re-fits on the new data -- confirm that this is intended.
    """
    thresholder = CountThresholder()
    thresholder.fit(columns)

    unseen_values = np.array(['new', 'new'], dtype='O').reshape(2, 1)
    result = thresholder.fit_transform(unseen_values)

    expected = ['other'] * 2
    assert list(result) == expected
Exemple #2
0
def test__CountThresholder__fit__transform__correct_response_max_categories(
        columns):
    """Check the transformed `columns` when `max_categories=1`.

    The expected output keeps a single original label per column
    ('five_1' in column 0, 'eight_2' in column 1); every other row reads
    'other'.
    """
    thresholder = CountThresholder(max_categories=1)
    thresholder.fit(columns)
    transformed = thresholder.transform(columns)

    # 15 rows per column, written as run lengths instead of long literals.
    expected_first = ['other'] * 10 + ['five_1'] * 5
    expected_second = ['other'] * 7 + ['eight_2'] * 8

    assert list(transformed[:, 0]) == expected_first
    assert list(transformed[:, 1]) == expected_second
Exemple #3
0
def test__CountThresholder__fit__transform__correct_response_min_rel_freq_float(
        columns):
    """Check the transformed `columns` when `min_rel_freq=0.2`.

    In the expected output only the first three rows of column 0 read
    'other'; all remaining labels in both columns are kept unchanged.
    """
    thresholder = CountThresholder(min_rel_freq=0.2)
    thresholder.fit(columns)
    transformed = thresholder.transform(columns)

    # 15 rows per column, written as run lengths instead of long literals.
    expected_first = (['other'] * 3 + ['three_1'] * 3 + ['four_1'] * 4 +
                      ['five_1'] * 5)
    expected_second = ['seven_2'] * 7 + ['eight_2'] * 8

    assert list(transformed[:, 0]) == expected_first
    assert list(transformed[:, 1]) == expected_second
Exemple #4
0
def test__CountThresholder__fit__1dim_array_input_assertion():
    """`fit` and `transform` must raise AssertionError on 1-D arrays."""
    expected_message = 'X must have dimension 2'
    thresholder = CountThresholder()
    flat_values = np.array(['two', 'two'])

    with pytest.raises(AssertionError, match=expected_message):
        thresholder.fit(flat_values)

    # Fit on a valid 2x1 array so the transform check below runs on a
    # fitted transformer.
    thresholder.fit(flat_values.reshape(2, 1))

    with pytest.raises(AssertionError, match=expected_message):
        thresholder.transform(flat_values)
Exemple #5
0
def test__CountThresholder__init__correct_instantiation():
    """Constructor arguments must be stored on the trailing-underscore attributes."""
    init_kwargs = {
        'max_categories': 5,
        'min_rel_freq': 0.01,
        'category_name': 'test_name',
    }
    thresholder = CountThresholder(**init_kwargs)

    assert thresholder.max_categories_ == 5
    assert thresholder.min_rel_freq_ == 0.01
    assert thresholder.category_name_ == 'test_name'
Exemple #6
0
def test__CountThresholder__fit__transform__no_array_input_assertion():
    """`fit` and `transform` must raise AssertionError on non-ndarray input.

    Both a plain list and an integer are tried against each method.
    """
    expected_message = 'X must be a np.ndarray'
    thresholder = CountThresholder()

    for bad_input in (['two', 'two'], 1):
        with pytest.raises(AssertionError, match=expected_message):
            thresholder.fit(bad_input)

    # Fit on a valid 2x1 array so the transform checks run on a fitted
    # transformer.
    thresholder.fit(np.array(['two', 'two']).reshape(2, 1))

    for bad_input in (['two', 'two'], 1):
        with pytest.raises(AssertionError, match=expected_message):
            thresholder.transform(bad_input)
Exemple #7
0
 def create_pipeline(search_space):
     """Build the encoding + logistic-regression pipeline.

     Non-numeric columns go through `CountThresholder` and
     `CategoricalEncoder`; numeric and boolean columns go through
     `SimpleImputer`. Both branches are joined with `make_union` and fed
     to a `LogisticRegression` using the 'liblinear' solver.

     Parameters
     ----------
     search_space: dict
         Keyword arguments per step, looked up under the keys
         'countthresholder', 'categoricalencoder', 'simpleimputer' and
         'logisticregression'.

     Returns
     -------
     The object returned by `make_pipeline` for the full chain.
     """
     categorical_branch = make_pipeline(
         SelectDtypeColumns(exclude=['number', 'bool']),
         CountThresholder(**search_space['countthresholder']),
         CategoricalEncoder(**search_space['categoricalencoder']))
     numerical_branch = make_pipeline(
         SelectDtypeColumns(include=['number', 'bool']),
         SimpleImputer(**search_space['simpleimputer']))
     features = make_union(categorical_branch, numerical_branch)
     classifier = LogisticRegression(solver='liblinear',
                                     **search_space['logisticregression'])
     return make_pipeline(features, classifier)
Exemple #8
0
def test__CountThresholder__fit__transform__same_number_features():
    """`transform` must raise when given more columns than `fit` received.

    The transformer is fitted on a 2x1 array and then asked to transform
    a 2x2 array.
    """
    expected_message = 'the number of columns is not correct'
    thresholder = CountThresholder()

    single_column = np.array(['two', 'two']).reshape(2, 1)
    thresholder.fit(single_column)

    two_columns = np.array([['two', 'two'], ['two', 'two']]).reshape(2, 2)
    with pytest.raises(AssertionError, match=expected_message):
        thresholder.transform(two_columns)
Exemple #9
0
    def _create_pipeline(self, search_space):
        """
        Build the sklearn model `pipeline` object.

        The pipeline consists of two main steps: encoding and scoring.
        The former transforms the numerical and/or categorical features,
        depending on which kinds of columns are present. The latter is
        the model class that generates predictions.

        Parameters
        ----------
        search_space: dict
            Dictionary including the parameters space for optimization

        Returns
        -------
        sklearn.pipeline.Pipeline
            Pipeline for feature encoding and model scoring
        """
        # TODO: Parametrize for multiple and different input model classes
        numeric_branch = make_pipeline(
            SelectDtypeColumns(include=['number', 'bool']),
            SimpleImputer(**search_space['simpleimputer']))
        categorical_branch = make_pipeline(
            SelectDtypeColumns(exclude=['number', 'bool']),
            CountThresholder(**search_space['countthresholder']),
            CategoricalEncoder(**search_space['categoricalencoder']))

        has_categorical = bool(self.categorical_columns)
        has_numerical = bool(self.numerical_columns)

        if has_categorical and has_numerical:
            encoder = make_union(numeric_branch, categorical_branch)
        elif has_numerical:
            encoder = numeric_branch
        else:
            # Reached whenever no numerical columns are present.
            encoder = categorical_branch

        classifier = LogisticRegression(solver='liblinear',
                                        **search_space['logisticregression'])
        return make_pipeline(encoder, classifier)
Exemple #10
0
def test__CountThresholder__init__category_name_incorrect_instantiation():
    """A non-string `category_name` must raise AssertionError on construction."""
    expected_message = 'category_name should be str'
    with pytest.raises(AssertionError, match=expected_message):
        CountThresholder(category_name=5)
Exemple #11
0
def test__CountThresholder__init__min_rel_freq_incorrect_instantiation_negative_freq(
):
    """A `min_rel_freq` of 5.0 must raise AssertionError on construction.

    NOTE(review): the test name says "negative_freq", but the value used is
    above 1.0 rather than negative -- confirm which case was meant.
    """
    expected_message = 'min_rel_freq should be between 0.0 and 1.0'
    with pytest.raises(AssertionError, match=expected_message):
        CountThresholder(min_rel_freq=5.0)
Exemple #12
0
def test__CountThresholder__init__min_rel_freq_incorrect_instantiation_invalid_type(
):
    """A string `min_rel_freq` must raise AssertionError on construction."""
    expected_message = 'min_rel_freq must be float'
    with pytest.raises(AssertionError, match=expected_message):
        CountThresholder(min_rel_freq='3')
Exemple #13
0
def test__CountThresholder__init__max_categories_incorrect_instantiation():
    """A `max_categories` of -3 must raise AssertionError on construction."""
    expected_message = 'max_categories must be greater than 1'
    with pytest.raises(AssertionError, match=expected_message):
        CountThresholder(max_categories=-3)
Exemple #14
0
def test__CountThresholder__fit_transform__method_correct_response():
    """`fit_transform` must keep 'two' when it is the column's only value.

    Uses `min_rel_freq=0.1` on a 2x1 array holding 'two' in both rows.
    """
    thresholder = CountThresholder(min_rel_freq=0.1)
    single_column = np.array(['two', 'two']).reshape(2, 1)

    result = thresholder.fit_transform(single_column)

    assert list(result) == ['two', 'two']