Example #1
0
def test_smart_encoder_y_var():
    import numpy as np
    import pandas as pd

    from foreshadow.smart import CategoricalEncoder
    from foreshadow.concrete import FixedLabelEncoder as LabelEncoder

    # Label column with three distinct values, each appearing three times.
    labels = pd.DataFrame({"A": np.array([1, 2, 10] * 3)})
    encoder = CategoricalEncoder(y_var=True)
    encoder.fit(labels)

    # A y-variable should resolve to a plain label encoder.
    assert isinstance(encoder.transformer, LabelEncoder)

    # The distinct labels are mapped onto consecutive integer codes 0..2.
    expected = np.array([0, 1, 2] * 3)
    assert np.array_equal(encoder.transform(labels).values.ravel(), expected)
Example #2
0
def test_smart_encoder_delimmited():
    import pandas as pd
    from sklearn.pipeline import Pipeline

    from foreshadow.smart import CategoricalEncoder
    from foreshadow.concrete import DummyEncoder
    from foreshadow.concrete import NaNFiller

    # Comma-delimited multi-category strings should trigger dummy encoding.
    data = pd.DataFrame({"test": ["a", "a,b,c", "a,b", "a,c"]})
    smart_coder = CategoricalEncoder()
    transformer = smart_coder.fit(data).transformer

    # Resolved transformer is a two-step pipeline: fill NaNs, then
    # dummy-encode the delimited categories.
    assert isinstance(transformer, Pipeline)
    assert isinstance(transformer.steps[0][1], NaNFiller)
    assert isinstance(transformer.steps[1][1], DummyEncoder)
    assert len(transformer.steps) == 2
Example #3
0
def test_smart_encoder_more_than_30_levels_with_overwritten_cutoff():
    import numpy as np
    from sklearn.pipeline import Pipeline

    from foreshadow.smart import CategoricalEncoder
    from foreshadow.concrete import OneHotEncoder
    from foreshadow.concrete import NaNFiller

    np.random.seed(0)
    # 31 levels exceeds the default cutoff, but the overridden cutoff of 35
    # keeps the data in one-hot-encoding territory.
    gt_30_random_data = np.random.choice(31, size=500)
    smart_coder = CategoricalEncoder(unique_num_cutoff=35)
    transformer = smart_coder.fit(gt_30_random_data).transformer

    # Resolved transformer: NaN fill followed by one-hot encoding.
    assert isinstance(transformer, Pipeline)
    assert isinstance(transformer.steps[0][1], NaNFiller)
    assert isinstance(transformer.steps[1][1], OneHotEncoder)
    assert len(transformer.steps) == 2
Example #4
0
def test_smart_encoder_more_than_30_levels_that_reduces():
    import numpy as np

    from foreshadow.smart import CategoricalEncoder
    from foreshadow.concrete import OneHotEncoder

    np.random.seed(0)
    # 500 samples drawn from 29 common levels, plus six levels that each
    # occur only once; the rare levels should be reduced away so the
    # encoder still settles on one-hot encoding.
    common = np.random.choice(29, size=500)
    rare = np.array([31, 32, 33, 34, 35, 36])
    data = np.concatenate([common, rare])

    encoder = CategoricalEncoder()
    final_step = encoder.fit(data).transformer.steps[-1][1]
    assert isinstance(final_step, OneHotEncoder)
Example #5
0
def test_smart_encoder_less_than_30_levels():
    import numpy as np
    from sklearn.pipeline import Pipeline

    from foreshadow.smart import CategoricalEncoder
    from foreshadow.concrete import OneHotEncoder
    from foreshadow.concrete import NaNFiller

    np.random.seed(0)
    # 30 levels is at (not above) the default cutoff, so one-hot encoding
    # should be chosen.
    leq_30_random_data = np.random.choice(30, size=500)
    smart_coder = CategoricalEncoder()
    transformer = smart_coder.fit(leq_30_random_data).transformer

    # Resolved transformer: NaN fill followed by one-hot encoding.
    assert isinstance(transformer, Pipeline)
    assert isinstance(transformer.steps[0][1], NaNFiller)
    assert isinstance(transformer.steps[1][1], OneHotEncoder)
    assert len(transformer.steps) == 2

    # One output column per level.
    res = smart_coder.transform(leq_30_random_data)
    assert len(res.columns) == 30
Example #6
0
def test_smart_encoder_more_than_30_levels():
    import numpy as np
    from sklearn.pipeline import Pipeline

    from foreshadow.smart import CategoricalEncoder
    from foreshadow.concrete import HashingEncoder
    from foreshadow.concrete import NaNFiller

    np.random.seed(0)
    # 31 string levels (above the default cutoff) with a NaN injected, so
    # the encoder should fall back to hashing.
    gt_30_random_data = np.random.choice(31, size=500)
    gt_30_random_data = [item for item in gt_30_random_data.astype(str)]
    gt_30_random_data[0] = np.nan
    smart_coder = CategoricalEncoder()
    transformer = smart_coder.fit(gt_30_random_data).transformer

    # Resolved transformer: NaN fill followed by hashing encoding.
    assert isinstance(transformer, Pipeline)
    assert isinstance(transformer.steps[0][1], NaNFiller)
    assert isinstance(transformer.steps[1][1], HashingEncoder)
    assert len(transformer.steps) == 2

    # Hashing output is fixed-width regardless of level count.
    res = transformer.transform(gt_30_random_data)
    assert len(res.columns) == 30
Example #7
0
    def __init__(
            self,
            cache_manager=None,
            flattener_kwargs=None,
            cleaner_kwargs=None,
            intent_kwargs=None,
            summarizer_kwargs=None,
            # engineerer_kwargs=None,
            preprocessor_kwargs=None,
            # reducer_kwargs=None,
            exporter_kwargs=None,
            problem_type=None,
            y_var=None,
            **kwargs):
        """Assemble the preparation pipeline steps for an X or y variable.

        Args:
            cache_manager: shared cache manager injected into each step's
                kwargs via ``_none_to_dict``.
            flattener_kwargs: kwargs for the ``FlattenMapper`` step.
            cleaner_kwargs: kwargs for the ``CleanerMapper`` step.
            intent_kwargs: kwargs for the ``IntentMapper`` step.
            summarizer_kwargs: kwargs for the ``FeatureSummarizerMapper``
                step.
            preprocessor_kwargs: kwargs for the ``Preprocessor`` step.
            exporter_kwargs: kwargs for the ``DataExporterMapper`` step.
            problem_type: a ``ProblemType`` value; only consulted when
                ``y_var`` is truthy.
            y_var: truthy when this pipeline handles the target variable,
                which selects a reduced set of steps.
            **kwargs: forwarded to the parent pipeline; may carry a
                pre-built ``steps`` list (used by sklearn estimator clone).

        Raises:
            ValueError: if ``y_var`` is truthy and ``problem_type`` is not
                a recognized ``ProblemType``.
        """
        # Normalize each per-step kwargs dict (None -> dict) and inject the
        # shared cache_manager into every one.
        self.flattener_kwargs = _none_to_dict("flattener_kwargs",
                                              flattener_kwargs, cache_manager)
        self.cleaner_kwargs = _none_to_dict("cleaner_kwargs", cleaner_kwargs,
                                            cache_manager)
        self.intent_kwargs = _none_to_dict("intent_kwargs", intent_kwargs,
                                           cache_manager)
        self.summarizer_kwargs = _none_to_dict("summarizer_kwargs",
                                               summarizer_kwargs,
                                               cache_manager)
        # # engineerer_kwargs_ = _none_to_dict(
        # #     "engineerer_kwargs", engineerer_kwargs, cache_manager
        # # )
        self.preprocessor_kwargs = _none_to_dict("preprocessor_kwargs",
                                                 preprocessor_kwargs,
                                                 cache_manager)
        # # reducer_kwargs_ = _none_to_dict(
        # #     "reducer_kwargs", reducer_kwargs, cache_manager
        # # )
        self.exporter_kwargs = _none_to_dict("exporter_kwargs",
                                             exporter_kwargs, cache_manager)
        if not y_var:
            # Full feature-preparation pipeline for the X variables.
            steps = [
                ("data_flattener", FlattenMapper(**self.flattener_kwargs)),
                ("data_cleaner", CleanerMapper(**self.cleaner_kwargs)),
                ("intent", IntentMapper(**self.intent_kwargs)),
                (
                    "feature_summarizer",
                    FeatureSummarizerMapper(**self.summarizer_kwargs),
                ),
                # (
                #     "feature_engineerer",
                #     FeatureEngineererMapper(**self.engineerer_kwargs_),
                # ),
                (
                    "feature_preprocessor",
                    Preprocessor(**self.preprocessor_kwargs),
                ),
                # (
                #       "feature_reducer",
                #       FeatureReducerMapper(**self.reducer_kwargs)
                # ),
                (
                    "feature_exporter",
                    DataExporterMapper(**self.exporter_kwargs),
                ),
            ]
        else:
            # Reduced pipeline for the target variable: regression needs
            # only summarization; classification also label-encodes y.
            if problem_type == ProblemType.REGRESSION:
                steps = [(
                    "feature_summarizer",
                    FeatureSummarizerMapper(y_var, problem_type,
                                            **self.summarizer_kwargs),
                )]
            elif problem_type == ProblemType.CLASSIFICATION:
                steps = [
                    (
                        "feature_summarizer",
                        FeatureSummarizerMapper(y_var, problem_type,
                                                **self.summarizer_kwargs),
                    ),
                    (
                        "output",
                        CategoricalEncoder(y_var=True,
                                           cache_manager=cache_manager),
                    ),
                ]
            else:
                raise ValueError("Invalid Problem "
                                 "Type {}".format(problem_type))
        if "steps" in kwargs:  # needed for sklearn estimator clone,
            # which will try to init the object using get_params.
            steps = kwargs.pop("steps")

        self.cache_manager = cache_manager
        self.y_var = y_var
        self.problem_type = problem_type
        super().__init__(steps, **kwargs)