def test_smart_encoder_y_var():
    """A y-variable CategoricalEncoder should resolve to FixedLabelEncoder."""
    import numpy as np
    import pandas as pd

    from foreshadow.smart import CategoricalEncoder
    from foreshadow.concrete import FixedLabelEncoder as LabelEncoder

    labels = pd.DataFrame({"A": np.array([1, 2, 10] * 3)})
    encoder = CategoricalEncoder(y_var=True)

    # The smart transformer must pick the label encoder for targets.
    assert isinstance(encoder.fit(labels).transformer, LabelEncoder)

    # Labels 1, 2, 10 map to ordinal codes 0, 1, 2 respectively.
    expected = np.array([0, 1, 2] * 3)
    assert np.array_equal(encoder.transform(labels).values.ravel(), expected)
def test_smart_encoder_delimmited():
    """Delimited category strings resolve to NaNFiller + DummyEncoder.

    The pipeline should fill NaNs first, then dummy-encode the
    comma-delimited multi-label column.
    """
    import pandas as pd
    # Fix: Pipeline was referenced below but never imported in this
    # function-local-import style, raising NameError at test time.
    from sklearn.pipeline import Pipeline

    from foreshadow.smart import CategoricalEncoder
    from foreshadow.concrete import DummyEncoder
    from foreshadow.concrete import NaNFiller

    data = pd.DataFrame({"test": ["a", "a,b,c", "a,b", "a,c"]})
    smart_coder = CategoricalEncoder()
    transformer = smart_coder.fit(data).transformer

    assert isinstance(transformer, Pipeline)
    assert isinstance(transformer.steps[0][1], NaNFiller)
    assert isinstance(transformer.steps[1][1], DummyEncoder)
    assert len(transformer.steps) == 2
def test_smart_encoder_more_than_30_levels_with_overwritten_cutoff():
    """Raising unique_num_cutoff keeps one-hot encoding past 30 levels.

    With 31 distinct values but a cutoff of 35, the encoder must still
    choose NaNFiller + OneHotEncoder rather than falling back to hashing.
    """
    import numpy as np
    # Fix: Pipeline was referenced below but never imported in this
    # function-local-import style, raising NameError at test time.
    from sklearn.pipeline import Pipeline

    from foreshadow.smart import CategoricalEncoder
    from foreshadow.concrete import OneHotEncoder
    from foreshadow.concrete import NaNFiller

    np.random.seed(0)  # deterministic category sample
    gt_30_random_data = np.random.choice(31, size=500)
    smart_coder = CategoricalEncoder(unique_num_cutoff=35)
    transformer = smart_coder.fit(gt_30_random_data).transformer

    assert isinstance(transformer, Pipeline)
    assert isinstance(transformer.steps[0][1], NaNFiller)
    assert isinstance(transformer.steps[1][1], OneHotEncoder)
    assert len(transformer.steps) == 2
def test_smart_encoder_more_than_30_levels_that_reduces():
    """Rare extra levels collapse back to a one-hot encoding.

    500 samples over 29 common values plus six singleton outliers should
    still resolve to OneHotEncoder as the final pipeline step.
    """
    import numpy as np

    from foreshadow.smart import CategoricalEncoder
    from foreshadow.concrete import OneHotEncoder

    np.random.seed(0)
    dominant = np.random.choice(29, size=500)
    rare = np.array([31, 32, 33, 34, 35, 36])
    data = np.concatenate([dominant, rare])

    final_step = CategoricalEncoder().fit(data).transformer.steps[-1][1]
    assert isinstance(final_step, OneHotEncoder)
def test_smart_encoder_less_than_30_levels():
    """At or below 30 levels the encoder picks NaNFiller + OneHotEncoder.

    Also verifies the transformed output has one column per level.
    """
    import numpy as np
    # Fix: Pipeline was referenced below but never imported in this
    # function-local-import style, raising NameError at test time.
    from sklearn.pipeline import Pipeline

    from foreshadow.smart import CategoricalEncoder
    from foreshadow.concrete import OneHotEncoder
    from foreshadow.concrete import NaNFiller

    np.random.seed(0)  # deterministic category sample
    leq_30_random_data = np.random.choice(30, size=500)
    smart_coder = CategoricalEncoder()
    transformer = smart_coder.fit(leq_30_random_data).transformer

    assert isinstance(transformer, Pipeline)
    assert isinstance(transformer.steps[0][1], NaNFiller)
    assert isinstance(transformer.steps[1][1], OneHotEncoder)
    assert len(transformer.steps) == 2

    res = smart_coder.transform(leq_30_random_data)
    assert len(res.columns) == 30  # one dummy column per unique level
def test_smart_encoder_more_than_30_levels():
    """Above 30 levels the encoder falls back to NaNFiller + HashingEncoder.

    The input contains a NaN to exercise the fill step; hashed output is
    expected to have 30 feature columns.
    """
    import numpy as np
    # Fix: Pipeline was referenced below but never imported in this
    # function-local-import style, raising NameError at test time.
    from sklearn.pipeline import Pipeline

    from foreshadow.smart import CategoricalEncoder
    from foreshadow.concrete import HashingEncoder
    from foreshadow.concrete import NaNFiller

    np.random.seed(0)  # deterministic category sample
    gt_30_random_data = np.random.choice(31, size=500)
    gt_30_random_data = [item for item in gt_30_random_data.astype(str)]
    gt_30_random_data[0] = np.nan  # ensure the NaNFiller step is exercised
    smart_coder = CategoricalEncoder()
    transformer = smart_coder.fit(gt_30_random_data).transformer

    assert isinstance(transformer, Pipeline)
    assert isinstance(transformer.steps[0][1], NaNFiller)
    assert isinstance(transformer.steps[1][1], HashingEncoder)
    assert len(transformer.steps) == 2

    res = transformer.transform(gt_30_random_data)
    assert len(res.columns) == 30  # hashing projects onto 30 columns
def __init__(
    self,
    cache_manager=None,
    flattener_kwargs=None,
    cleaner_kwargs=None,
    intent_kwargs=None,
    summarizer_kwargs=None,
    # engineerer_kwargs=None,
    preprocessor_kwargs=None,
    # reducer_kwargs=None,
    exporter_kwargs=None,
    problem_type=None,
    y_var=None,
    **kwargs
):
    """Assemble the mapper-step pipeline for feature (X) or target (y) data.

    Args:
        cache_manager: shared cache object injected into each step's
            kwargs via ``_none_to_dict``.
        flattener_kwargs: optional kwargs for FlattenMapper.
        cleaner_kwargs: optional kwargs for CleanerMapper.
        intent_kwargs: optional kwargs for IntentMapper.
        summarizer_kwargs: optional kwargs for FeatureSummarizerMapper.
        preprocessor_kwargs: optional kwargs for Preprocessor.
        exporter_kwargs: optional kwargs for DataExporterMapper.
        problem_type: ProblemType.REGRESSION or ProblemType.CLASSIFICATION;
            only consulted when ``y_var`` is truthy.
        y_var: when truthy, build the reduced target pipeline instead of
            the full feature pipeline.
        **kwargs: forwarded to the parent pipeline constructor; may carry
            ``steps`` (needed by sklearn's ``clone``/``get_params``).

    Raises:
        ValueError: if ``y_var`` is set with an unrecognized
            ``problem_type``.
    """
    # _none_to_dict turns a None override into a default kwargs dict and
    # threads the cache_manager through to each step.
    self.flattener_kwargs = _none_to_dict(
        "flattener_kwargs", flattener_kwargs, cache_manager
    )
    self.cleaner_kwargs = _none_to_dict(
        "cleaner_kwargs", cleaner_kwargs, cache_manager
    )
    self.intent_kwargs = _none_to_dict(
        "intent_kwargs", intent_kwargs, cache_manager
    )
    self.summarizer_kwargs = _none_to_dict(
        "summarizer_kwargs", summarizer_kwargs, cache_manager
    )
    # engineerer_kwargs_ = _none_to_dict(
    #     "engineerer_kwargs", engineerer_kwargs, cache_manager
    # )
    self.preprocessor_kwargs = _none_to_dict(
        "preprocessor_kwargs", preprocessor_kwargs, cache_manager
    )
    # reducer_kwargs_ = _none_to_dict(
    #     "reducer_kwargs", reducer_kwargs, cache_manager
    # )
    self.exporter_kwargs = _none_to_dict(
        "exporter_kwargs", exporter_kwargs, cache_manager
    )
    self.y_var = y_var
    self.problem_type = problem_type

    if not y_var:
        # Full feature (X) pipeline.
        steps = [
            ("data_flattener", FlattenMapper(**self.flattener_kwargs)),
            ("data_cleaner", CleanerMapper(**self.cleaner_kwargs)),
            ("intent", IntentMapper(**self.intent_kwargs)),
            (
                "feature_summarizer",
                FeatureSummarizerMapper(**self.summarizer_kwargs),
            ),
            # (
            #     "feature_engineerer",
            #     FeatureEngineererMapper(**self.engineerer_kwargs_),
            # ),
            (
                "feature_preprocessor",
                Preprocessor(**self.preprocessor_kwargs),
            ),
            # (
            #     "feature_reducer",
            #     FeatureReducerMapper(**self.reducer_kwargs)
            # ),
            (
                "feature_exporter",
                DataExporterMapper(**self.exporter_kwargs),
            ),
        ]
    else:
        # Target (y) pipeline: summarize only for regression, add a
        # categorical label-encoding step for classification.
        if problem_type == ProblemType.REGRESSION:
            steps = [
                (
                    "feature_summarizer",
                    FeatureSummarizerMapper(
                        y_var, problem_type, **self.summarizer_kwargs
                    ),
                )
            ]
        elif problem_type == ProblemType.CLASSIFICATION:
            steps = [
                (
                    "feature_summarizer",
                    FeatureSummarizerMapper(
                        y_var, problem_type, **self.summarizer_kwargs
                    ),
                ),
                (
                    "output",
                    CategoricalEncoder(
                        y_var=True, cache_manager=cache_manager
                    ),
                ),
            ]
        else:
            raise ValueError(
                "Invalid Problem " "Type {}".format(problem_type)
            )

    if "steps" in kwargs:
        # needed for sklearn estimator clone,
        # which will try to init the object using get_params.
        steps = kwargs.pop("steps")

    self.cache_manager = cache_manager
    # Fix: y_var and problem_type were redundantly re-assigned here;
    # they are already set once above before building the steps.
    super().__init__(steps, **kwargs)