def test_fit_eliminates_constant_columns_multicol_input(mode):
    # Two datetime columns with known constant/varying fields:
    #   Hour:          constant everywhere                        -> eliminated
    #   Year:          constant within each column (the value     -> eliminated
    #                  differs across columns)
    #   Month:         varies in column 1, constant in column 2   -> kept
    #   Day of month:  varies in both columns                     -> kept
    first_column = [
        parser.parse("Jan 5th, 2012"),
        parser.parse("Feb 2, 2012"),
        parser.parse("Jan 1st, 2012"),
    ]
    second_column = [
        parser.parse("Dec 2th, 2013"),
        parser.parse("Dec 3th, 2013"),
        parser.parse("Dec 3th, 2013"),
    ]
    cur_data = np.array([first_column, second_column]).T
    dtv = DateTimeVectorizer(
        mode=mode,
        extract=[
            DateTimeDefinition.HOUR.value,
            DateTimeDefinition.DAY_OF_MONTH.value,
            DateTimeDefinition.YEAR.value,
            DateTimeDefinition.MONTH.value,
        ],
    )
    dtv = dtv.fit(cur_data)
    # Only the non-constant extractions should survive the fit.
    assert dtv.extract_ == [DateTimeDefinition.DAY_OF_MONTH.value, DateTimeDefinition.MONTH.value]
def test_fit_transform_cyclic_leaves_year():
    # Use every available extraction so YEAR is guaranteed to be included.
    extract_keys = [key for key in dir(DateTimeDefinition) if not key.startswith("_")]
    all_extracts = [DateTimeDefinition.__dict__[key].value for key in extract_keys]
    dtv = DateTimeVectorizer(mode="cyclic", extract=all_extracts, ignore_constant_columns=False)
    output = dtv.fit_transform(data)
    # Cyclic features occupy two output columns each, so the raw YEAR column
    # sits at twice its extraction index.
    loc_year = 2 * extract_keys.index("YEAR")
    np.testing.assert_array_equal(
        output[:, loc_year], np.array([2012, 2011, 2012, 2012, 2012, 2018])
    )
    # One column fewer than two-per-feature because YEAR contributes a single column.
    assert output.shape[1] == len(dtv.extract_) * 2 - 1
def test_fit_transform_works_with_non_np_input(mode):
    vectorizer = DateTimeVectorizer(
        mode=mode,
        extract=[
            DateTimeDefinition.HOUR.value,
            DateTimeDefinition.SECOND.value,
            DateTimeDefinition.YEAR.value,
            DateTimeDefinition.MONTH.value,
        ],
    )
    # ``data_array`` is a plain Python sequence (not an ndarray); fit_transform
    # must accept it and return one row per input sample.
    result = vectorizer.fit_transform(data_array)
    assert result.shape[0] == len(data_array)
    assert result.shape[1] > 1
def test_transform_categorical():
    keys = [name for name in dir(DateTimeDefinition) if not name.startswith("_")]
    dtv = DateTimeVectorizer(
        mode="ordinal",
        extract=[DateTimeDefinition.__dict__[name].value for name in keys],
        ignore_constant_columns=False,
    )
    dtv.fit(data)
    output = dtv.transform(data)
    # Ordinal encoding should never produce negative values.
    assert np.all(output >= 0)
    expected_years = np.array([2012, 2011, 2012, 2012, 2012, 2018])
    np.testing.assert_array_equal(output[:, keys.index("YEAR")], expected_years)
    # Months are zero-based: January == 0, December == 11.
    expected_months = np.array([0, 1, 0, 11, 0, 0])
    np.testing.assert_array_equal(output[:, keys.index("MONTH")], expected_months)
def test_fit_eliminates_constant_columns():
    dtv = DateTimeVectorizer(
        mode="ordinal",
        extract=[
            DateTimeDefinition.HOUR.value,
            DateTimeDefinition.SECOND.value,
            DateTimeDefinition.YEAR.value,
            DateTimeDefinition.MONTH.value,
        ],
    )
    # Keep every other item of ``data``; within that subset year and month
    # never change, so both should be dropped during fit.
    every_other = data.reshape((-1, 2))[:, 0].reshape((-1, 1))
    dtv = dtv.fit(every_other)
    assert dtv.extract_ == [DateTimeDefinition.HOUR.value, DateTimeDefinition.SECOND.value]
def test_cyclic_transform_outputs_correct_cyclic_values(data_shape):
    n_values = int(np.prod(data_shape))
    values = np.arange(n_values).reshape(data_shape)
    transformed = DateTimeVectorizer._cyclic_transform(values, low=0, high=n_values - 1)
    # Each input value maps to a (sin, cos)-style pair, so the squared entries
    # of every pair must sum to 1.
    pair_squares = transformed.reshape((-1, 2)) ** 2
    assert np.linalg.norm(np.sum(pair_squares, axis=1) - 1) < 1e-8
def test_date_time_vectorizer():
    from sagemaker_sklearn_extension.feature_extraction.date_time import DateTimeVectorizer

    st_helper = SklearnTestHelper()
    raw = np.array(
        [
            "Jan 3th, 2018, 1:34am",
            "Feb 11th, 2012, 11:34:59pm",
        ]
    ).reshape((-1, 1))
    # Pre-computed ordinal encoding of ``raw`` used as input to the compiled model.
    translated_data = np.array(
        [[3, 2018, 1, 34, 0, 1, 1], [6, 2012, 23, 34, 59, 2, 6]], dtype=np.float32
    )

    # Ordinal mode: compiled output must match the Python implementation.
    date_time = DateTimeVectorizer(mode="ordinal", ignore_constant_columns=True)
    python_out = date_time.fit_transform(raw)
    dshape = (relay.Any(), 7)
    st_helper.compile(date_time, dshape, "float32", "transform")
    tvm_out = st_helper.run(translated_data)
    tvm.testing.assert_allclose(python_out, tvm_out, rtol=1e-5, atol=1e-5)

    # Cyclic mode.
    date_time = DateTimeVectorizer(mode="cyclic", ignore_constant_columns=True)
    python_out = date_time.fit_transform(raw)
    dshape = (relay.Any(), 7)
    st_helper.compile(date_time, dshape, "float32", "transform")
    # NOTE(review): the cyclic branch only exercises compilation — there is no
    # ``st_helper.run`` / assert_allclose here; confirm whether that is intended.
def test_cyclic_transform_outputs_correct_shape(data_shape):
    n_values = int(np.prod(data_shape))
    values = np.arange(n_values).reshape(data_shape)
    transformed = DateTimeVectorizer._cyclic_transform(values, low=0, high=n_values - 1)
    # The transform doubles the size of the last axis (one (sin, cos)-style
    # pair per input value) and keeps every other axis unchanged.
    expected_shape = (*data_shape[:-1], data_shape[-1] * 2)
    assert transformed.shape == expected_shape
    # Squared entries of each pair must sum to 1.
    pair_squares = transformed.reshape((-1, 2)) ** 2
    assert np.linalg.norm(np.sum(pair_squares, axis=1) - 1) < 1e-8
def test_fit_transform_default_datetime():
    # Weekday-only strings carry no year/month information, so the missing
    # fields must fall back to the supplied default_datetime (Jan 1st, 1900).
    weekday_rows = [["Monday"], ["Tuesday"], ["Friday"]]
    dtv = DateTimeVectorizer(
        mode="ordinal", ignore_constant_columns=False, default_datetime=datetime(1900, 1, 1)
    )
    processed = dtv.fit_transform(weekday_rows)
    year_col = dtv.extract_.index(DateTimeDefinition.YEAR.value)
    month_col = dtv.extract_.index(DateTimeDefinition.MONTH.value)
    weekday_col = dtv.extract_.index(DateTimeDefinition.WEEKDAY.value)
    # Row index -> expected zero-based weekday (Monday=0, Tuesday=1, Friday=4).
    expected_weekdays = {0: 0, 1: 1, 2: 4}
    for row, weekday in expected_weekdays.items():
        assert processed[row, year_col] == 1900
        assert processed[row, month_col] == 0
        assert processed[row, weekday_col] == weekday
def build_feature_transform():
    """ Returns the model definition representing feature processing."""
    # These features can be parsed as numeric.
    numeric = HEADER.as_feature_indices([
        'age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
        'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'
    ])

    # These features contain a relatively small number of unique items.
    categorical = HEADER.as_feature_indices([
        'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
        'poutcome'
    ])

    # These features can be parsed as date or time.
    # (renamed from ``datetime`` to avoid shadowing the stdlib module name)
    datetime_cols = HEADER.as_feature_indices(['month', 'day_of_week'])

    numeric_processors = Pipeline(
        steps=[('robustimputer', RobustImputer(strategy='constant', fill_values=nan))]
    )
    categorical_processors = Pipeline(
        steps=[('thresholdonehotencoder', ThresholdOneHotEncoder(threshold=301))]
    )
    datetime_processors = Pipeline(
        steps=[('datetimevectorizer', DateTimeVectorizer(mode='ordinal'))]
    )

    column_transformer = ColumnTransformer(
        transformers=[
            ('numeric_processing', numeric_processors, numeric),
            ('categorical_processing', categorical_processors, categorical),
            ('datetime_processing', datetime_processors, datetime_cols),
        ]
    )

    return Pipeline(
        steps=[
            ('column_transformer', column_transformer),
            ('robuststandardscaler', RobustStandardScaler()),
        ]
    )
def test_fit_transform_accepts_mixed_str_datetime():
    mixed_rows = data_array + [
        ["Feb 12th, 15:33, 2011"],
        ["Nov 5th, 1am, 1975"],
        [432],
        [None],
        ["Feb 45th, 2018"],
    ]
    dtv = DateTimeVectorizer(mode="ordinal")
    processed = dtv.fit_transform(mixed_rows)
    year_col = dtv.extract_.index(DateTimeDefinition.YEAR.value)
    assert processed[0, year_col] == 2012
    assert processed[-4, year_col] == 1975
    # Unparseable entries (a bare int, None, an impossible date) become NaN.
    for row in (-3, -2, -1):
        assert np.isnan(processed[row, year_col])

    dtv = DateTimeVectorizer(mode="cyclic")
    processed = dtv.fit_transform(mixed_rows)
    # The unparseable last row is all-NaN; valid rows contain no NaN at all.
    assert all(np.isnan(processed[-1]))
    assert not any(np.isnan(processed[-4]))
    assert not any(np.isnan(processed[0]))
from sagemaker_sklearn_extension.impute import RobustImputer
from sagemaker_sklearn_extension.impute import RobustMissingIndicator
from sagemaker_sklearn_extension.preprocessing import LogExtremeValuesTransformer
from sagemaker_sklearn_extension.preprocessing import NALabelEncoder
from sagemaker_sklearn_extension.preprocessing import QuadraticFeatures
from sagemaker_sklearn_extension.preprocessing import QuantileExtremeValuesTransformer
from sagemaker_sklearn_extension.preprocessing import RemoveConstantColumnsTransformer
from sagemaker_sklearn_extension.preprocessing import RobustLabelEncoder
from sagemaker_sklearn_extension.preprocessing import RobustStandardScaler
from sagemaker_sklearn_extension.preprocessing import ThresholdOneHotEncoder


@pytest.mark.parametrize(
    "Estimator",
    [
        DateTimeVectorizer(),
        LogExtremeValuesTransformer(),
        MultiColumnTfidfVectorizer(),
        NALabelEncoder(),
        QuadraticFeatures(),
        QuantileExtremeValuesTransformer(),
        RobustImputer(),
        RemoveConstantColumnsTransformer(),
        RobustLabelEncoder(),
        RobustMissingIndicator(),
        RobustStandardScaler(),
        ThresholdOneHotEncoder(),
    ],
)
def test_all_estimators(Estimator):
    """Run scikit-learn's estimator-API compliance checks on each estimator.

    ``check_estimator`` raises on failure, so it is enough to call it;
    returning its result from a pytest test triggers
    ``PytestReturnNotNoneWarning`` (an error in newer pytest versions).
    """
    check_estimator(Estimator)