# Example 1
def test_fit_eliminates_constant_columns_multicol_input(mode):
    """fit() drops features that are constant within every input column.

    Data properties:
    - Hour: constant throughout -> eliminated
    - Year: constant within each column (values differ across columns) -> eliminated
    - Month: constant in column 2 but not in column 1 -> kept
    - Day of month: varies in both columns -> kept
    """
    first_column = [
        parser.parse("Jan 5th, 2012"),
        parser.parse("Feb 2, 2012"),
        parser.parse("Jan 1st, 2012"),
    ]
    second_column = [
        parser.parse("Dec 2th, 2013"),
        parser.parse("Dec 3th, 2013"),
        parser.parse("Dec 3th, 2013"),
    ]

    samples = np.array([first_column, second_column]).T

    vectorizer = DateTimeVectorizer(
        mode=mode,
        extract=[
            DateTimeDefinition.HOUR.value,
            DateTimeDefinition.DAY_OF_MONTH.value,
            DateTimeDefinition.YEAR.value,
            DateTimeDefinition.MONTH.value,
        ],
    )
    vectorizer = vectorizer.fit(samples)
    # Only the non-constant features survive, in extraction order.
    assert vectorizer.extract_ == [
        DateTimeDefinition.DAY_OF_MONTH.value, DateTimeDefinition.MONTH.value
    ]
def test_fit_transform_cyclic_leaves_year():
    """In cyclic mode every feature becomes a (sin, cos) pair except YEAR,
    which is left as a single ordinal column."""
    names = [name for name in dir(DateTimeDefinition) if not name.startswith("_")]
    extract = [DateTimeDefinition.__dict__[name].value for name in names]

    dtv = DateTimeVectorizer(mode="cyclic", extract=extract, ignore_constant_columns=False)
    transformed = dtv.fit_transform(data)

    # Features before YEAR each occupy two output columns (sin/cos),
    # so the year column index is doubled.
    year_column = 2 * names.index("YEAR")
    np.testing.assert_array_equal(
        transformed[:, year_column], np.array([2012, 2011, 2012, 2012, 2012, 2018])
    )

    # Two columns per feature, minus one because YEAR contributes only one.
    assert transformed.shape[1] == len(dtv.extract_) * 2 - 1
# Example 3
def test_fit_transform_works_with_non_np_input(mode):
    """fit_transform accepts list-like (non-ndarray) input and yields a 2-D output."""
    vectorizer = DateTimeVectorizer(
        mode=mode,
        extract=[
            DateTimeDefinition.HOUR.value,
            DateTimeDefinition.SECOND.value,
            DateTimeDefinition.YEAR.value,
            DateTimeDefinition.MONTH.value,
        ],
    )
    result = vectorizer.fit_transform(data_array)
    n_rows, n_cols = result.shape
    assert n_rows == len(data_array)
    assert n_cols > 1
def test_transform_categorical():
    """Ordinal mode yields non-negative codes; YEAR stays raw, MONTH is zero-based."""
    names = [name for name in dir(DateTimeDefinition) if not name.startswith("_")]
    extract = [DateTimeDefinition.__dict__[name].value for name in names]
    dtv = DateTimeVectorizer(mode="ordinal", extract=extract, ignore_constant_columns=False)
    dtv.fit(data)
    result = dtv.transform(data)

    # Every ordinal code is non-negative.
    assert (result >= 0).all()

    np.testing.assert_array_equal(
        result[:, names.index("YEAR")], np.array([2012, 2011, 2012, 2012, 2012, 2018])
    )
    np.testing.assert_array_equal(
        result[:, names.index("MONTH")], np.array([0, 1, 0, 11, 0, 0])
    )
def test_fit_eliminates_constant_columns():
    """Extracted features that are constant on the fitted data are dropped."""
    dtv = DateTimeVectorizer(
        mode="ordinal",
        extract=[
            DateTimeDefinition.HOUR.value,
            DateTimeDefinition.SECOND.value,
            DateTimeDefinition.YEAR.value,
            DateTimeDefinition.MONTH.value,
        ],
    )
    # Keep every other sample; on this subset year and month never vary.
    subset = data.reshape((-1, 2))[:, 0].reshape((-1, 1))
    dtv = dtv.fit(subset)
    # YEAR and MONTH were constant, so only HOUR and SECOND remain.
    assert dtv.extract_ == [DateTimeDefinition.HOUR.value, DateTimeDefinition.SECOND.value]
# Example 6
def test_cyclic_transform_outputs_correct_cyclic_values(data_shape):
    """Every (sin, cos) pair produced by _cyclic_transform lies on the unit circle."""
    n_items = int(np.prod(data_shape))
    values = np.arange(n_items).reshape(data_shape)
    transformed = DateTimeVectorizer._cyclic_transform(values, low=0, high=n_items - 1)
    squared_pairs = transformed.reshape((-1, 2)) ** 2
    # sin^2 + cos^2 == 1 for each pair, up to numerical tolerance.
    assert np.linalg.norm(squared_pairs.sum(axis=1) - 1) < 1e-8
# Example 7
def test_date_time_vectorizer():
    """Compile a fitted DateTimeVectorizer's transform through the TVM sklearn
    helper and compare its output against the pure-Python transform.

    NOTE(review): the cyclic-mode case at the bottom compiles but never runs or
    asserts — looks truncated; confirm against the original test.
    """
    from sagemaker_sklearn_extension.feature_extraction.date_time import DateTimeVectorizer

    st_helper = SklearnTestHelper()

    # Raw date strings, one sample per row.
    data = np.array(
        [
            "Jan 3th, 2018, 1:34am",
            "Feb 11th, 2012, 11:34:59pm",
        ]
    ).reshape((-1, 1))

    # Pre-extracted numeric features fed to the compiled graph.
    # presumably column order matches the vectorizer's extraction order — TODO confirm
    translated_data = np.array(
        [[3, 2018, 1, 34, 0, 1, 1], [6, 2012, 23, 34, 59, 2, 6]], dtype=np.float32
    )

    date_time = DateTimeVectorizer(mode="ordinal", ignore_constant_columns=True)
    python_out = date_time.fit_transform(data)

    # Dynamic batch dimension, 7 extracted features.
    dshape = (relay.Any(), 7)
    st_helper.compile(date_time, dshape, "float32", "transform")

    # Compiled output must match the pure-Python transform.
    tvm_out = st_helper.run(translated_data)
    tvm.testing.assert_allclose(python_out, tvm_out, rtol=1e-5, atol=1e-5)

    # Repeat compilation for cyclic mode (no run/assert here — see NOTE above).
    date_time = DateTimeVectorizer(mode="cyclic", ignore_constant_columns=True)
    python_out = date_time.fit_transform(data)

    dshape = (relay.Any(), 7)
    st_helper.compile(date_time, dshape, "float32", "transform")
# Example 8
def test_cyclic_transform_outputs_correct_shape(data_shape):
    """_cyclic_transform doubles the trailing axis (sin and cos per value),
    and each output pair lies on the unit circle."""
    count = int(np.prod(data_shape))
    values = np.arange(count).reshape(data_shape)
    result = DateTimeVectorizer._cyclic_transform(values, low=0, high=count - 1)

    # Same shape as the input except the last dimension is doubled.
    expected = list(data_shape)
    expected[-1] = expected[-1] * 2
    assert result.shape == tuple(expected)

    squared_pairs = result.reshape((-1, 2)) ** 2
    # sin^2 + cos^2 == 1 for each pair, up to numerical tolerance.
    assert np.linalg.norm(squared_pairs.sum(axis=1) - 1) < 1e-8
# Example 9
def test_fit_transform_default_datetime():
    """Fields absent from the input (year, month) fall back to default_datetime,
    while the weekday is still parsed from the strings."""
    weekday_rows = [["Monday"], ["Tuesday"], ["Friday"]]

    dtv = DateTimeVectorizer(
        mode="ordinal",
        ignore_constant_columns=False,
        default_datetime=datetime(1900, 1, 1),
    )
    processed = dtv.fit_transform(weekday_rows)

    year_col = dtv.extract_.index(DateTimeDefinition.YEAR.value)
    month_col = dtv.extract_.index(DateTimeDefinition.MONTH.value)
    weekday_col = dtv.extract_.index(DateTimeDefinition.WEEKDAY.value)

    # Monday=0, Tuesday=1, Friday=4 (zero-based weekday codes).
    for row, expected_weekday in enumerate([0, 1, 4]):
        assert processed[row, year_col] == 1900
        assert processed[row, month_col] == 0
        assert processed[row, weekday_col] == expected_weekday
# Example 10
def build_feature_transform():
    """Return the sklearn Pipeline that performs all feature processing.

    Groups the input columns into numeric, categorical and datetime sets,
    applies a dedicated sub-pipeline to each group via a ColumnTransformer,
    then scales everything with RobustStandardScaler.
    """
    # Columns that can be parsed as numeric.
    numeric_columns = HEADER.as_feature_indices([
        'age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
        'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'
    ])

    # Columns with a relatively small number of unique values.
    categorical_columns = HEADER.as_feature_indices([
        'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
        'poutcome'
    ])

    # Columns that can be parsed as date or time.
    datetime_columns = HEADER.as_feature_indices(['month', 'day_of_week'])

    numeric_pipeline = Pipeline(
        steps=[('robustimputer',
                RobustImputer(strategy='constant', fill_values=nan))])

    categorical_pipeline = Pipeline(
        steps=[('thresholdonehotencoder',
                ThresholdOneHotEncoder(threshold=301))])

    datetime_pipeline = Pipeline(
        steps=[('datetimevectorizer', DateTimeVectorizer(mode='ordinal'))])

    column_transformer = ColumnTransformer(
        transformers=[
            ('numeric_processing', numeric_pipeline, numeric_columns),
            ('categorical_processing', categorical_pipeline, categorical_columns),
            ('datetime_processing', datetime_pipeline, datetime_columns),
        ])

    return Pipeline(
        steps=[
            ('column_transformer', column_transformer),
            ('robuststandardscaler', RobustStandardScaler()),
        ])
def test_fit_transform_accepts_mixed_str_datetime():
    """Mixed parseable/unparseable entries: parseable rows get values,
    unparseable ones (int, None, impossible date) become NaN."""
    mixed_rows = data_array + [
        ["Feb 12th, 15:33, 2011"],
        ["Nov 5th, 1am, 1975"],
        [432],
        [None],
        ["Feb 45th, 2018"],
    ]

    dtv = DateTimeVectorizer(mode="ordinal")
    processed = dtv.fit_transform(mixed_rows)
    year_col = dtv.extract_.index(DateTimeDefinition.YEAR.value)
    assert processed[0, year_col] == 2012
    assert processed[-4, year_col] == 1975
    # The int, None and impossible-date rows cannot be parsed -> NaN.
    for row in (-3, -2, -1):
        assert np.isnan(processed[row, year_col])

    dtv = DateTimeVectorizer(mode="cyclic")
    processed = dtv.fit_transform(mixed_rows)
    assert np.isnan(processed[-1]).all()
    assert not np.isnan(processed[-4]).any()
    assert not np.isnan(processed[0]).any()
from sagemaker_sklearn_extension.impute import RobustImputer
from sagemaker_sklearn_extension.impute import RobustMissingIndicator
from sagemaker_sklearn_extension.preprocessing import LogExtremeValuesTransformer
from sagemaker_sklearn_extension.preprocessing import NALabelEncoder
from sagemaker_sklearn_extension.preprocessing import QuadraticFeatures
from sagemaker_sklearn_extension.preprocessing import QuantileExtremeValuesTransformer
from sagemaker_sklearn_extension.preprocessing import RemoveConstantColumnsTransformer
from sagemaker_sklearn_extension.preprocessing import RobustLabelEncoder
from sagemaker_sklearn_extension.preprocessing import RobustStandardScaler
from sagemaker_sklearn_extension.preprocessing import ThresholdOneHotEncoder


@pytest.mark.parametrize(
    "Estimator",
    [
        DateTimeVectorizer(),
        LogExtremeValuesTransformer(),
        MultiColumnTfidfVectorizer(),
        NALabelEncoder(),
        QuadraticFeatures(),
        QuantileExtremeValuesTransformer(),
        RobustImputer(),
        RemoveConstantColumnsTransformer(),
        RobustLabelEncoder(),
        RobustMissingIndicator(),
        RobustStandardScaler(),
        ThresholdOneHotEncoder(),
    ],
)
def test_all_estimators(Estimator):
    """Run sklearn's estimator-API compliance checks on each transformer.

    check_estimator raises on any violation, so it is called for its side
    effect only. Fix: the original ``return check_estimator(...)`` made the
    test return a non-None value, which pytest flags with
    PytestReturnNotNoneWarning (and treats as an error in newer versions).
    """
    check_estimator(Estimator)