Beispiel #1
0
def test_automl():
    """Run an AutoMLTransformer wrapping a fitted imputing pipeline through ``_test_model_impl``.

    The feature side is a ``ColumnTransformer`` (applying a constant-strategy
    ``RobustImputer`` to all four columns) wrapped in a ``Pipeline``; the
    target side is an ``NALabelEncoder``. Both are fitted on the same small
    float32 matrix before being handed to ``AutoMLTransformer``.
    """
    st_helper = SklearnTestHelper()

    # 4x4 float32 input with NaN entries in three of the four rows.
    samples = np.array(
        [
            [4, 5, np.nan, 7],
            [0, np.nan, 2, 3],
            [8, 9, 10, 11],
            [np.nan, 13, 14, 15],
        ],
        dtype=np.float32,
    )

    imputer_steps = [
        ("robustimputer", RobustImputer(fill_values=np.nan, strategy="constant")),
    ]
    numeric_pipeline = Pipeline(steps=imputer_steps)

    all_columns = [0, 1, 2, 3]
    column_transformer = ColumnTransformer(
        transformers=[("numeric_processing", numeric_pipeline, all_columns)]
    )
    column_transformer.fit(samples)

    # The already-fitted column transformer becomes the single step of the
    # feature pipeline passed to AutoMLTransformer.
    feature_pipeline = Pipeline(steps=[("column_transformer", column_transformer)])
    header = Header(
        column_names=["x1", "x2", "x3", "class"],
        target_column_name="class",
    )

    label_encoder = NALabelEncoder()
    label_encoder.fit(samples)

    automl_transformer = AutoMLTransformer(header, feature_pipeline, label_encoder)

    # Both input dimensions are declared as relay.Any().
    input_shape = (relay.Any(), relay.Any())
    _test_model_impl(
        st_helper, automl_transformer, input_shape, samples, auto_ml=True
    )
Beispiel #2
0
def test_na_label_encoder():
    """Fit an ``NALabelEncoder`` on a single float32 row and run it through ``_test_model_impl``."""
    helper = SklearnTestHelper()
    encoder = NALabelEncoder()
    encoder.fit(np.array([[1, 2, 2, 6]], dtype=np.float32))

    # Evaluation input: one row that starts with a NaN.
    samples = np.array([[np.nan, 0, 1, 2, 6]], dtype=np.float32)

    # NOTE(review): len(samples) is the number of rows (1), not the number of
    # columns (5) -- confirm this is the intended second dimension.
    input_shape = (relay.Any(), len(samples))
    _test_model_impl(helper, encoder, input_shape, samples)
def test_automl_transformer_regression():
    """Check ``AutoMLTransformer`` with an ``NALabelEncoder`` target on the regression_na_labels fixture.

    Transforming the features alone must keep their shape; transforming the
    features concatenated with the target column must yield the exact 3x4
    array asserted below.
    """
    dataset = read_csv_data(source="test/data/csv/regression_na_labels.csv")
    # First three columns are the features, the fourth is the target.
    features, target = dataset[:, :3], dataset[:, 3]

    header = Header(
        column_names=["x1", "x2", "x3", "class"],
        target_column_name="class",
    )
    transformer = AutoMLTransformer(
        header=header,
        feature_transformer=RobustImputer(strategy="constant", fill_values=0),
        target_transformer=NALabelEncoder(),
    )
    fitted = transformer.fit(features, target)

    features_out = fitted.transform(features)
    assert features_out.shape == features.shape

    # Re-attach the target as a trailing column and transform the full table.
    target_column = target.reshape(-1, 1)
    combined = np.concatenate((features, target_column), axis=1)
    combined_out = fitted.transform(combined)

    expected = np.array(
        [
            [1.1, 1.0, 2.0, 3.0],
            [2.2, 4.0, 0.0, 5.0],
            [3.3, 12.0, 13.0, 14.0],
        ]
    )
    assert combined_out.shape == (3, 4)
    assert np.array_equal(combined_out, expected)
def test_na_label_encoder(y, y_expected):
    """Fit an ``NALabelEncoder`` on ``y`` and check that transforming ``y`` gives ``y_expected``."""
    encoder = NALabelEncoder()
    encoder.fit(y)
    assert_array_equal(encoder.transform(y), y_expected)
from sagemaker_sklearn_extension.impute import RobustMissingIndicator
from sagemaker_sklearn_extension.preprocessing import LogExtremeValuesTransformer
from sagemaker_sklearn_extension.preprocessing import NALabelEncoder
from sagemaker_sklearn_extension.preprocessing import QuadraticFeatures
from sagemaker_sklearn_extension.preprocessing import QuantileExtremeValuesTransformer
from sagemaker_sklearn_extension.preprocessing import RemoveConstantColumnsTransformer
from sagemaker_sklearn_extension.preprocessing import RobustLabelEncoder
from sagemaker_sklearn_extension.preprocessing import RobustStandardScaler
from sagemaker_sklearn_extension.preprocessing import ThresholdOneHotEncoder


@pytest.mark.parametrize(
    "Estimator",
    [
        DateTimeVectorizer(),
        LogExtremeValuesTransformer(),
        MultiColumnTfidfVectorizer(),
        NALabelEncoder(),
        QuadraticFeatures(),
        QuantileExtremeValuesTransformer(),
        RobustImputer(),
        RemoveConstantColumnsTransformer(),
        RobustLabelEncoder(),
        RobustMissingIndicator(),
        RobustStandardScaler(),
        ThresholdOneHotEncoder(),
    ],
)
def test_all_estimators(Estimator):
    """Run ``check_estimator`` on each default-constructed estimator instance.

    Parameters
    ----------
    Estimator
        An estimator instance supplied by the parametrization above.
    """
    # Deliberately not returned: a test function should return None, and
    # pytest reports a test that returns any other value.
    check_estimator(Estimator)

def to_csr(X):
    """Return an empty int8 CSR matrix with the same shape as ``X``.

    Only ``X.shape`` is used; the values in ``X`` are not copied into the
    result.
    """
    shape = X.shape
    return csr_matrix(shape, dtype=np.int8)


# Shared feature transformer for the parametrized cases below: a default
# SimpleImputer followed by a PCA keeping two components.
impute_pca_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer()),
        ("pca", PCA(n_components=2)),
    ]
)


@pytest.mark.parametrize(
    "feature_transformer, target_transformer, "
    "expected_X_transformed_shape, expected_Xy_transformed_shape",
    [
        (impute_pca_pipeline, LabelEncoder(), (10, 2), (10, 3)),
        (impute_pca_pipeline, NALabelEncoder(), (10, 2), (9, 3)),
        (FunctionTransformer(to_csr, validate=False), None, (10, 3), (9, 4)),
    ],
)
def test_automl_transformer(feature_transformer, target_transformer,
                            expected_X_transformed_shape,
                            expected_Xy_transformed_shape):
    X = np.arange(0, 3 * 10).reshape((10, 3)).astype(np.str)
    y = np.array([0] * 5 + [1] * 4 + [np.nan]).astype(np.str)

    header = Header(column_names=["x1", "x2", "x3", "class"],
                    target_column_name="class")
    automl_transformer = AutoMLTransformer(
        header=header,
        feature_transformer=feature_transformer,
        target_transformer=target_transformer,