Ejemplo n.º 1
0
def test_pass_categorical_and_numeric_columns_to_pipeline(
        dask_client, data_type, include_categorical):
    """Fit one default pipeline on data with an optional categorical column.

    The test builds a synthetic classification dataset, optionally appends a
    constant column that is declared categorical, fits a single default
    ``random_forest`` configuration through ``fit_pipeline`` and checks the
    reported cost, the number of predictions and the feature types recorded
    on the pipeline's ``data_preprocessing`` step.

    NOTE(review): a function with this exact name is defined again further
    down in this file; the later definition replaces this one at import time.

    Parameters
    ----------
    dask_client:
        Forwarded unchanged to ``AutoSklearnClassifier(dask_client=...)``.
    data_type: str
        Must contain ``'pandas'`` or ``'numpy'``; selects the input container.
    include_categorical: bool
        When truthy, one extra constant column is appended as the last column
        and treated as categorical.
    """
    # Prepare the training data
    X, y = sklearn.datasets.make_classification()
    feat_type = None
    if 'pandas' in data_type:
        X = pd.DataFrame(X)
        y = pd.DataFrame(y, dtype="category")
        if include_categorical:
            # The new column is labelled with the next free integer position.
            cat_name = X.shape[1]
            X[cat_name] = 'A'
            X[cat_name] = X[cat_name].astype('category')
    elif 'numpy' in data_type:
        if include_categorical:
            # numpy carries no dtype information per column, so the feature
            # types are passed explicitly.
            feat_type = ['numerical'] * np.shape(X)[1]
            feat_type.append('categorical')
            # Widen X by one all-zero column which acts as the categorical.
            temporal = np.zeros((X.shape[0], X.shape[1] + 1))
            temporal[:, :-1] = X
            X = temporal
    else:
        pytest.fail(f"Unsupported data_type: {data_type!r}")

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, test_size=0.5, random_state=3)

    seed = 3
    automl = AutoSklearnClassifier(
        time_left_for_this_task=120,
        # Time left for task plays no role
        # only per run time limit
        per_run_time_limit=30,
        ensemble_size=0,
        dask_client=dask_client,
        include_estimators=['random_forest'],
        seed=seed,
    )
    config = automl.get_configuration_space(
        X_train,
        y_train,
        feat_type=feat_type,
        X_test=X_test,
        y_test=y_test,
    ).get_default_configuration()

    pipeline, _, run_value = automl.fit_pipeline(X=X_train,
                                                 y=y_train,
                                                 config=config,
                                                 feat_type=feat_type,
                                                 X_test=X_test,
                                                 y_test=y_test)

    # We should produce a decent result
    assert run_value.cost < 0.4, f"{run_value}/{run_value.additional_info}"
    prediction = pipeline.predict(
        automl.automl_.InputValidator.feature_validator.transform(X))
    # One prediction per input row. This must be an equality check:
    # ``assert a, b`` would only test that ``a`` is truthy.
    assert np.shape(prediction)[0] == np.shape(y)[0]

    # Every column is numerical, except the appended last one if present.
    n_columns = np.shape(X)[1]
    expected_dict = {i: 'numerical' for i in range(n_columns)}
    if include_categorical:
        expected_dict[n_columns - 1] = 'categorical'
    assert expected_dict == pipeline.named_steps[
        'data_preprocessing'].feat_type
def test_pass_categorical_and_numeric_columns_to_pipeline(
        dask_client, data_type, include_categorical):
    """Fit one default pipeline on data with an optional categorical column.

    The test builds a seeded synthetic classification dataset, optionally
    appends an all-zero column that is declared categorical, fits a single
    default ``random_forest`` configuration through ``fit_pipeline`` and
    checks the reported cost, the number of predictions on the test split and
    the feature types recorded on the pipeline's ``data_preprocessor`` choice.

    Parameters
    ----------
    dask_client:
        Forwarded unchanged to ``AutoSklearnClassifier(dask_client=...)``.
    data_type: str
        ``'pandas'`` converts the inputs to DataFrames; ``'numpy'`` keeps the
        arrays and, together with ``include_categorical``, makes the test
        pass explicit feature types.
    include_categorical: bool
        When truthy, one extra all-zero column is appended as the last column
        and treated as categorical.
    """
    # Prepare the training data
    X, y = sklearn.datasets.make_classification(random_state=0)
    X = cast(np.ndarray, X)

    n_features = X.shape[1]

    # If categorical, insert a column of 'categorical' '0's as the last column
    if include_categorical:
        X = np.insert(X, n_features, values=0, axis=1)

    if data_type == 'pandas':
        X = pd.DataFrame(X)
        y = pd.DataFrame(y, dtype="category")

        # Set the last column to categorical. Assign the whole column rather
        # than going through ``X.loc[:, col] = ...``: on recent pandas the
        # ``.loc`` form writes the values in place and keeps the float dtype,
        # so the column would silently stay numerical.
        if include_categorical:
            X[n_features] = X[n_features].astype('category')  # type: ignore

    # Specify the feature_types; only needed for numpy, which has no
    # per-column dtype to infer them from.
    if data_type == 'numpy' and include_categorical:
        feat_type = ['numerical'] * n_features + ['categorical']
    else:
        feat_type = None

    # Create the splits
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.5, random_state=3)

    # Create Estimator
    # Time left for task plays no role for fit_pipeline
    automl = AutoSklearnClassifier(
        delete_tmp_folder_after_terminate=False,
        time_left_for_this_task=120,
        per_run_time_limit=30,
        ensemble_size=0,
        seed=0,
        dask_client=dask_client,
        include={'classifier': ['random_forest']},
    )

    config_space = automl.get_configuration_space(
        X_train,
        y_train,
        X_test=X_test,
        y_test=y_test,
        feat_type=feat_type,
    )
    config = config_space.get_default_configuration()

    pipeline, _, run_value = automl.fit_pipeline(
        X=X_train,
        y=y_train,
        X_test=X_test,
        y_test=y_test,
        config=config,
        feat_type=feat_type,
    )

    assert pipeline is not None, "Expected a pipeline from automl.fit_pipeline"

    feature_validator = automl.automl_.InputValidator.feature_validator  # type: ignore
    transformed_X_test = feature_validator.transform(X_test)
    predictions = pipeline.predict(transformed_X_test)

    # We should produce a half decent result
    assert run_value.cost < 0.40, f"Run value:\n {run_value}"

    # Outputs should be the correct length
    assert np.shape(predictions)[0] == np.shape(y_test)[0]

    # Every column is numerical, except the appended last one if present.
    n_columns = np.shape(X)[1]
    expected_types = ['numerical'] * n_columns
    if include_categorical:
        expected_types[-1] = 'categorical'
    expected_feat_types = dict(enumerate(expected_types))

    pipeline_feat_types = pipeline.named_steps[
        'data_preprocessor'].choice.feat_type
    assert expected_feat_types == pipeline_feat_types