Example #1
def score_pipelines(pipelines, engine):
    # Submit all training jobs first so they can run concurrently.
    futures = []
    for pipeline in pipelines:
        futures.append(engine.submit_training_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                  automl_config=automl_data, pipeline=pipeline))
    # Collect the fitted pipelines, then submit the scoring jobs.
    pipelines = [f.get_result() for f in futures]
    futures = []
    for pipeline in pipelines:
        futures.append(engine.submit_scoring_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                 automl_config=automl_data, pipeline=pipeline,
                                                 objectives=[automl_data.objective]))
    results = [f.get_result() for f in futures]
    return results
Example #2
def test_compute_final_component_features_nonlinear(mock_en_predict, mock_rf_predict, mock_ohe, mock_imputer, example_graph, X_y_binary):
    X, y = X_y_binary
    mock_imputer.return_value = ww.DataTable(pd.DataFrame(X))
    mock_ohe.return_value = ww.DataTable(pd.DataFrame(X))
    mock_en_predict.return_value = ww.DataColumn(pd.Series(np.ones(X.shape[0])))
    mock_rf_predict.return_value = ww.DataColumn(pd.Series(np.zeros(X.shape[0])))
    X_expected = pd.DataFrame({'Random Forest': np.zeros(X.shape[0]), 'Elastic Net': np.ones(X.shape[0])})
    component_graph = ComponentGraph(example_graph).instantiate({})
    component_graph.fit(X, y)

    X_t = component_graph.compute_final_component_features(X)
    assert_frame_equal(X_expected, X_t.to_dataframe())
    assert mock_imputer.call_count == 2
    assert mock_ohe.call_count == 4
Example #3
def test_delay_feature_transformer_woodwork_custom_overrides_returned_by_components(X_df, fit_transform):
    y = pd.Series([1, 2, 1])
    override_types = [Integer, Double, Categorical, Datetime, Boolean]
    for logical_type in override_types:
        try:
            X = ww.DataTable(X_df, logical_types={0: logical_type})
        except TypeError:
            continue
        dft = DelayedFeatureTransformer(max_delay=1, gap=11)
        if fit_transform:
            transformed = dft.fit_transform(X, y)
        else:
            dft.fit(X, y)
            transformed = dft.transform(X, y)
        assert isinstance(transformed, ww.DataTable)
        if logical_type in [Integer, Double, Categorical]:
            assert transformed.logical_types == {0: logical_type,
                                                 '0_delay_1': Double,
                                                 'target_delay_0': Integer,
                                                 'target_delay_1': Double}
        else:
            assert transformed.logical_types == {0: logical_type,
                                                 '0_delay_1': logical_type,
                                                 'target_delay_0': Integer,
                                                 'target_delay_1': Double}
Example #4
def test_rename_column_names_to_numeric():
    X = np.array([[1, 2], [3, 4]])
    pd.testing.assert_frame_equal(_rename_column_names_to_numeric(X),
                                  pd.DataFrame(X))

    X = pd.DataFrame({"<>": [1, 2], ">>": [2, 4]})
    pd.testing.assert_frame_equal(_rename_column_names_to_numeric(X),
                                  pd.DataFrame({
                                      0: [1, 2],
                                      1: [2, 4]
                                  }))

    X = ww.DataTable(pd.DataFrame({"<>": [1, 2], ">>": [2, 4]}),
                     logical_types={"<>": "categorical", ">>": "categorical"})
    X_renamed = _rename_column_names_to_numeric(X)
    X_expected = pd.DataFrame({
        0: pd.Series([1, 2], dtype="category"),
        1: pd.Series([2, 4], dtype="category")
    })
    pd.testing.assert_frame_equal(X_renamed.to_dataframe(), X_expected)
    assert X_renamed.logical_types == {
        0: ww.logical_types.Categorical,
        1: ww.logical_types.Categorical
    }
Example #5
def test_binary_classification_predictions_thresholded_properly(
        mock_predict, mock_predict_proba, mock_obj_decision, mock_decode,
        X_y_binary, dummy_ts_binary_pipeline_class):
    mock_objs = [mock_decode, mock_predict]
    mock_decode.return_value = pd.Series([0, 1])
    X, y = X_y_binary
    binary_pipeline = dummy_ts_binary_pipeline_class(
        parameters={"Logistic Regression Classifier": {"n_jobs": 1},
                    "pipeline": {"gap": 0, "max_delay": 0}})
    # test no objective passed and no custom threshold uses underlying estimator's predict method
    binary_pipeline.fit(X, y)
    binary_pipeline.predict(X, y)
    for mock_obj in mock_objs:
        mock_obj.assert_called()
        mock_obj.reset_mock()

    # test objective passed but no custom threshold uses underlying estimator's predict method
    binary_pipeline.predict(X, y, 'precision')
    for mock_obj in mock_objs:
        mock_obj.assert_called()
        mock_obj.reset_mock()

    mock_objs = [mock_decode, mock_predict_proba]
    # test custom threshold set but no objective passed
    mock_predict_proba.return_value = ww.DataTable(
        pd.DataFrame([[0.1, 0.2], [0.1, 0.2]]))
    binary_pipeline.threshold = 0.6
    binary_pipeline._encoder.classes_ = [0, 1]
    binary_pipeline.predict(X, y)
    for mock_obj in mock_objs:
        mock_obj.assert_called()
        mock_obj.reset_mock()
    mock_obj_decision.assert_not_called()
    mock_predict.assert_not_called()

    # test custom threshold set but no objective passed (repeated after resetting the mocks)
    binary_pipeline.threshold = 0.6
    binary_pipeline.predict(X, y)
    for mock_obj in mock_objs:
        mock_obj.assert_called()
        mock_obj.reset_mock()
    mock_obj_decision.assert_not_called()
    mock_predict.assert_not_called()

    # test custom threshold set and objective passed
    binary_pipeline.threshold = 0.6
    mock_obj_decision.return_value = pd.Series([1.])
    binary_pipeline.predict(X, y, 'precision')
    for mock_obj in mock_objs:
        mock_obj.assert_called()
        mock_obj.reset_mock()
    mock_predict.assert_not_called()
    mock_obj_decision.assert_called()
Example #6
def test_transform(X_y_binary, X_y_multi, X_y_regression):
    datasets = locals()
    for dataset in datasets.values():
        X, y = dataset
        X_pd = pd.DataFrame(X)
        X_pd.columns = X_pd.columns.astype(str)
        es = ft.EntitySet()
        es = es.entity_from_dataframe(entity_id="X",
                                      dataframe=X_pd,
                                      index='index',
                                      make_index=True)
        matrix, features = ft.dfs(entityset=es, target_entity="X")

        feature = DFSTransformer()
        feature.fit(X)
        X_feature_matrix = feature.transform(X)

        pd.testing.assert_frame_equal(matrix, X_feature_matrix)
        assert features == feature.features

        feature.fit(X, y)
        feature.transform(X)

        X_ww = ww.DataTable(X_pd)
        feature.fit(X_ww)
        feature.transform(X_ww)
Example #7
def test_outliers_data_check_input_formats():
    outliers_check = OutliersDataCheck()

    # test empty pd.DataFrame
    assert outliers_check.validate(pd.DataFrame()) == {"warnings": [], "errors": []}

    # test np.array
    a = np.arange(10) * 0.01
    data = np.tile(a, (100, 10))

    X = pd.DataFrame(data=data)
    X.iloc[0, 3] = 1000
    X.iloc[3, 25] = 1000
    X.iloc[5, 55] = 10000
    X.iloc[10, 72] = -1000

    outliers_check = OutliersDataCheck()
    assert outliers_check.validate(X.to_numpy()) == {
        "warnings": [DataCheckWarning(message="Column(s) '3', '25', '55', '72' are likely to have outlier data.",
                                      data_check_name=outliers_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_OUTLIERS,
                                      details={"columns": [3, 25, 55, 72]}).to_dict()],
        "errors": []
    }

    # test Woodwork
    outliers_check = OutliersDataCheck()
    assert outliers_check.validate(ww.DataTable(X)) == {
        "warnings": [DataCheckWarning(message="Column(s) '3', '25', '55', '72' are likely to have outlier data.",
                                      data_check_name=outliers_data_check_name,
                                      message_code=DataCheckMessageCode.HAS_OUTLIERS,
                                      details={"columns": [3, 25, 55, 72]}).to_dict()],
        "errors": []
    }
Example #8
def test_highly_null_data_check_input_formats():
    highly_null_check = HighlyNullDataCheck(pct_null_threshold=0.8)

    # test empty pd.DataFrame
    assert highly_null_check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []}

    #  test Woodwork
    ww_input = ww.DataTable(pd.DataFrame([[None, None, None, None, 0], [None, None, None, "hi", 5]]))
    assert highly_null_check.validate(ww_input) == {
        "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": 0}).to_dict(),
                     DataCheckWarning(message="Column '1' is 80.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": 1}).to_dict(),
                     DataCheckWarning(message="Column '2' is 80.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": 2}).to_dict()],
        "errors": [],
        "actions": []
    }

    #  test 2D list
    assert highly_null_check.validate([[None, None, None, None, 0], [None, None, None, "hi", 5]]) == {
        "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": 0}).to_dict(),
                     DataCheckWarning(message="Column '1' is 80.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": 1}).to_dict(),
                     DataCheckWarning(message="Column '2' is 80.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": 2}).to_dict()],
        "errors": [],
        "actions": []
    }

    # test np.array
    assert highly_null_check.validate(np.array([[None, None, None, None, 0], [None, None, None, "hi", 5]])) == {
        "warnings": [DataCheckWarning(message="Column '0' is 80.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": 0}).to_dict(),
                     DataCheckWarning(message="Column '1' is 80.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": 1}).to_dict(),
                     DataCheckWarning(message="Column '2' is 80.0% or more null",
                                      data_check_name=highly_null_data_check_name,
                                      message_code=DataCheckMessageCode.HIGHLY_NULL,
                                      details={"column": 2}).to_dict()],
        "errors": [],
        "actions": []
    }
Example #9
def test_feature_selectors_woodwork_custom_overrides_returned_by_components(X_df):
    rf_classifier, rf_regressor = make_rf_feature_selectors()
    y = pd.Series([1, 2, 1])
    X_df['another column'] = pd.Series([1., 2., 3.], dtype="float")
    override_types = [Integer, Double, Boolean]
    for logical_type in override_types:
        try:
            X = ww.DataTable(X_df, logical_types={0: logical_type})
        except TypeError:
            continue

        rf_classifier.fit(X, y)
        transformed = rf_classifier.transform(X, y)
        assert isinstance(transformed, ww.DataTable)
        assert transformed.logical_types == {
            0: logical_type,
            'another column': Double
        }

        rf_regressor.fit(X, y)
        transformed = rf_regressor.transform(X, y)
        assert isinstance(transformed, ww.DataTable)
        assert transformed.logical_types == {
            0: logical_type,
            'another column': Double
        }
Example #10
def test_make_pipeline_only_text_columns(input_type, problem_type):
    X = pd.DataFrame({"text": ["string one", "the evalml team is full of wonderful people", "text for a column, this should be a text column!!", "text string", "hello world"],
                      "another text": ["ladidididididida", "cats are great", "text for a column, this should be a text column!!", "text string", "goodbye world"]})
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)

    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            delayed_features = []
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            standard_scaler = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                standard_scaler = [StandardScaler]
            assert pipeline.component_graph == [TextFeaturizer] + delayed_features + standard_scaler + [estimator_class]
Example #11
def test_compute_final_component_features_linear(mock_ohe, mock_imputer, X_y_binary):
    X, y = X_y_binary
    X = pd.DataFrame(X)
    X_expected = X.fillna(0)
    mock_imputer.return_value = ww.DataTable(X)
    mock_ohe.return_value = ww.DataTable(X_expected)

    component_list = ['Imputer', 'One Hot Encoder', 'Random Forest Classifier']
    component_graph = ComponentGraph().from_list(component_list)
    component_graph.instantiate({})
    component_graph.fit(X, y)

    X_t = component_graph.compute_final_component_features(X)
    assert_frame_equal(X_expected, X_t.to_dataframe())
    assert mock_imputer.call_count == 2
    assert mock_ohe.call_count == 2
Example #12
def _rename_column_names_to_numeric(X, flatten_tuples=True):
    """Used in LightGBM and XGBoost estimator classes to rename column names
        when the input is a pd.DataFrame in case it has column names that contain symbols ([, ], <)
        that these estimators cannot natively handle.

    Arguments:
        X (pd.DataFrame): The input training data of shape [n_samples, n_features]
        flatten_tuples (bool): Whether to flatten MultiIndex or tuple column names. LightGBM cannot handle columns with tuple names.

    Returns:
        Transformed X where column names are renamed to numerical values
    """
    if isinstance(X, (np.ndarray, list)):
        return pd.DataFrame(X)

    if isinstance(X, ww.DataTable):
        X_t = X.to_dataframe()
    else:
        X_t = X.copy()

    if flatten_tuples and (len(X_t.columns) > 0 and isinstance(X_t.columns, pd.MultiIndex)):
        flat_col_names = list(map(str, X_t.columns))
        X_t.columns = flat_col_names
        rename_cols_dict = dict((str(col), col_num) for col_num, col in enumerate(list(X.columns)))
    else:
        rename_cols_dict = dict((col, col_num) for col_num, col in enumerate(list(X.columns)))
    X_renamed = X_t.rename(columns=rename_cols_dict)
    if isinstance(X, ww.DataTable):
        X_renamed = ww.DataTable(X_renamed)
    return X_renamed
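For illustration, a minimal usage sketch (hypothetical column names; assumes the helper above is in scope alongside pandas):

import pandas as pd

# Symbol-bearing column names get mapped to their positional index.
X = pd.DataFrame({"a<b": [1, 2], "c>d": [3, 4]})
X_renamed = _rename_column_names_to_numeric(X)
print(list(X_renamed.columns))  # [0, 1] -- safe for LightGBM/XGBoost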
Example #13
def test_make_pipeline_all_nan_no_categoricals(input_type, problem_type):
    # testing that all_null column is not considered categorical
    X = pd.DataFrame({"all_null": [np.nan, np.nan, np.nan, np.nan, np.nan],
                      "num": [1, 2, 3, 4, 5]})
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [estimator_class]
            assert pipeline.component_graph == [DropNullColumns, Imputer] + delayed_features + estimator_components
Example #14
def test_partial_dependence_with_non_numeric_columns(
        data_type, linear_regression_pipeline_class,
        logistic_regression_binary_pipeline_class):
    X = pd.DataFrame({
        'numeric': [1, 2, 3, 0],
        'also numeric': [2, 3, 4, 1],
        'string': ['a', 'b', 'a', 'c'],
        'also string': ['c', 'b', 'a', 'd']
    })
    if data_type == "ww":
        X = ww.DataTable(X)
    y = [0, 0.2, 1.4, 1]
    pipeline = linear_regression_pipeline_class(
        parameters={"Linear Regressor": {"n_jobs": 1}})
    pipeline.fit(X, y)
    part_dep = partial_dependence(pipeline, X, features='numeric')
    assert list(part_dep.columns) == ["feature_values", "partial_dependence"]
    assert len(part_dep["partial_dependence"]) == 4
    assert len(part_dep["feature_values"]) == 4
    assert not part_dep.isnull().any(axis=None)

    part_dep = partial_dependence(pipeline, X, features='string')
    assert list(part_dep.columns) == ["feature_values", "partial_dependence"]
    assert len(part_dep["partial_dependence"]) == 3
    assert len(part_dep["feature_values"]) == 3
    assert not part_dep.isnull().any(axis=None)
Example #15
def test_make_pipeline_text_columns(input_type, problem_type):
    X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                      "categorical": ["a", "b", "a", "c", "c"],
                      "text": ["string one", "another", "text for a column, this should be a text column!!", "text string", "hello world"]})
    y = pd.Series([0, 0, 1, 1, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)

    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [OneHotEncoder, estimator_class]
            assert pipeline.component_graph == [Imputer, TextFeaturizer] + delayed_features + estimator_components
Example #16
def test_make_pipeline_datetime_no_categorical(input_type, problem_type):
    X = pd.DataFrame({"numerical": [1, 2, 3, 1, 2],
                      "some dates": pd.date_range('2000-02-03', periods=5, freq='W')})
    y = pd.Series([0, 1, 1, 0, 0])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)

    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [estimator_class]
            assert pipeline.component_graph == [Imputer, DateTimeFeaturizer] + delayed_features + estimator_components
Example #17
def eval_pipelines(pipelines, engine):
    # Submit all evaluation jobs, then block on the results.
    futures = []
    for pipeline in pipelines:
        futures.append(engine.submit_evaluation_job(X=ww.DataTable(X), y=ww.DataColumn(y),
                                                    automl_config=automl_data, pipeline=pipeline))
    results = [f.get_result() for f in futures]
    return results
Example #18
def test_make_pipeline_no_column_names(input_type, problem_type):
    X = pd.DataFrame([[1, "a", np.nan], [2, "b", np.nan], [5, "b", np.nan]])
    y = pd.Series([0, 0, 1])
    if input_type == 'ww':
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
    estimators = get_estimators(problem_type=problem_type)
    pipeline_class = _get_pipeline_base_class(problem_type)
    if problem_type == ProblemTypes.MULTICLASS:
        y = pd.Series([0, 2, 1, 2])

    for estimator_class in estimators:
        if problem_type in estimator_class.supported_problem_types:
            pipeline = make_pipeline(X, y, estimator_class, problem_type)
            assert isinstance(pipeline, type(pipeline_class))
            assert pipeline.custom_hyperparameters is None
            if is_time_series(problem_type):
                delayed_features = [DelayedFeatureTransformer]
            else:
                delayed_features = []
            if estimator_class.model_family == ModelFamily.LINEAR_MODEL:
                estimator_components = [OneHotEncoder, StandardScaler, estimator_class]
            elif estimator_class.model_family == ModelFamily.CATBOOST:
                estimator_components = [estimator_class]
            else:
                estimator_components = [OneHotEncoder, estimator_class]
            assert pipeline.component_graph == [DropNullColumns, Imputer] + delayed_features + estimator_components
Example #19
def test_read_csv_with_woodwork_params(sample_df_pandas, tmpdir):
    filepath = os.path.join(tmpdir, 'sample.csv')
    sample_df_pandas.to_csv(filepath, index=False)
    logical_types = {
        'full_name': 'NaturalLanguage',
        'phone_number': 'PhoneNumber'
    }
    semantic_tags = {
        'age': ['tag1', 'tag2'],
        'is_registered': ['tag3', 'tag4']
    }
    dt_from_csv = ww.read_csv(filepath=filepath,
                              index='id',
                              time_index='signup_date',
                              logical_types=logical_types,
                              semantic_tags=semantic_tags)
    dt = ww.DataTable(sample_df_pandas,
                      index='id',
                      time_index='signup_date',
                      logical_types=logical_types,
                      semantic_tags=semantic_tags)

    assert isinstance(dt, ww.DataTable)
    assert dt_from_csv.logical_types == dt.logical_types
    assert dt_from_csv.semantic_tags == dt.semantic_tags
    pd.testing.assert_frame_equal(dt_from_csv.to_dataframe(),
                                  dt.to_dataframe())
Example #20
def test_new_dt_including(sample_df_pandas):
    # more thorough testing for this exists in indexer testing and new_dt_from_cols testing
    dt = ww.DataTable(sample_df_pandas)
    new_dt = _new_dt_including(dt, sample_df_pandas.iloc[:, 1:4])
    for col in new_dt.columns:
        assert new_dt.semantic_tags[col] == dt.semantic_tags[col]
        assert new_dt.logical_types[col] == dt.logical_types[col]
Example #21
def test_class_imbalance_nonnumeric_balanced(input_type):
    X = pd.DataFrame()
    y_bools_balanced = pd.Series([True, True, True, False, False])
    y_binary_balanced = pd.Series(["No", "Yes", "No", "Yes"])
    y_multiclass_balanced = pd.Series([
        "red", "green", "red", "red", "blue", "green", "red", "blue", "green",
        "red"
    ])
    if input_type == "ww":
        X = ww.DataTable(X)
        y_bools_balanced = ww.DataColumn(y_bools_balanced)
        y_binary_balanced = ww.DataColumn(y_binary_balanced)
        y_multiclass_balanced = ww.DataColumn(y_multiclass_balanced)

    class_imbalance_check = ClassImbalanceDataCheck(num_cv_folds=1)
    assert class_imbalance_check.validate(X, y_multiclass_balanced) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }
    assert class_imbalance_check.validate(X, y_binary_balanced) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }
    assert class_imbalance_check.validate(X, y_bools_balanced) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }
Example #22
def test_text_featurizer_woodwork_custom_overrides_returned_by_components(X_df):
    X_df = X_df.copy()
    X_df['text col'] = pd.Series(['this will be a natural language column because length', 'yay', 'hay'],
                                 dtype="string")
    y = pd.Series([1, 2, 1])
    override_types = [Integer, Double, Categorical, Boolean]
    tf = TextFeaturizer()

    for logical_type in override_types:
        try:
            X = ww.DataTable(X_df, logical_types={0: logical_type})
        except TypeError:
            continue

        tf.fit(X)
        transformed = tf.transform(X, y)
        assert isinstance(transformed, ww.DataTable)
        assert transformed.logical_types == {
            0: logical_type,
            'LSA(text col)[0]': Double,
            'LSA(text col)[1]': Double,
            'DIVERSITY_SCORE(text col)': Double,
            'MEAN_CHARACTERS_PER_WORD(text col)': Double,
            'POLARITY_SCORE(text col)': Double
        }
Example #23
def test_delay_feature_transformer_supports_custom_index(encode_X_as_str, encode_y_as_str, use_woodwork,
                                                         delayed_features_data):
    X, y = delayed_features_data

    X, X_answer, y, y_answer = encode_X_y_as_strings(X, y, encode_X_as_str, encode_y_as_str)

    X.index = pd.RangeIndex(50, 81)
    X_answer.index = pd.RangeIndex(50, 81)
    y.index = pd.RangeIndex(50, 81)
    y_answer.index = pd.RangeIndex(50, 81)

    answer = pd.DataFrame({"feature": X.feature,
                           "feature_delay_1": X_answer.feature.shift(1),
                           "feature_delay_2": X_answer.feature.shift(2),
                           "feature_delay_3": X_answer.feature.shift(3),
                           "target_delay_0": y_answer,
                           "target_delay_1": y_answer.shift(1),
                           "target_delay_2": y_answer.shift(2),
                           "target_delay_3": y_answer.shift(3)}, index=pd.RangeIndex(50, 81))

    if use_woodwork:
        X = ww.DataTable(X)
        y = ww.DataColumn(y)

    pd.testing.assert_frame_equal(DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X, y), answer)

    answer_only_y = pd.DataFrame({"target_delay_0": y_answer,
                                  "target_delay_1": y_answer.shift(1),
                                  "target_delay_2": y_answer.shift(2),
                                  "target_delay_3": y_answer.shift(3)}, index=pd.RangeIndex(50, 81))
    pd.testing.assert_frame_equal(DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X=None, y=y),
                                  answer_only_y)
Example #24
def test_datetime_featurizer_woodwork_custom_overrides_returned_by_components(with_datetime_col, encode_as_categories, X_df):
    override_types = [Integer, Double, Categorical, NaturalLanguage, Datetime]
    if with_datetime_col:
        X_df['datetime col'] = pd.to_datetime(['20200101', '20200519', '20190607'], format='%Y%m%d')
    for logical_type in override_types:
        try:
            X = ww.DataTable(X_df.copy(), logical_types={0: logical_type})
        except TypeError:
            continue
        datetime_transformer = DateTimeFeaturizer(encode_as_categories=encode_as_categories)
        datetime_transformer.fit(X)
        transformed = datetime_transformer.transform(X)
        assert isinstance(transformed, ww.DataTable)

        if with_datetime_col:
            if encode_as_categories:
                datetime_col_transformed = {'datetime col_year': Integer, 'datetime col_month': Categorical, 'datetime col_day_of_week': Categorical, 'datetime col_hour': Integer}
            else:
                datetime_col_transformed = {'datetime col_year': Integer, 'datetime col_month': Integer, 'datetime col_day_of_week': Integer, 'datetime col_hour': Integer}
            assert all(item in transformed.logical_types.items() for item in datetime_col_transformed.items())

        if logical_type == Datetime:
            if encode_as_categories:
                col_transformed = {'0_year': Integer, '0_month': Categorical, '0_day_of_week': Categorical, '0_hour': Integer}
            else:
                col_transformed = {'0_year': Integer, '0_month': Integer, '0_day_of_week': Integer, '0_hour': Integer}
            assert all(item in transformed.logical_types.items() for item in col_transformed.items())
        else:
            assert transformed.logical_types[0] == logical_type
Example #25
def _retain_custom_types_and_initalize_woodwork(old_datatable,
                                                new_dataframe,
                                                ltypes_to_ignore=None):
    """
    Helper method which will take an old Woodwork DataTable and a new pandas DataFrame and return a
    new DataTable that will try to retain as many logical types from the old DataTable that exist in the new
    pandas DataFrame as possible.

    Arguments:
        old_datatable (ww.DataTable): Woodwork DataTable to use
        new_dataframe (pd.DataFrame): Pandas data structure
        ltypes_to_ignore (list): List of Woodwork logical types to ignore. Columns from the old DataTable that have a logical type
        specified in this list will not have their logical types carried over to the new DataTable returned

    Returns:
        A new DataTable where any of the columns that exist in the old input DataTable and the new DataFrame try to retain
        the original logical type, if possible and not specified to be ignored.
    """
    retained_logical_types = {}
    if ltypes_to_ignore is None:
        ltypes_to_ignore = []
    col_intersection = set(old_datatable.columns).intersection(
        set(new_dataframe.columns))
    logical_types = old_datatable.logical_types
    for col in col_intersection:
        if logical_types[col] in ltypes_to_ignore:
            continue
        if str(new_dataframe[col].dtype) != logical_types[col].pandas_dtype:
            try:
                # Retain the old logical type only if the new column can still be cast to its dtype.
                new_dataframe[col].astype(logical_types[col].pandas_dtype)
                retained_logical_types[col] = old_datatable[col].logical_type
            except (ValueError, TypeError):
                pass
    return ww.DataTable(new_dataframe, logical_types=retained_logical_types)
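For illustration, a minimal usage sketch (hypothetical data; assumes the helper above is in scope and the legacy woodwork DataTable API used throughout these examples):

import pandas as pd
import woodwork as ww

old_dt = ww.DataTable(pd.DataFrame({"col": ["a", "b", "a"]}),
                      logical_types={"col": "categorical"})
new_df = pd.DataFrame({"col": ["a", "b"]})  # e.g. the output of a row-dropping transform
new_dt = _retain_custom_types_and_initalize_woodwork(old_dt, new_df)
# "col" keeps its Categorical logical type because the object column can still be cast to "category".
print(new_dt.logical_types["col"])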
Example #26
def test_explain_predictions_best_worst_custom_metric(mock_make_table, output_format, answer):
    mock_make_table.return_value = "table goes here" if output_format == "text" else {
        "explanations": ["explanation_dictionary_goes_here"]
    }
    pipeline = MagicMock()
    pipeline.parameters = "Parameters go here"
    input_features = pd.DataFrame({"a": [5, 6]})
    pipeline.problem_type = ProblemTypes.REGRESSION
    pipeline.name = "Test Pipeline Name"
    pipeline.compute_estimator_features.return_value = ww.DataTable(
        input_features)

    pipeline.predict.return_value = ww.DataColumn(pd.Series([2, 1]))
    y_true = pd.Series([3, 2])

    def sum(y_true, y_pred):
        return y_pred + y_true

    best_worst_report = explain_predictions_best_worst(
        pipeline,
        input_features,
        y_true=y_true,
        num_to_explain=1,
        metric=sum,
        output_format=output_format)

    if output_format == "text":
        compare_two_tables(best_worst_report.splitlines(),
                           regression_custom_metric_answer.splitlines())
    else:
        assert best_worst_report == answer
Example #27
def test_datetime_nan_check_input_formats():
    dt_nan_check = DateTimeNaNDataCheck()

    # test empty pd.DataFrame
    assert dt_nan_check.validate(pd.DataFrame()) == {
        "warnings": [],
        "errors": [],
        "actions": []
    }

    expected = {
        "warnings": [],
        "actions": [],
        "errors": [
            DataCheckError(message='Input datetime column(s) (index) contains NaN values. Please impute NaN values or drop these rows or columns.',
                           data_check_name=DateTimeNaNDataCheck.name,
                           message_code=DataCheckMessageCode.DATETIME_HAS_NAN,
                           details={"columns": 'index'}).to_dict()
        ]
    }

    dates = np.arange(np.datetime64('2017-01-01'), np.datetime64('2017-01-08'))
    dates[0] = np.datetime64('NaT')

    #  test Woodwork
    ww_input = ww.DataTable(pd.DataFrame(dates, columns=['index']))
    assert dt_nan_check.validate(ww_input) == expected

    expected = {
        "warnings": [],
        "actions": [],
        "errors": [
            DataCheckError(message='Input datetime column(s) (0) contains NaN values. Please impute NaN values or drop these rows or columns.',
                           data_check_name=DateTimeNaNDataCheck.name,
                           message_code=DataCheckMessageCode.DATETIME_HAS_NAN,
                           details={'columns': '0'}).to_dict()
        ]
    }

    #  test 2D list
    assert dt_nan_check.validate([
        dates,
        np.arange(np.datetime64('2017-01-01'), np.datetime64('2017-01-08'))
    ]) == expected

    # test np.array
    assert dt_nan_check.validate(
        np.array([
            dates,
            np.arange(np.datetime64('2017-01-01'), np.datetime64('2017-01-08'))
        ])) == expected
Example #28
def test_default_data_checks_regression(input_type):
    X = pd.DataFrame({
        'lots_of_null': [None, None, None, None, "some data"],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 5, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100]
    })
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)

    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)
    null_leakage = [
        DataCheckWarning(message="Column 'lots_of_null' is 95.0% or more correlated with the target",
                         data_check_name="TargetLeakageDataCheck",
                         message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                         details={"column": "lots_of_null"}).to_dict()
    ]
    data_checks = DefaultDataChecks(
        "regression", get_default_primary_search_objective("regression"))
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:]
    }

    # Skip Invalid Target
    assert data_checks.validate(X, y_no_variance) == {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:] + [
            DataCheckError(message="Y has 1 unique value.",
                           data_check_name="NoVarianceDataCheck",
                           message_code=DataCheckMessageCode.NO_VARIANCE,
                           details={"column": "Y"}).to_dict()
        ]
    }

    data_checks = DataChecks(
        DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES, {
            "InvalidTargetDataCheck": {
                "problem_type": "regression",
                "objective": get_default_primary_search_objective("regression")
            }
        })
    assert data_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:]
    }
Example #29
def test_default_data_checks_regression(input_type):
    X = pd.DataFrame({'lots_of_null': [None, None, None, None, "some data"],
                      'all_null': [None, None, None, None, None],
                      'also_all_null': [None, None, None, None, None],
                      'no_null': [1, 2, 3, 5, 5],
                      'id': [0, 1, 2, 3, 4],
                      'has_label_leakage': [100, 200, 100, 200, 100],
                      'natural_language_nan': [None,
                                               "string_that_is_long_enough_for_natural_language_1",
                                               "string_that_is_long_enough_for_natural_language_2",
                                               "string_that_is_long_enough_for_natural_language_3",
                                               "string_that_is_long_enough_for_natural_language_4"],
                      'nan_dt_col': pd.Series(pd.date_range('20200101', periods=5))})
    X['nan_dt_col'][0] = None
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)

    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)
    null_leakage = [DataCheckWarning(message="Column 'lots_of_null' is 95.0% or more correlated with the target",
                                     data_check_name="TargetLeakageDataCheck",
                                     message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                     details={"column": "lots_of_null"}).to_dict()]
    data_checks = DefaultDataChecks("regression", get_default_primary_search_objective("regression"))
    id_leakage_warning = [DataCheckWarning(message="Column 'id' is 95.0% or more correlated with the target",
                                           data_check_name="TargetLeakageDataCheck",
                                           message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                           details={"column": "id"}).to_dict()]
    nan_dt_leakage_warning = [DataCheckWarning(message="Column 'nan_dt_col' is 95.0% or more correlated with the target",
                                               data_check_name="TargetLeakageDataCheck",
                                               message_code=DataCheckMessageCode.TARGET_LEAKAGE,
                                               details={"column": "nan_dt_col"}).to_dict()]

    impute_action = DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, 'impute_strategy': 'mean'}).to_dict()
    nan_dt_action = DataCheckAction(DataCheckActionCode.DROP_COL, metadata={"column": 'nan_dt_col'}).to_dict()
    expected_actions_with_drop_and_impute = expected_actions[:3] + [nan_dt_action, impute_action] + expected_actions[4:]
    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}

    # Skip Invalid Target
    assert data_checks.validate(X, y_no_variance) == {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:7] + [DataCheckError(message="Y has 1 unique value.",
                                                  data_check_name="NoVarianceDataCheck",
                                                  message_code=DataCheckMessageCode.NO_VARIANCE,
                                                  details={"column": "Y"}).to_dict()] + messages[7:],
        "actions": expected_actions[:3] + expected_actions[4:]
    }

    data_checks = DataChecks(DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
                             {"InvalidTargetDataCheck": {"problem_type": "regression",
                                                         "objective": get_default_primary_search_objective("regression")}})
    assert data_checks.validate(X, y) == {"warnings": messages[:3] + id_leakage_warning + nan_dt_leakage_warning,
                                          "errors": messages[3:],
                                          "actions": expected_actions_with_drop_and_impute}
Example #30
def test_compute_final_component_features_single_component(mock_transform, X_y_binary):
    X, y = X_y_binary
    X = pd.DataFrame(X)
    mock_transform.return_value = ww.DataTable(X)
    component_graph = ComponentGraph({'Dummy Component': [DummyTransformer]}).instantiate({})
    component_graph.fit(X, y)

    X_t = component_graph.compute_final_component_features(X)
    assert_frame_equal(X, X_t.to_dataframe())