Example #1
def check_samplers_pandas(name, Sampler):
    pd = pytest.importorskip("pandas")
    # Check that the samplers handle pandas dataframe and pandas series
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0,
    )
    X_pd = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
    y_pd = pd.Series(y, name="class")
    sampler = Sampler()
    if isinstance(sampler, NearMiss):
        samplers = [Sampler(version=version) for version in (1, 2, 3)]
    else:
        samplers = [sampler]

    for sampler in samplers:
        set_random_state(sampler)
        X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y_pd)
        X_res, y_res = sampler.fit_resample(X, y)

        # check that we return a pandas dataframe if a dataframe was given as input
        assert isinstance(X_res_pd, pd.DataFrame)
        assert isinstance(y_res_pd, pd.Series)
        assert X_pd.columns.to_list() == X_res_pd.columns.to_list()
        assert y_pd.name == y_res_pd.name
        assert_allclose(X_res_pd.to_numpy(), X_res)
        assert_allclose(y_res_pd.to_numpy(), y_res)
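A minimal standalone sketch of what this check verifies, assuming a recent imbalanced-learn release (which preserves pandas containers through fit_resample); RandomUnderSampler stands in here for an arbitrary sampler:

import pandas as pd
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler

# Small imbalanced dataset wrapped in pandas containers (illustrative only).
X, y = make_classification(n_samples=100, weights=[0.7, 0.3], random_state=0)
X_pd = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])])
y_pd = pd.Series(y, name="class")

# Pandas in, pandas out: column names and the series name are preserved.
X_res, y_res = RandomUnderSampler(random_state=0).fit_resample(X_pd, y_pd)
assert isinstance(X_res, pd.DataFrame) and isinstance(y_res, pd.Series)
assert list(X_res.columns) == list(X_pd.columns) and y_res.name == "class"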
Example #2
def check_methods_have_no_side_effects(Estimator):
    # Check that calling methods has no side effects on args

    if not isclass(Estimator):
        Estimator = type(Estimator)

    estimator = _construct_instance(Estimator)

    set_random_state(estimator)

    # Fit for the first time
    fit_args = _make_args(estimator=estimator, method="fit")
    old_fit_args = deepcopy(fit_args)
    estimator.fit(*fit_args)

    assert deep_equals(
        old_fit_args, fit_args
    ), f"Estimator: {estimator} has side effects on arguments of fit"

    for method in NON_STATE_CHANGING_METHODS:
        if hasattr(estimator, method):
            new_args = _make_args(estimator=estimator, method=method)
            old_args = deepcopy(new_args)
            getattr(estimator, method)(*new_args)

            assert deep_equals(
                old_args, new_args
            ), f"Estimator: {estimator} has side effects on arguments of {method}"
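The deepcopy-then-compare pattern used above can be illustrated in isolation; the helper below is a hypothetical sketch, not part of any test suite:

from copy import deepcopy

import numpy as np

def call_has_side_effects(func, *args):
    """Return True if calling ``func`` mutates any of its array arguments (sketch)."""
    before = deepcopy(args)
    func(*args)
    return not all(np.array_equal(a, b) for a, b in zip(before, args))

x = np.array([3, 1, 2])
assert call_has_side_effects(np.sort, x) is False         # np.sort returns a copy
assert call_has_side_effects(np.ndarray.sort, x) is True   # ndarray.sort works in place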
Example #3
def test_meta_estimators_delegate_data_validation(estimator):
    # Check that meta-estimators delegate data validation to the inner
    # estimator(s).
    rng = np.random.RandomState(0)
    set_random_state(estimator)

    n_samples = 30
    X = rng.choice(np.array(["aa", "bb", "cc"], dtype=object), size=n_samples)

    if is_regressor(estimator):
        y = rng.normal(size=n_samples)
    else:
        y = rng.randint(3, size=n_samples)

    # We convert to lists to make sure it works on array-like
    X = _enforce_estimator_tags_x(estimator, X).tolist()
    y = _enforce_estimator_tags_y(estimator, y).tolist()

    # Calling fit should not raise any data validation exception since X is a
    # valid input datastructure for the first step of the pipeline passed as
    # base estimator to the meta estimator.
    estimator.fit(X, y)

    # n_features_in_ should not be defined since data is not tabular data.
    assert not hasattr(estimator, "n_features_in_")
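A self-contained sketch of the same idea using only public scikit-learn API (GridSearchCV as the meta-estimator and a text pipeline as the inner estimator; the data is illustrative):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

# Raw strings are not tabular data, yet fit succeeds because validation is
# delegated to the inner pipeline, whose first step vectorises the strings.
X = ["aa", "bb", "cc"] * 10
y = [0, 1, 2] * 10
search = GridSearchCV(
    make_pipeline(CountVectorizer(), LogisticRegression()),
    {"logisticregression__C": [0.1, 1.0]},
    cv=2,
)
search.fit(X, y)
assert not hasattr(search, "n_features_in_")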
Example #4
def check_supervised_y_2d(name, estimator_orig):
    tags = estimator_orig._get_tags()
    X, y = _create_small_ts_dataset()
    if tags['binary_only']:
        X = X[y != 2]
        y = y[y != 2]

    estimator = clone(estimator_orig)
    set_random_state(estimator)
    # fit
    estimator.fit(X, y)
    y_pred = estimator.predict(X)

    set_random_state(estimator)
    # Check that when a 2D y is given, a DataConversionWarning is
    # raised
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always", DataConversionWarning)
        warnings.simplefilter("ignore", RuntimeWarning)
        estimator.fit(X, y[:, np.newaxis])
    y_pred_2d = estimator.predict(X)
    msg = "expected 1 DataConversionWarning, got: %s" % (", ".join(
        [str(w_x) for w_x in w]))

    if not tags['multioutput'] and name not in ['TimeSeriesSVR']:
        # check that we warned if we don't support multi-output
        assert len(w) > 0, msg
        assert "DataConversionWarning('A column-vector y" \
               " was passed when a 1d array was expected" in msg
        assert_allclose(y_pred.ravel(), y_pred_2d.ravel())
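The warning asserted here is standard scikit-learn behaviour; a self-contained sketch with a plain regressor (RandomForestRegressor chosen only for illustration):

import warnings

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import DataConversionWarning

rng = np.random.RandomState(0)
X, y = rng.rand(20, 3), rng.rand(20)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always", DataConversionWarning)
    # Passing y as a column vector triggers the conversion warning.
    RandomForestRegressor(n_estimators=5, random_state=0).fit(X, y[:, np.newaxis])

assert any(issubclass(w.category, DataConversionWarning) for w in caught)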
Example #5
def test_persistence_via_pickle(estimator_instance):
    """Check that we can pickle all estimators."""
    estimator = estimator_instance
    set_random_state(estimator)
    fit_args = _make_args(estimator, "fit")
    estimator.fit(*fit_args)

    # Generate results before pickling
    results = dict()
    args = dict()
    for method in NON_STATE_CHANGING_METHODS:
        if hasattr(estimator, method):
            args[method] = _make_args(estimator, method)
            results[method] = getattr(estimator, method)(*args[method])

    # Pickle and unpickle
    pickled_estimator = pickle.dumps(estimator)
    unpickled_estimator = pickle.loads(pickled_estimator)

    # Compare against results after pickling
    for method in results:
        unpickled_result = getattr(unpickled_estimator, method)(*args[method])
        _assert_array_almost_equal(
            results[method],
            unpickled_result,
            decimal=6,
            err_msg="Results are not the same after pickling",
        )
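The same round trip can be reproduced standalone with any picklable estimator; a minimal sketch using a plain scikit-learn classifier:

import pickle

import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X, y = rng.rand(30, 4), rng.randint(2, size=30)

clf = DecisionTreeClassifier(random_state=0).fit(X, y)
restored = pickle.loads(pickle.dumps(clf))

# Predictions must be identical before and after the pickle round trip.
np.testing.assert_array_equal(clf.predict(X), restored.predict(X))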
Example #6
    def test_fit_idempotent(self, estimator_instance, scenario):
        """Check that calling fit twice is equivalent to calling it once."""
        estimator = estimator_instance

        # todo: may have to rework this, due to "if estimator has param"
        for method in NON_STATE_CHANGING_METHODS:
            # for now, we have to skip predict_proba, since current output comparison
            #   does not work for tensorflow Distribution
            if (isinstance(estimator_instance, BaseForecaster)
                    and method == "predict_proba"):
                continue
            if _has_capability(estimator, method):
                set_random_state(estimator)
                results = scenario.run(
                    estimator,
                    method_sequence=["fit", method],
                    return_all=True,
                    deepcopy_return=True,
                )

                estimator = results[0]
                set_random_state(estimator)

                results_2nd = scenario.run(
                    estimator,
                    method_sequence=["fit", method],
                    return_all=True,
                    deepcopy_return=True,
                )

                _assert_array_almost_equal(
                    results[1],
                    results_2nd[1],
                    # err_msg=f"Idempotency check failed for method {method}",
                )
Example #7
def check_samplers_sparse(name, Sampler):
    # check that sparse matrices can be passed through the sampler leading to
    # the same results as dense matrices
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0,
    )
    X_sparse = sparse.csr_matrix(X)
    if isinstance(Sampler(), NearMiss):
        samplers = [Sampler(version=version) for version in (1, 2, 3)]
    elif isinstance(Sampler(), ClusterCentroids):
        # set KMeans to "full" since it supports both sparse and dense data
        samplers = [
            Sampler(
                random_state=0,
                voting="soft",
                estimator=KMeans(random_state=1, algorithm="full"),
            )
        ]
    else:
        samplers = [Sampler()]

    for sampler in samplers:
        set_random_state(sampler)
        X_res_sparse, y_res_sparse = sampler.fit_resample(X_sparse, y)
        X_res, y_res = sampler.fit_resample(X, y)
        assert sparse.issparse(X_res_sparse)
        assert_allclose(X_res_sparse.A, X_res)
        assert_allclose(y_res_sparse, y_res)
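A standalone sketch of the sparse/dense equivalence for a single sampler (RandomUnderSampler is an assumption here, chosen only for illustration):

import numpy as np
from scipy import sparse
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(n_samples=100, weights=[0.7, 0.3], random_state=0)

X_res, y_res = RandomUnderSampler(random_state=0).fit_resample(X, y)
X_res_sp, y_res_sp = RandomUnderSampler(random_state=0).fit_resample(
    sparse.csr_matrix(X), y
)

# Sparse input gives sparse output with the same values as the dense run.
assert sparse.issparse(X_res_sp)
np.testing.assert_allclose(X_res_sp.toarray(), X_res)
np.testing.assert_array_equal(y_res_sp, y_res)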
Example #8
def check_methods_do_not_change_state(Estimator):
    # Check that methods that are not supposed to change attributes of the
    # estimators do not change anything (including hyper-parameters and
    # fitted parameters)
    estimator = _construct_instance(Estimator)
    set_random_state(estimator)
    fit_args = _make_args(estimator=estimator, method="fit")
    estimator.fit(*fit_args)
    dict_before = estimator.__dict__.copy()

    for method in NON_STATE_CHANGING_METHODS:
        if hasattr(estimator, method):
            args = _make_args(estimator=estimator, method=method)
            getattr(estimator, method)(*args)

            if method == "transform" and Estimator.get_class_tag(
                    "fit-in-transform"):
                # Some transformations fit during transform, as they apply
                # some transformation to each series passed to transform,
                # so transform will actually change the state of these estimators.
                continue

            assert (
                estimator.__dict__ == dict_before
            ), f"Estimator: {estimator} changes __dict__ during {method}"
Example #9
def test_fit_idempotent(estimator_instance, scenario):
    """Check that calling fit twice is equivalent to calling it once."""
    estimator = estimator_instance

    # todo: may have to rework this, due to "if estimator has param"
    for method in NON_STATE_CHANGING_METHODS:
        if _has_capability(estimator, method):
            set_random_state(estimator)
            results = scenario.run(
                estimator,
                method_sequence=["fit", method],
                return_all=True,
                deepcopy_return=True,
            )

            estimator = results[0]
            set_random_state(estimator)

            results_2nd = scenario.run(
                estimator,
                method_sequence=["fit", method],
                return_all=True,
                deepcopy_return=True,
            )

            _assert_array_almost_equal(
                results[1],
                results_2nd[1],
                # err_msg=f"Idempotency check failed for method {method}",
            )
Example #10
def check_samplers_list(name, Sampler):
    # Check that the samplers can handle simple lists
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0,
    )
    X_list = X.tolist()
    y_list = y.tolist()
    sampler = Sampler()
    if isinstance(sampler, NearMiss):
        samplers = [Sampler(version=version) for version in (1, 2, 3)]
    else:
        samplers = [sampler]

    for sampler in samplers:
        set_random_state(sampler)
        X_res, y_res = sampler.fit_resample(X, y)
        X_res_list, y_res_list = sampler.fit_resample(X_list, y_list)

        assert isinstance(X_res_list, list)
        assert isinstance(y_res_list, list)

        assert_allclose(X_res, X_res_list)
        assert_allclose(y_res, y_res_list)
Example #11
def check_pipeline_consistency(name, estimator_orig):
    if estimator_orig._get_tags()['non_deterministic']:
        msg = name + ' is non deterministic'
        raise SkipTest(msg)

    # check that make_pipeline(est) gives same score as est
    X, y = make_blobs(n_samples=30,
                      centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0,
                      n_features=2,
                      cluster_std=0.1)
    X -= X.min()
    X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel)
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)
    set_random_state(estimator)
    pipeline = make_pipeline(estimator)
    estimator.fit(X, y)
    pipeline.fit(X, y)

    funcs = ["score", "fit_transform"]

    for func_name in funcs:
        func = getattr(estimator, func_name, None)
        if func is not None:
            func_pipeline = getattr(pipeline, func_name)
            result = func(X, y)
            result_pipe = func_pipeline(X, y)
            assert_allclose_dense_sparse(result, result_pipe)
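The equivalence being checked can be seen with any plain estimator; a short sketch with SVC (chosen only as an example):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

X, y = make_blobs(n_samples=30, centers=2, random_state=0)

est = SVC(random_state=0).fit(X, y)
pipe = make_pipeline(SVC(random_state=0)).fit(X, y)

# A single-step pipeline must score identically to the bare estimator.
assert np.isclose(est.score(X, y), pipe.score(X, y))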
Example #12
    def test_methods_have_no_side_effects(self, estimator_instance, scenario):
        """Check that calling methods has no side effects on args."""
        estimator = estimator_instance

        set_random_state(estimator)

        # Fit the model, get args before and after
        _, args_after = scenario.run(estimator,
                                     method_sequence=["fit"],
                                     return_args=True)
        fit_args_after = args_after[0]
        fit_args_before = scenario.args["fit"]

        assert deep_equals(
            fit_args_before, fit_args_after
        ), f"Estimator: {estimator} has side effects on arguments of fit"

        for method in NON_STATE_CHANGING_METHODS:
            if _has_capability(estimator, method):
                # Fit the model, get args before and after
                _, args_after = scenario.run(estimator,
                                             method_sequence=[method],
                                             return_args=True)
                method_args_after = args_after[0]
                method_args_before = scenario.get_args(method, estimator)

                assert deep_equals(
                    method_args_after, method_args_before
                ), f"Estimator: {estimator} has side effects on arguments of {method}"
Example #13
    def test_methods_do_not_change_state(self, estimator_instance, scenario):
        """Check that non-state-changing methods do not change state.

        Check that methods that are not supposed to change attributes of the
        estimators do not change anything (including hyper-parameters and
        fitted parameters)
        """
        estimator = estimator_instance
        set_random_state(estimator)

        for method in NON_STATE_CHANGING_METHODS:
            if _has_capability(estimator, method):

                # dict_before = copy of dictionary of estimator before predict, post fit
                _ = scenario.run(estimator, method_sequence=["fit"])
                dict_before = estimator.__dict__.copy()

                # dict_after = dictionary of estimator after predict and fit
                _ = scenario.run(estimator, method_sequence=[method])
                dict_after = estimator.__dict__

                is_equal, msg = deep_equals(dict_after,
                                            dict_before,
                                            return_msg=True)
                assert is_equal, (
                    f"Estimator: {type(estimator).__name__} changes __dict__ "
                    f"during {method}, "
                    f"reason/location of discrepancy (x=after, y=before): {msg}"
                )
Example #14
    def test_multiprocessing_idempotent(self, estimator_instance, scenario):
        """Test that single and multi-process run results are identical.

        Check that running an estimator on a single process is no different to running
        it on multiple processes. We also check that we can set n_jobs=-1 to make use
        of all CPUs. The test is not really necessary though, as we rely on joblib for
        parallelization and can trust that it works as expected.
        """
        params = estimator_instance.get_params()

        if "n_jobs" in params:
            for method in NON_STATE_CHANGING_METHODS:
                if _has_capability(estimator_instance, method):
                    # run on a single process
                    # -----------------------
                    estimator = deepcopy(estimator_instance)
                    estimator.set_params(n_jobs=1)
                    set_random_state(estimator)
                    result_single_process = scenario.run(
                        estimator, method_sequence=["fit", method])

                    # run on multiple processes
                    # -------------------------
                    estimator = deepcopy(estimator_instance)
                    estimator.set_params(n_jobs=-1)
                    set_random_state(estimator)
                    result_multiple_process = scenario.run(
                        estimator, method_sequence=["fit", method])
                    _assert_array_equal(
                        result_single_process,
                        result_multiple_process,
                        err_msg=
                        "Results are not equal for n_jobs=1 and n_jobs=-1",
                    )
Example #15
def test_fit_does_not_overwrite_hyper_params(estimator_instance):
    """Check that we do not overwrite hyper-parameters in fit."""
    estimator = estimator_instance
    set_random_state(estimator)

    # Make a physical copy of the original estimator parameters before fitting.
    params = estimator.get_params()
    original_params = deepcopy(params)

    # Fit the model
    fit_args = _make_args(estimator, "fit")
    estimator.fit(*fit_args)

    # Compare the state of the model parameters with the original parameters
    new_params = estimator.get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]

        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that introspects recursively any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is possible RandomState instance but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        assert joblib.hash(new_value) == joblib.hash(original_value), (
            "Estimator %s should not change or mutate "
            "the parameter %s from %s to %s during fit." %
            (estimator.__class__.__name__, param_name, original_value,
             new_value))
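The joblib.hash comparison can be tried in isolation; a sketch with Ridge (any estimator that does not mutate its constructor parameters behaves the same way):

import joblib
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
X, y = rng.rand(20, 3), rng.rand(20)

est = Ridge(alpha=1.0)
hashes_before = {name: joblib.hash(value) for name, value in est.get_params().items()}
est.fit(X, y)
hashes_after = {name: joblib.hash(value) for name, value in est.get_params().items()}

# Fitting must not mutate any constructor parameter.
assert hashes_before == hashes_after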
Example #16
def test_fit_idempotent(estimator_instance):
    """Check that calling fit twice is equivalent to calling it once."""
    estimator = estimator_instance

    set_random_state(estimator)

    # Fit for the first time
    fit_args = _make_args(estimator, "fit")
    estimator.fit(*fit_args)

    results = dict()
    args = dict()
    for method in NON_STATE_CHANGING_METHODS:
        if hasattr(estimator, method):
            args[method] = _make_args(estimator, method)
            results[method] = getattr(estimator, method)(*args[method])

    # Fit again
    set_random_state(estimator)
    estimator.fit(*fit_args)

    for method in NON_STATE_CHANGING_METHODS:
        if hasattr(estimator, method):
            new_result = getattr(estimator, method)(*args[method])
            _assert_array_almost_equal(
                results[method],
                new_result,
                # err_msg=f"Idempotency check failed for method {method}",
            )
Example #17
def test_set_random_state():
    lda = LinearDiscriminantAnalysis()
    tree = DecisionTreeClassifier()
    # Linear Discriminant Analysis doesn't have random state: smoke test
    set_random_state(lda, 3)
    set_random_state(tree, 3)
    assert tree.random_state == 3
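For reference, the helper essentially sets the random_state parameter when the estimator exposes one and is a no-op otherwise; a simplified sketch (not the exact scikit-learn implementation):

from sklearn.tree import DecisionTreeClassifier

def set_random_state_sketch(estimator, random_state=0):
    """Set ``random_state`` on the estimator if it has such a parameter (sketch)."""
    if "random_state" in estimator.get_params():
        estimator.set_params(random_state=random_state)
    return estimator

tree = set_random_state_sketch(DecisionTreeClassifier(), 3)
assert tree.random_state == 3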
Example #18
def test_methods_do_not_change_state(estimator_instance):
    """Check that non-state-changing methods do not change state.

    Check that methods that are not supposed to change attributes of the
    estimators do not change anything (including hyper-parameters and
    fitted parameters)
    """
    estimator = estimator_instance
    set_random_state(estimator)

    fit_args = _make_args(estimator, "fit")
    estimator.fit(*fit_args)
    dict_before = estimator.__dict__.copy()

    for method in NON_STATE_CHANGING_METHODS:
        if hasattr(estimator, method):
            args = _make_args(estimator, method)
            getattr(estimator, method)(*args)

            if method == "transform" and estimator.get_class_tag("fit-in-transform"):
                # Some transformations fit during transform, as they apply
                # some transformation to each series passed to transform,
                # so transform will actually change the state of these estimators.
                continue

            if method == "predict" and estimator.get_class_tag("fit-in-predict"):
                # Some annotators fit during predict, as they apply
                # some annotation to each series passed to predict,
                # so predict will actually change the state of these annotators.
                continue

            assert (
                estimator.__dict__ == dict_before
            ), f"Estimator: {estimator} changes __dict__ during {method}"
Example #19
def check_non_transf_est_n_iter(name, estimator_orig):
    # Test that estimators that are not transformers and have a max_iter
    # parameter set the n_iter_ attribute to at least 1.
    estimator = clone(estimator_orig)
    if hasattr(estimator, 'max_iter'):
        X, y = _create_small_ts_dataset()
        set_random_state(estimator, 0)
        estimator.fit(X, y)
        assert estimator.n_iter_ >= 1
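A standalone illustration of the attribute being checked, using LogisticRegression as an arbitrary iterative, non-transformer estimator with a max_iter parameter:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=50, random_state=0)
clf = LogisticRegression(max_iter=200, random_state=0).fit(X, y)

# Iterative estimators expose n_iter_ after fitting; it is at least 1.
assert np.min(clf.n_iter_) >= 1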
Example #20
def _generate_search_cv_instances():
    for SearchCV, (Estimator, param_grid) in product(
        [
            GridSearchCV,
            HalvingGridSearchCV,
            RandomizedSearchCV,
            HalvingRandomSearchCV,
        ],
        [
            (Ridge, {
                "alpha": [0.1, 1.0]
            }),
            (LogisticRegression, {
                "C": [0.1, 1.0]
            }),
        ],
    ):
        init_params = signature(SearchCV).parameters
        extra_params = ({
            "min_resources": "smallest"
        } if "min_resources" in init_params else {})
        search_cv = SearchCV(Estimator(), param_grid, cv=2, **extra_params)
        set_random_state(search_cv)
        yield search_cv

    for SearchCV, (Estimator, param_grid) in product(
        [
            GridSearchCV,
            HalvingGridSearchCV,
            RandomizedSearchCV,
            HalvingRandomSearchCV,
        ],
        [
            (Ridge, {
                "ridge__alpha": [0.1, 1.0]
            }),
            (LogisticRegression, {
                "logisticregression__C": [0.1, 1.0]
            }),
        ],
    ):
        init_params = signature(SearchCV).parameters
        extra_params = ({
            "min_resources": "smallest"
        } if "min_resources" in init_params else {})
        search_cv = SearchCV(make_pipeline(PCA(), Estimator()),
                             param_grid,
                             cv=2,
                             **extra_params).set_params(error_score="raise")
        set_random_state(search_cv)
        yield search_cv
Example #21
def _tested_estimators():
    for name, Estimator in all_estimators():
        try:
            estimator = _construct_instance(Estimator)
            set_random_state(estimator)
        except SkipTest:
            continue

        if isinstance(estimator, NearMiss):
            # For NearMiss, let's check the three algorithms
            for version in (1, 2, 3):
                yield clone(estimator).set_params(version=version)
        else:
            yield estimator
Example #22
def check_target_type(name, Estimator):
    # a continuous target should raise a ValueError
    X = np.random.random((20, 2))
    y = np.linspace(0, 1, 20)
    estimator = Estimator()
    set_random_state(estimator)
    with pytest.raises(ValueError, match="Unknown label type: 'continuous'"):
        estimator.fit_resample(X, y)
    # if the target is multilabel then we should raise an error
    rng = np.random.RandomState(42)
    y = rng.randint(2, size=(20, 3))
    msg = "Multilabel and multioutput targets are not supported."
    with pytest.raises(ValueError, match=msg):
        estimator.fit_resample(X, y)
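The same error can be reproduced standalone with a concrete sampler (RandomOverSampler is assumed here; the exact message wording varies across scikit-learn versions, so only the stable prefix is matched):

import numpy as np
import pytest
from imblearn.over_sampling import RandomOverSampler

X = np.random.random((20, 2))
y_continuous = np.linspace(0, 1, 20)

with pytest.raises(ValueError, match="Unknown label type"):
    RandomOverSampler().fit_resample(X, y_continuous)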
Example #23
def check_samplers_preserve_dtype(name, Sampler):
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0,
    )
    # Cast X and y to a non-default dtype
    X = X.astype(np.float32)
    y = y.astype(np.int32)
    sampler = Sampler()
    set_random_state(sampler)
    X_res, y_res = sampler.fit_resample(X, y)
    assert X.dtype == X_res.dtype, "X dtype is not preserved"
    assert y.dtype == y_res.dtype, "y dtype is not preserved"
Example #24
def check_clustering(name, clusterer_orig, readonly_memmap=False):

    clusterer = clone(clusterer_orig)
    X, y = _create_small_ts_dataset()
    X, y = shuffle(X, y, random_state=7)
    X = TimeSeriesScalerMeanVariance().fit_transform(X)
    rng = np.random.RandomState(42)
    X_noise = X + (rng.randn(*X.shape) / 5)

    n_samples, n_features, dim = X.shape
    # catch deprecation and neighbors warnings
    if hasattr(clusterer, "n_clusters"):
        clusterer.set_params(n_clusters=3)
    set_random_state(clusterer)

    # fit
    clusterer.fit(X)
    # with lists
    clusterer.fit(X.tolist())

    pred = clusterer.labels_
    assert_equal(pred.shape, (n_samples, ))
    assert_greater(adjusted_rand_score(pred, y), 0.4)

    if clusterer._get_tags()['non_deterministic']:
        return

    set_random_state(clusterer)
    with warnings.catch_warnings(record=True):
        pred2 = clusterer.fit_predict(X)
    assert_array_equal(pred, pred2)

    # fit_predict(X) and labels_ should be of type int
    assert pred.dtype in [np.dtype('int32'), np.dtype('int64')]
    assert pred2.dtype in [np.dtype('int32'), np.dtype('int64')]

    # Add noise to X to test the possible values of the labels
    labels = clusterer.fit_predict(X_noise)

    # There should be at least one sample in every original cluster
    labels_sorted = np.unique(labels)
    assert_array_equal(labels_sorted, np.arange(0, 3))

    # Labels should be at most n_clusters - 1
    if hasattr(clusterer, 'n_clusters'):
        n_clusters = getattr(clusterer, 'n_clusters')
        assert_greater_equal(n_clusters - 1, labels_sorted[-1])
Example #25
def check_samplers_multiclass_ova(name, Sampler):
    # Check that a multiclass target leads to the same results as OVA encoding
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0,
    )
    y_ova = label_binarize(y, classes=np.unique(y))
    sampler = Sampler()
    set_random_state(sampler)
    X_res, y_res = sampler.fit_resample(X, y)
    X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova)
    assert_allclose(X_res, X_res_ova)
    assert type_of_target(y_res_ova) == type_of_target(y_ova)
    assert_allclose(y_res, y_res_ova.argmax(axis=1))
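label_binarize is the standard scikit-learn helper that produces the one-vs-all (OVA) indicator matrix compared against above; a quick illustration:

import numpy as np
from sklearn.preprocessing import label_binarize

y = np.array([0, 1, 2, 1, 0])
y_ova = label_binarize(y, classes=[0, 1, 2])
# y_ova is the indicator matrix:
# [[1 0 0]
#  [0 1 0]
#  [0 0 1]
#  [0 1 0]
#  [1 0 0]]

# argmax over the indicator columns recovers the original labels.
assert np.array_equal(y_ova.argmax(axis=1), y)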
Example #26
def test_multiprocessing_idempotent(estimator_class):
    """Test that single and multi-process run results are identical.

    Check that running an estimator on a single process is no different to running
    it on multiple processes. We also check that we can set n_jobs=-1 to make use
    of all CPUs. The test is not really necessary though, as we rely on joblib for
    parallelization and can trust that it works as expected.
    """
    estimator = estimator_class.create_test_instance()
    params = estimator.get_params()

    if "n_jobs" in params:
        results = dict()
        args = dict()

        # run on a single process
        estimator = estimator_class.create_test_instance()
        estimator.set_params(n_jobs=1)
        set_random_state(estimator)
        args["fit"] = _make_args(estimator, "fit")
        estimator.fit(*args["fit"])

        # compute and store results
        for method in NON_STATE_CHANGING_METHODS:
            if hasattr(estimator, method):
                args[method] = _make_args(estimator, method)
                results[method] = getattr(estimator, method)(*args[method])

        # run on multiple processes, reusing the same input arguments
        estimator = estimator_class.create_test_instance()
        estimator.set_params(n_jobs=-1)
        set_random_state(estimator)
        estimator.fit(*args["fit"])

        # compute and compare results
        for method in results:
            if hasattr(estimator, method):
                result = getattr(estimator, method)(*args[method])
                _assert_array_equal(
                    results[method],
                    result,
                    err_msg="Results are not equal for n_jobs=1 and n_jobs=-1",
                )
Example #27
def test_methods_do_not_change_state(estimator_instance, scenario):
    """Check that non-state-changing methods do not change state.

    Check that methods that are not supposed to change attributes of the
    estimators do not change anything (including hyper-parameters and
    fitted parameters)
    """
    estimator = estimator_instance
    set_random_state(estimator)

    for method in NON_STATE_CHANGING_METHODS:
        if _has_capability(estimator, method):

            # dict_before = copy of dictionary of estimator before predict, after fit
            _ = scenario.run(estimator, method_sequence=["fit"])
            dict_before = estimator.__dict__.copy()

            # dict_after = dictionary of estimator after predict and fit
            _ = scenario.run(estimator, method_sequence=[method])
            dict_after = estimator.__dict__

            if method == "transform" and estimator.get_class_tag("fit-in-transform"):
                # Some transformations fit during transform, as they apply
                # some transformation to each series passed to transform,
                # so transform will actually change the state of these estimators.
                continue

            if method == "predict" and estimator.get_class_tag("fit-in-predict"):
                # Some annotators fit during predict, as they apply
                # some annotation to each series passed to predict,
                # so predict will actually change the state of these annotators.
                continue

            # old logic uses equality without auto-msg, keep comment until refactor
            # is_equal = dict_after == dict_before
            is_equal, msg = deep_equals(dict_after, dict_before, return_msg=True)
            assert is_equal, (
                f"Estimator: {type(estimator).__name__} changes __dict__ "
                f"during {method}, "
                f"reason/location of discrepancy (x=after, y=before): {msg}"
            )
Example #28
def check_samplers_pandas(name, Sampler):
    pd = pytest.importorskip("pandas")
    # Check that the samplers handle pandas dataframe and pandas series
    X, y = make_classification(
        n_samples=1000,
        n_classes=3,
        n_informative=4,
        weights=[0.2, 0.3, 0.5],
        random_state=0,
    )
    X_pd = pd.DataFrame(X)
    sampler = Sampler()
    if isinstance(sampler, NearMiss):
        samplers = [Sampler(version=version) for version in (1, 2, 3)]
    else:
        samplers = [sampler]

    for sampler in samplers:
        set_random_state(sampler)
        X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y)
        X_res, y_res = sampler.fit_resample(X, y)
        assert_allclose(X_res_pd, X_res)
        assert_allclose(y_res_pd, y_res)
Example #29
    def test_persistence_via_pickle(self, estimator_instance, scenario):
        """Check that we can pickle all estimators."""
        estimator = estimator_instance
        set_random_state(estimator)
        # Fit the model, get args before and after
        scenario.run(estimator, method_sequence=["fit"], return_args=True)

        # Generate results before pickling
        results = {}
        for method in NON_STATE_CHANGING_METHODS:
            if _has_capability(estimator, method):
                results[method] = scenario.run(estimator,
                                               method_sequence=[method])

        # Pickle and unpickle
        pickled_estimator = pickle.dumps(estimator)
        unpickled_estimator = pickle.loads(pickled_estimator)

        # Compare against results after pickling
        for method, vanilla_result in results.items():
            # skip predict_proba for forecasters, tfp distributions cannot be pickled
            if (isinstance(estimator_instance, BaseForecaster)
                    and method == "predict_proba"):
                continue
            unpickled_result = scenario.run(unpickled_estimator,
                                            method_sequence=[method])

            msg = (
                f"Results of {method} differ between when pickling and not pickling, "
                f"estimator {type(estimator_instance).__name__}")
            _assert_array_almost_equal(
                vanilla_result,
                unpickled_result,
                decimal=6,
                err_msg=msg,
            )
Example #30
def test_methods_have_no_side_effects(estimator_instance):
    """Check that calling methods has no side effects on args."""
    estimator = estimator_instance

    set_random_state(estimator)

    # Fit for the first time
    fit_args = _make_args(estimator, "fit")
    old_fit_args = deepcopy(fit_args)
    estimator.fit(*fit_args)

    assert deep_equals(
        old_fit_args, fit_args
    ), f"Estimator: {estimator} has side effects on arguments of fit"

    for method in NON_STATE_CHANGING_METHODS:
        if hasattr(estimator, method):
            new_args = _make_args(estimator, method)
            old_args = deepcopy(new_args)
            getattr(estimator, method)(*new_args)

            assert deep_equals(
                old_args, new_args
            ), f"Estimator: {estimator} has side effects on arguments of {method}"