    def test_sparse_df_int(self):
        Xsparse_int = convert_generic(
            X, output_type=DataTypes.SparseArray).astype(np.int32)
        Xsparse_df_int = convert_generic(Xsparse_int,
                                         output_type=DataTypes.SparseDataFrame)
        lgbm = LGBMClassifier()
        lgbm.fit(Xsparse_df_int, y)
def verif_TruncatedSVDWrapperSparseData(use_wrapper):
    if use_wrapper:
        klass = TruncatedSVDWrapper
    else:
        klass = TruncatedSVD

    np.random.seed(123)
    X1 = np.random.randn(50, 10)
    df1 = convert_generic(X1, output_type=DataTypes.SparseDataFrame)

    # ok : array
    svd = klass(n_components=2)
    svd.fit(X1)

    # ok : dataframe with sparse float
    svd = klass(n_components=2)
    svd.fit(df1)

    # ok : dataframe with sparse int
    X2 = np.random.randint(0, 10, (50, 10))
    df2 = convert_generic(X2, output_type=DataTypes.SparseDataFrame)
    svd = klass(n_components=2)
    svd.fit(df2)

    # fails : mix sparse int and sparse float
    df = pd.concat((df1, df2), axis=1)
    df.columns = list(range(df.shape[1]))
    svd = klass(n_components=2)
    svd.fit(df)
def check_all_numerical(df_transformed, df=None):
    mat = convert_generic(df_transformed, output_type=DataTypes.NumpyArray)
    has_error = False
    try:
        mat.astype(np.float64)
    except ValueError:
        has_error = True

    assert not has_error  # so that an AssertionError is raised explicitly
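
# A hedged usage sketch of check_all_numerical (the small frames below are
# illustrative, not from the test suite; assumes pandas as pd is imported):
# the helper passes silently on numeric data and raises an AssertionError
# when a column cannot be cast to float.
def _example_check_all_numerical():
    df_num = pd.DataFrame({"a": [1.0, 2.0], "b": [3, 4]})
    check_all_numerical(df_num)  # passes: every column casts to float64

    df_mixed = pd.DataFrame({"a": [1.0, 2.0], "b": ["x", "y"]})
    try:
        check_all_numerical(df_mixed)
    except AssertionError:
        pass  # expected: the string column cannot be cast to float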
def test_approx_cross_validation_transformer(x_data_type, shuffle, graph_pipeline, with_groups):

    if graph_pipeline:
        estimator = GraphPipeline({"ptA": DebugPassThrough(), "ptB": DebugPassThrough()}, edges=[("ptA", "ptB")])
    else:
        estimator = DebugPassThrough()

    X, y = make_classification(n_samples=100, random_state=123)
    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        np.random.seed(123)
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        y = y[ii]

        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    scoring = ["accuracy", "neg_log_loss"]

    ##################
    ### Score only ###
    ##################
    with pytest.raises(Exception):
        cross_validation(estimator, X, y, groups, cv=10, scoring=scoring, verbose=0)
        # shouldn't work since DebugPassThrough can't be scored

    #################
    ### Transform ###
    #################
    cv_res, Xhat = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True, no_scoring=True
    )

    assert type(Xhat) == type(X)
    assert cv_res is None
    assert Xhat.shape == X.shape

    if isinstance(X, pd.DataFrame):
        assert (Xhat.index == X.index).all()
        assert (Xhat.columns == X.columns).all()

    if isinstance(X, pd.DataFrame):
        assert np.abs(Xhat - X).max().max() <= 10 ** (-10)
    else:
        assert np.max(np.abs(Xhat - X)) <= 10 ** (-10)
def test_convert_to_array_when_sparse():

    array = np.zeros((10, 3), dtype=np.float32)
    df1 = pd.DataFrame(array, columns=["A", "B", "C"])
    df1_sparse = convert_generic(df1, output_type=DataTypes.SparseDataFrame)
    df1_array = convert_generic(df1_sparse, output_type=DataTypes.NumpyArray)
    assert (df1_array == array).all()
    assert df1_array.dtype == array.dtype

    df1 = pd.DataFrame({
        "int_col": np.zeros(10, dtype=np.int64),
        "float_col": np.zeros(10, dtype=np.float64)
    })

    df1_sparse = convert_generic(df1, output_type=DataTypes.SparseDataFrame)
    df1_array = convert_generic(df1_sparse, output_type=DataTypes.NumpyArray)

    assert (df1_array == 0).all()
    assert df1_array.dtype == np.float64
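
# For reference, a hedged pure-pandas sketch of the dense <-> sparse round trip
# that convert_generic wraps above (pandas >= 1.0 sparse accessor; the frame
# below is illustrative):
def _example_pandas_sparse_roundtrip():
    dense = pd.DataFrame(np.zeros((10, 3), dtype=np.float32), columns=["A", "B", "C"])
    sparse = dense.astype(pd.SparseDtype(np.float32, 0.0))  # sparse columns with fill value 0
    back = sparse.sparse.to_dense().to_numpy()
    assert (back == dense.to_numpy()).all()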
def _convert_sparse(x, sparse):
    if isinstance(x, pd.Series):
        if _IS_PD1 and sparse:
            return convert_to_sparseserie(x)
        else:
            return x  # unchanged: don't test sparse Series in this case
    elif isinstance(x, pd.DataFrame):
        return convert_generic(x, output_type=DataTypes.SparseDataFrame)

    else:
        raise TypeError("This function is for DataFrame or Series")
    def _prepare_target(self, y, klass, conversion_type):
        """ prepare the target so that it can be given to the underlying model to use
        
        Parameters
        ----------
        
        y : array
            the original target 
            
        klass : type
            the encoder to use for the target
            
        conversion_type : DataType
            the output type desired by the target
            
        Set
        ---
        self._mono_target : bool 
            does the original problem as one target or not
        self._target_encoded : the encoder used on the target
        
        Returns
        --------
        y_encoded : array
            the modified target
        """
        self._mono_target = y.ndim == 1
        self._target_dtype = y.dtype

        if isinstance(self.classes, str) and self.classes == "auto":
            categories = "auto"
        else:
            if self._mono_target:
                categories = [self.classes]  # OrdinalEncoder expects a list
            else:
                if not isinstance(self.classes, list):
                    raise TypeError(
                        "For multi-target problems, classes should be a list; "
                        "instead I got %s" % str(type(self.classes)))

                categories = self.classes

        self._target_encoder = klass(categories=categories, dtype=np.int32)

        yd2 = convert_generic(make2dimensions(y), output_type=conversion_type)

        if conversion_type == DataTypes.NumpyArray and yd2.dtype.kind == 'U':
            yd2 = yd2.astype(object, copy=False)  # np.object alias removed in NumPy >= 1.24

        y_encoded = self._target_encoder.fit_transform(yd2)

        return y_encoded
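
# Hedged illustration of the target-encoding step above, using sklearn's
# OrdinalEncoder directly (the real code receives a klass such as the library's
# internal _OrdinalOneHotEncoder; this standalone sketch only shows the idea):
def _example_prepare_target_idea():
    from sklearn.preprocessing import OrdinalEncoder
    y = np.array(["b", "a", "b", "c"])
    enc = OrdinalEncoder(dtype=np.int32)
    y_encoded = enc.fit_transform(y.reshape(-1, 1))  # 2-D, as make2dimensions would produce
    assert y_encoded.ravel().tolist() == [1, 0, 1, 2]
    assert list(enc.categories_[0]) == ["a", "b", "c"]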
    def fit(self, X, y):

        if not is_classifier(self.classifier):
            raise TypeError("classifier should be a sklearn classifier")

        y_int = self._prepare_target(y,
                                     klass=_OrdinalOneHotEncoder,
                                     conversion_type=DataTypes.DataFrame)
        y_int = convert_generic(y_int, output_type=DataTypes.NumpyArray)

        self.classifier.fit(X, y_int)

        return self
    def transform(self, X):

        if not self._already_fitted:
            raise NotFittedError(
                "This %s instance is not fitted yet. Call 'fit' with "
                "appropriate arguments before using this method." %
                type(self).__name__)

        if self._is_classifier:
            predictions = self.model.predict_proba(X)
        else:
            predictions = self.model.predict(X)

        res = self._format_predictions(predictions,
                                       is_classifier=self._is_classifier,
                                       target_info=self._target_info)

        res = convert_generic(res, output_type=self.desired_output_type)

        if hasattr(res, "columns"):
            res.columns = self.get_feature_names()

        return res
    def test_sparse_df(self):
        Xsparse_df = convert_generic(X, output_type=DataTypes.SparseDataFrame)
        lgbm = LGBMClassifier()
        lgbm.fit(Xsparse_df, y)

    def test_sparse(self):
        Xsparse = convert_generic(X, output_type=DataTypes.SparseArray)
        lgbm = LGBMClassifier()
        lgbm.fit(Xsparse, y)
def verif_model(df1, df2, y1, klass, model_kwargs, all_types, is_classifier):
    """ helper function that checks (using asserts) a bunch of things on a model class

    Parameters
    ----------
    df1 : array like
        data on which the model will be trained

    df2 : array like
        data on which the model will be tested

    y1 : array like
        target on which the model will be trained

    klass : type
        type of the model to test

    model_kwargs : dict
        kwargs to be passed to klass to create a model

    all_types : list of type
        list of input types to test the model on

    is_classifier : boolean
        if True the model is a classifier, otherwise a regressor
    """

    if not isinstance(all_types, (list, tuple)):
        all_types = (all_types, )

    model0 = klass(**model_kwargs)  # Create an object ...
    model1 = clone(model0)  # then try to clone it

    model2 = klass()  # Create an empty object and then set its params
    model2.set_params(**model_kwargs)

    # Verify types are identical
    assert type(model0) == type(model1)
    assert type(model0) == type(model2)

    assert hasattr(klass, "fit")
    assert hasattr(klass, "predict")
    if is_classifier:
        assert hasattr(klass, "predict_proba")

    # Verify get_params are identical
    params_0 = model0.get_params()
    params_1 = model1.get_params()
    params_2 = model2.get_params()

    rec_assert_equal(params_0, params_1)
    rec_assert_equal(params_0, params_2)

    rec_assert_equal({k: v for k, v in params_0.items() if k in model_kwargs},
                     model_kwargs)
    rec_assert_equal({k: v for k, v in params_1.items() if k in model_kwargs},
                     model_kwargs)
    rec_assert_equal({k: v for k, v in params_2.items() if k in model_kwargs},
                     model_kwargs)

    extended_all_types = extend_all_type(all_types)

    if is_classifier:
        yclasses = list(set(np.unique(y1)))
        nb_classes = len(yclasses)

    for fit_type, additional_conversion_fun in extended_all_types:

        # Convert inputs into the requested type
        df1_conv = convert_generic(df1, output_type=fit_type)
        df2_conv = convert_generic(df2, output_type=fit_type)

        if additional_conversion_fun is not None:
            df1_conv = additional_conversion_fun(df1_conv)
            df2_conv = additional_conversion_fun(df2_conv)

        model_a = klass(**model_kwargs)
        model_a.fit(df1_conv, y=y1)

        y1_hat_a = model_a.predict(df1_conv)
        y2_hat_a = model_a.predict(df2_conv)

        if is_classifier:
            y1_hatproba_a = model_a.predict_proba(df1_conv)
            y2_hatproba_a = model_a.predict_proba(df2_conv)

        # Verify that get_params didn't change after fit
        # Rmk: might not be enforced on all transformers
        params_3 = model_a.get_params()

        rec_assert_equal(params_0, params_3)

        assert y1_hat_a is not None  # verify that something was created
        assert y2_hat_a is not None  # verify that something was created

        model_cloned = clone(model_a)  # clone again ...
        # ... and verify that the clone isn't fitted, even though model_a is fitted
        assert_raise_not_fitted(model_cloned, df2_conv)

        # Same thing but using clone
        model_b = clone(model_a)
        model_b.fit(df1_conv, y=y1)

        y1_hat_b = model_b.predict(df1_conv)
        y2_hat_b = model_b.predict(df2_conv)
        if is_classifier:
            y1_hatproba_b = model_b.predict_proba(df1_conv)
            y2_hatproba_b = model_b.predict_proba(df2_conv)

        # Same thing but with set_params
        model_c = klass()
        model_c.set_params(**model_kwargs)
        model_c.fit(df1_conv, y=y1)

        y1_hat_c = model_c.predict(df1_conv)
        y2_hat_c = model_c.predict(df2_conv)

        if is_classifier:
            y1_hatproba_c = model_c.predict_proba(df1_conv)
            y2_hatproba_c = model_c.predict_proba(df2_conv)

        # check error when call with too few columns
        assert_raise_value_error(model_a, gen_slice(df1_conv, slice(1, None)))
        assert_raise_value_error(model_b, gen_slice(df1_conv, slice(1, None)))
        assert_raise_value_error(model_c, gen_slice(df1_conv, slice(1, None)))

        assert y1_hat_a.shape[0] == df1_conv.shape[0]
        assert y1_hat_b.shape[0] == df1_conv.shape[0]
        assert y1_hat_c.shape[0] == df1_conv.shape[0]

        assert y2_hat_a.shape[0] == df2_conv.shape[0]
        assert y2_hat_b.shape[0] == df2_conv.shape[0]
        assert y2_hat_c.shape[0] == df2_conv.shape[0]

        assert y1_hat_a.ndim == y1.ndim
        assert y1_hat_b.ndim == y1.ndim
        assert y1_hat_c.ndim == y1.ndim

        assert y2_hat_a.ndim == y1.ndim
        assert y2_hat_b.ndim == y1.ndim
        assert y2_hat_c.ndim == y1.ndim

        if is_classifier:
            assert y1_hatproba_a.ndim == 2
            assert y1_hatproba_b.ndim == 2
            assert y1_hatproba_c.ndim == 2
            assert y2_hatproba_a.ndim == 2
            assert y2_hatproba_b.ndim == 2
            assert y2_hatproba_c.ndim == 2

            assert y1_hatproba_a.shape[1] == nb_classes
            assert y1_hatproba_b.shape[1] == nb_classes
            assert y1_hatproba_c.shape[1] == nb_classes

            assert y2_hatproba_a.shape[1] == nb_classes
            assert y2_hatproba_b.shape[1] == nb_classes
            assert y2_hatproba_c.shape[1] == nb_classes

            assert hasattr(model_a, "classes_")
            assert hasattr(model_b, "classes_")
            assert hasattr(model_c, "classes_")

            assert list(set(model_a.classes_)) == list(set(yclasses))
            assert list(set(model_b.classes_)) == list(set(yclasses))
            assert list(set(model_c.classes_)) == list(set(yclasses))

            for f in (check_all_numerical, check_between_01, check_no_null):

                f(y1_hatproba_a)
                f(y1_hatproba_b)
                f(y1_hatproba_c)

                f(y2_hatproba_a)
                f(y2_hatproba_b)
                f(y2_hatproba_c)

        # Verif type
        assert get_type(y1_hat_b) == get_type(y1_hat_a)
        assert get_type(y1_hat_c) == get_type(y1_hat_a)
        assert get_type(y2_hat_a) == get_type(y1_hat_a)
        assert get_type(y2_hat_b) == get_type(y1_hat_a)
        assert get_type(y2_hat_c) == get_type(y1_hat_a)
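
# Hedged usage sketch for verif_model: the data and kwargs below are
# illustrative, and RandomForestClassifier is just a convenient sklearn
# classifier satisfying the fit/predict/predict_proba contract checked above.
def _example_verif_model():
    from sklearn.ensemble import RandomForestClassifier
    np.random.seed(123)
    df1 = pd.DataFrame(np.random.randn(50, 5), columns=["c%d" % i for i in range(5)])
    df2 = pd.DataFrame(np.random.randn(20, 5), columns=["c%d" % i for i in range(5)])
    y1 = np.random.randint(0, 2, 50)
    verif_model(df1, df2, y1,
                klass=RandomForestClassifier,
                model_kwargs={"n_estimators": 10, "random_state": 123},
                all_types=DataTypes.DataFrame,
                is_classifier=True)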
def check_between_01(df_transformed, df=None):
    xx = convert_generic(df_transformed, output_type=DataTypes.NumpyArray)
    assert xx.min() >= 0
    assert xx.max() <= 1
def verif_model(df1, df2, y1, klass, enc_kwargs, all_types):
    """ function to test differents things on a model """

    is_multiple_output = y1.ndim > 1 and y1.shape[1] >= 1
    nb_outputs = 1
    if is_multiple_output:
        nb_outputs = y1.shape[1]

    if not isinstance(all_types, (list, tuple)):
        all_types = (all_types, )

    # all_types = (DataTypes.DataFrame, DataTypes.SparseDataFrame)
    assert hasattr(klass, "fit")
    assert hasattr(klass, "predict")

    encoder0 = klass(**enc_kwargs)  # Create an object ...
    encoder1 = clone(encoder0)  # then try to clone it

    encoder2 = klass()  # Create an empty object and then set its params
    encoder2.set_params(**enc_kwargs)

    if is_classifier(encoder0):
        model_is_classifier = True
        assert hasattr(klass, "predict_proba")
    else:
        model_is_classifier = False
        assert is_regressor(encoder0)

    assert_raise_not_fitted(encoder0, df1)
    assert_raise_not_fitted(encoder1, df1)
    assert_raise_not_fitted(encoder2, df1)

    # Verify types are identical
    assert type(encoder0) == type(encoder1)
    assert type(encoder0) == type(encoder2)

    # Verify get_params are identical
    params_0 = encoder0.get_params()
    params_1 = encoder1.get_params()
    params_2 = encoder2.get_params()

    rec_assert_equal(params_0, params_1)
    rec_assert_equal(params_0, params_2)

    rec_assert_equal({k: v for k, v in params_0.items() if k in enc_kwargs},
                     enc_kwargs)
    rec_assert_equal({k: v for k, v in params_1.items() if k in enc_kwargs},
                     enc_kwargs)
    rec_assert_equal({k: v for k, v in params_2.items() if k in enc_kwargs},
                     enc_kwargs)

    extended_all_types = extend_all_type(all_types)

    def get_values(y):
        if hasattr(y, "values"):
            return y.values
        else:
            return y

    y1_np = get_values(y1)

    for fit_type, additional_conversion_fun in extended_all_types:

        # Convert inputs into the requested type
        df1_conv = convert_generic(df1, output_type=fit_type)
        df2_conv = convert_generic(df2, output_type=fit_type)

        if additional_conversion_fun is not None:
            df1_conv = additional_conversion_fun(df1_conv)
            df2_conv = additional_conversion_fun(df2_conv)

        encoder_a = klass(**enc_kwargs)
        # test with a y (which might be None or not)
        y1_hat_a = encoder_a.fit(df1_conv, y=y1).predict(df1_conv)
        y2_hat_a = encoder_a.predict(df2_conv)

        if model_is_classifier:
            y1_hat_proba_a = encoder_a.predict_proba(df1_conv)
            y2_hat_proba_a = encoder_a.predict_proba(df2_conv)
            if is_multiple_output:
                assert isinstance(y1_hat_proba_a, list)
                assert isinstance(y2_hat_proba_a, list)
                assert len(y1_hat_proba_a) == nb_outputs
                assert len(y2_hat_proba_a) == nb_outputs

                for j in range(nb_outputs):
                    # correct shape
                    assert y1_hat_proba_a[j].shape == (
                        y1.shape[0], len(np.unique(y1_np[:, j])))
                    assert y2_hat_proba_a[j].shape[0] == df2_conv.shape[0]
                    assert y2_hat_proba_a[j].shape[1] == y1_hat_proba_a[j].shape[1]

                    # between 0 and 1
                    assert y1_hat_proba_a[j].min() >= 0
                    assert y1_hat_proba_a[j].max() <= 1

                    assert y2_hat_proba_a[j].min() >= 0
                    assert y2_hat_proba_a[j].max() <= 1

                    # sum = 1
                    assert np.abs(y1_hat_proba_a[j].sum(axis=1) - 1).max() <= 10 ** (-5)
                    assert np.abs(y2_hat_proba_a[j].sum(axis=1) - 1).max() <= 10 ** (-5)

            else:
                # correct shape
                assert y1_hat_proba_a.shape == (y1.shape[0], len(np.unique(y1_np)))
                assert y2_hat_proba_a.shape == (df2_conv.shape[0], len(np.unique(y1_np)))

                # between 0 and 1
                assert y1_hat_proba_a.min() >= 0
                assert y1_hat_proba_a.max() <= 1

                assert y2_hat_proba_a.min() >= 0
                assert y2_hat_proba_a.max() <= 1

                # sum = 1
                assert np.abs(y1_hat_proba_a.sum(axis=1) - 1).max() <= 10**(-5)
                assert np.abs(y2_hat_proba_a.sum(axis=1) - 1).max() <= 10**(-5)

        assert y1_hat_a is not None  # verify that something was created
        assert y2_hat_a is not None  # verify that something was created

        assert y1_hat_a.shape == y1.shape
        assert y2_hat_a.shape[0] == df2_conv.shape[0]
        assert y2_hat_a.shape[1:] == y1.shape[1:]

        if model_is_classifier:
            assert hasattr(encoder_a, "classes_")
            if is_multiple_output:
                assert len(encoder_a.classes_) == nb_outputs
                for j in range(nb_outputs):
                    assert list(encoder_a.classes_[j]) == list(np.unique(y1_np[:, j]))
            else:
                assert list(encoder_a.classes_) == list(np.unique(y1_np))

        # Verify that get_params didn't change after fit
        # Rmk : might not be enforced on all transformers
        params_3 = encoder_a.get_params()
        rec_assert_equal(params_0, params_3)

        encoder_cloned = clone(encoder_a)  # clone again ...
        # ... and verify that the clone isn't fitted, even though encoder_a is fitted
        assert_raise_not_fitted(encoder_cloned, df2_conv)

        # Same thing but using fit and then predict
        encoder_b = klass(**enc_kwargs)
        encoder_b.fit(df1_conv, y=y1)
        y1_hat_b = encoder_b.predict(df1_conv)
        y2_hat_b = encoder_b.predict(df2_conv)

        assert y1_hat_b is not None
        assert y2_hat_b is not None
        assert y1_hat_b.shape == y1.shape

        assert y2_hat_b.shape[0] == df2_conv.shape[0]
        assert y2_hat_b.shape[1:] == y1.shape[1:]

        # Same thing but using clone
        encoder_c = clone(encoder_a)
        y1_hat_c = encoder_c.fit(df1_conv, y=y1).predict(df1_conv)
        y2_hat_c = encoder_c.predict(df2_conv)

        assert y1_hat_c.shape == y1.shape

        assert y2_hat_c.shape[0] == df2_conv.shape[0]
        assert y2_hat_c.shape[1:] == y1.shape[1:]

        encoder_d = klass()
        encoder_d.set_params(**enc_kwargs)
        y1_hat_d = encoder_d.fit(df1_conv, y=y1).predict(df1_conv)
        y2_hat_d = encoder_d.predict(df2_conv)

        assert y1_hat_d.shape == y1.shape

        assert y2_hat_d.shape[0] == df2_conv.shape[0]
        assert y2_hat_d.shape[1:] == y1.shape[1:]

        assert_raise_value_error(encoder_a, gen_slice(df1_conv, slice(1, None)))
        assert_raise_value_error(encoder_b, gen_slice(df1_conv, slice(1, None)))
        assert_raise_value_error(encoder_c, gen_slice(df1_conv, slice(1, None)))
        assert_raise_value_error(encoder_d, gen_slice(df1_conv, slice(1, None)))
def test_score_from_params(x_data_type, shuffle, graph_pipeline):
    np.random.seed(123)
    X = np.random.randn(100, 10)

    X = convert_generic(X, output_type=x_data_type)

    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)

        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    scoring = ["silhouette", "davies_bouldin", "calinski_harabasz"]

    if graph_pipeline:
        estimator = GraphPipeline(
            {"pt": DebugPassThrough(), "lg": KMeans(n_clusters=3, random_state=123)}, edges=[("pt", "lg")]
        )
    else:
        estimator = KMeans(n_clusters=3, random_state=123)

    ##################
    ### Only score ###
    ##################

    res = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0)

    assert isinstance(res, pd.DataFrame)
    assert res.shape[0] == 1
    for s in scoring:
        assert ("test_" + s) in set(res.columns)

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    ##########################
    ### Score + Prediction ###
    ##########################
    res, label = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0, return_predict=True)

    assert isinstance(res, pd.DataFrame)
    assert res.shape[0] == 1
    for s in scoring:
        assert ("test_" + s) in set(res.columns)

    assert isinstance(label, np.ndarray)

    assert len(np.unique(label)) == 3

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    ####################
    ### Predict only ###
    ####################
    res, label = score_from_params_clustering(
        estimator, X, scoring=scoring, verbose=0, return_predict=True, no_scoring=True
    )

    assert len(np.unique(label)) == 3
    assert res is None

    with pytest.raises(NotFittedError):
        estimator.predict(X)
def test_approx_cross_validation_early_stop(
    add_third_class, x_data_type, y_string_class, shuffle, graph_pipeline, with_groups
):

    X, y = make_classification(n_samples=100, random_state=123)

    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    if add_third_class:
        y[0:2] = 2

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        np.random.seed(123)
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        y = y[ii]

        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    if y_string_class:
        y = np.array(["CL_%d" % i for i in y])

    if add_third_class:
        scoring = ["accuracy"]
    else:
        scoring = ["accuracy", "neg_log_loss"]

    if graph_pipeline:
        estimator = GraphPipeline(
            {"pt": DebugPassThrough(), "lg": LogisticRegression(C=1, random_state=123)}, edges=[("pt", "lg")]
        )
    else:
        estimator = LogisticRegression(C=1, random_state=123)

    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        stopping_round=1,
        stopping_threshold=1.01,  # so that accuracy is sure to be below
    )

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 2
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    assert yhat is None

    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        stopping_round=1,
        stopping_threshold=0.0,
    )

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0
def test_cross_validation(add_third_class, x_data_type, y_string_class, shuffle, graph_pipeline, with_groups):

    X, y = make_classification(n_samples=100, random_state=123)
    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if add_third_class:
        y[0:2] = 2

    if shuffle:
        np.random.seed(123)
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        y = y[ii]

        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    if y_string_class:
        y = np.array(["CL_%d" % i for i in y])

    if add_third_class:
        scoring = ["accuracy"]
    else:
        scoring = ["accuracy", "neg_log_loss"]

    if graph_pipeline:
        estimator = GraphPipeline({"pt": DebugPassThrough(), "lg": LogisticRegression()}, edges=[("pt", "lg")])
    else:
        estimator = LogisticRegression()

    ##################
    ### Only score ###
    ##################

    cv_res = cross_validation(estimator, X, y, groups, cv=10, scoring=scoring, verbose=0)

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    #####################
    ### Score + Proba ###
    #####################
    cv_res, yhat_proba = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True
    )

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    assert isinstance(yhat_proba, pd.DataFrame)
    if isinstance(X, pd.DataFrame):
        assert (yhat_proba.index == X.index).all()

    assert yhat_proba.shape == (y.shape[0], 2 + 1 * add_third_class)
    assert yhat_proba.min().min() >= 0
    assert yhat_proba.max().max() <= 1
    assert list(yhat_proba.columns) == list(np.sort(np.unique(y)))

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    #######################
    ### Score + Predict ###
    #######################
    cv_res, yhat = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True, method="predict"
    )

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0

    assert yhat.shape[0] == y.shape[0]

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    ####################
    ### Predict only ###
    ####################
    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        no_scoring=True,
    )

    assert yhat.shape[0] == y.shape[0]

    assert cv_res is None
    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0

    with pytest.raises(NotFittedError):
        estimator.predict(X)
def verif_encoder_with_data(klass, enc_kwargs, df1, df2, y1, fit_type,
                            additional_conversion_fun, extended_all_types):
    """ verification of the behavior of a transform on data """
    # Conversion of input into a different type
    df1_conv = convert_generic(df1, output_type=fit_type)
    df2_conv = convert_generic(df2, output_type=fit_type)

    if additional_conversion_fun is not None:
        df1_conv = additional_conversion_fun(df1_conv)
        df2_conv = additional_conversion_fun(df2_conv)

    if y1 is None:
        encoder = klass(**enc_kwargs)
        # first test without explicitly passing a y
        df1_transformed_a = encoder.fit_transform(df1_conv)
        df2_transformed_a = encoder.transform(df2_conv)

    encoder_a = klass(**enc_kwargs)
    params_0 = encoder_a.get_params()

    # other test with a y (which might be None or not)
    df1_transformed_a = encoder_a.fit_transform(df1_conv, y=y1)
    df2_transformed_a = encoder_a.transform(df2_conv)

    # Verify that get_params didn't change after fit
    # Rmk: might not be enforced on all transformers
    params_3 = encoder_a.get_params()
    rec_assert_equal(params_0, params_3)

    assert df1_transformed_a is not None  # verify that something was created
    assert df2_transformed_a is not None  # verify that something was created

    encoder_cloned = clone(encoder_a)  # clone again ...
    # ... and verify that the clone isn't fitted, even though encoder_a is fitted
    assert_raise_not_fitted(encoder_cloned, df2_conv)

    # Same thing but using ... fit and then... transformed
    encoder_b = klass(**enc_kwargs)
    encoder_b.fit(df1_conv, y=y1)
    df1_transformed_b = encoder_b.transform(df1_conv)
    df2_transformed_b = encoder_b.transform(df2_conv)

    assert df1_transformed_b is not None
    assert df2_transformed_b is not None

    # Same thing but using clone
    encoder_c = clone(encoder_a)
    df1_transformed_c = encoder_c.fit_transform(df1_conv, y=y1)
    df2_transformed_c = encoder_c.transform(df2_conv)

    # Same thing but using an empty class + set_params
    encoder_d = klass()
    encoder_d.set_params(**enc_kwargs)
    df1_transformed_d = encoder_d.fit_transform(df1_conv, y=y1)
    df2_transformed_d = encoder_d.transform(df2_conv)

    # Verify that an error is raised when called with the wrong number of columns
    assert_raise_value_error(encoder_a, gen_slice(df1_conv, slice(1, None)))
    assert_raise_value_error(encoder_b, gen_slice(df1_conv, slice(1, None)))
    assert_raise_value_error(encoder_c, gen_slice(df1_conv, slice(1, None)))
    assert_raise_value_error(encoder_d, gen_slice(df1_conv, slice(1, None)))

    for fit_type2, additional_conversion_fun2 in extended_all_types:

        if fit_type == fit_type2:
            continue

        df1_conv2 = convert_generic(df1_conv, output_type=fit_type2)

        # Verify that an input type different from the one seen during fit raises an error

        assert_raise_value_error(encoder_a, df1_conv2)
        assert_raise_value_error(encoder_b, df1_conv2)
        assert_raise_value_error(encoder_c, df1_conv2)
        assert_raise_value_error(encoder_d, df1_conv2)

    # Verif shape
    # Nb of rows ...
    assert df1_transformed_a.shape[0] == df1_conv.shape[0]
    assert df1_transformed_b.shape[0] == df1_conv.shape[0]
    assert df1_transformed_c.shape[0] == df1_conv.shape[0]
    assert df1_transformed_d.shape[0] == df1_conv.shape[0]

    assert df2_transformed_a.shape[0] == df2_conv.shape[0]
    assert df2_transformed_b.shape[0] == df2_conv.shape[0]
    assert df2_transformed_c.shape[0] == df2_conv.shape[0]
    assert df2_transformed_d.shape[0] == df2_conv.shape[0]

    # Nb of columns : all the same
    assert df1_transformed_b.shape[1] == df1_transformed_a.shape[1]
    assert df1_transformed_c.shape[1] == df1_transformed_a.shape[1]
    assert df1_transformed_d.shape[1] == df1_transformed_a.shape[1]

    assert df2_transformed_a.shape[1] == df1_transformed_a.shape[1]
    assert df2_transformed_b.shape[1] == df1_transformed_a.shape[1]
    assert df2_transformed_c.shape[1] == df1_transformed_a.shape[1]
    assert df2_transformed_d.shape[1] == df1_transformed_a.shape[1]

    # Verif type
    assert get_type(df2_transformed_a) == get_type(df1_transformed_a)

    assert get_type(df1_transformed_b) == get_type(df1_transformed_a)
    assert get_type(df2_transformed_b) == get_type(df1_transformed_a)

    assert get_type(df1_transformed_c) == get_type(df1_transformed_a)
    assert get_type(df2_transformed_c) == get_type(df1_transformed_a)

    assert get_type(df1_transformed_d) == get_type(df1_transformed_a)
    assert get_type(df2_transformed_d) == get_type(df1_transformed_a)

    # if 'desired_output_type' is present, check the output type matches it
    if "desired_output_type" in enc_kwargs:
        assert get_type(df1_transformed_a) == enc_kwargs["desired_output_type"]

    if getattr(encoder_a, "desired_output_type", None) is not None:
        assert get_type(df1_transformed_a) == encoder_a.desired_output_type

    # Verif columns
    if get_type(df1_transformed_b) in (DataTypes.DataFrame,
                                       DataTypes.SparseDataFrame):
        assert list(df2_transformed_a.columns) == list(df1_transformed_a.columns)

        assert list(df1_transformed_b.columns) == list(df1_transformed_a.columns)
        assert list(df2_transformed_b.columns) == list(df1_transformed_a.columns)

        assert list(df1_transformed_c.columns) == list(df1_transformed_a.columns)
        assert list(df2_transformed_c.columns) == list(df1_transformed_a.columns)

        assert list(df1_transformed_d.columns) == list(df1_transformed_a.columns)
        assert list(df2_transformed_d.columns) == list(df1_transformed_a.columns)

        assert encoder_a.get_feature_names() == list(df1_transformed_a.columns)
        assert encoder_b.get_feature_names() == list(df1_transformed_a.columns)
        assert encoder_c.get_feature_names() == list(df1_transformed_a.columns)
        assert encoder_d.get_feature_names() == list(df1_transformed_a.columns)

    # Verif index
    if get_type(df1_transformed_b) in (DataTypes.DataFrame,
                                       DataTypes.SparseDataFrame):
        assert (df1_transformed_b.index == df1_transformed_a.index).all()
        assert (df2_transformed_b.index == df2_transformed_a.index).all()

        assert (df1_transformed_c.index == df1_transformed_a.index).all()
        assert (df2_transformed_c.index == df2_transformed_a.index).all()

        assert (df1_transformed_d.index == df1_transformed_a.index).all()
        assert (df2_transformed_d.index == df2_transformed_a.index).all()

        if fit_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            assert (df1_transformed_a.index == df1_conv.index).all()
            assert (df2_transformed_a.index == df2_conv.index).all()
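
# Hedged usage sketch for verif_encoder_with_data. 'WrappedTransformer' is a
# hypothetical placeholder: the real tests pass concrete wrapped transformers
# from this library (which validate input type, hence the ValueError checks above).
#
#     extended_all_types = extend_all_type([DataTypes.DataFrame])
#     for fit_type, conv_fun in extended_all_types:
#         verif_encoder_with_data(WrappedTransformer, enc_kwargs={},
#                                 df1=df1, df2=df2, y1=None,
#                                 fit_type=fit_type,
#                                 additional_conversion_fun=conv_fun,
#                                 extended_all_types=extended_all_types)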
    def _fit_transform(self, X, y, is_fit, is_transform, fit_params=None):
        """ internal method that handle the fit and the transform """

        if fit_params is None:
            fit_params = {}

        if is_fit:
            if isinstance(self.columns_to_use, str) and self.columns_to_use == "auto":
                columns = self._get_default_columns_to_use(X, y)
                self.selector = ColumnsSelector(columns_to_use=columns)
            else:
                self.selector = ColumnsSelector(columns_to_use=self.columns_to_use, regex_match=self.regex_match)

        if hasattr(X, "shape"):
            if X.shape[0] == 0:
                raise ValueError("the X object has 0 rows")

        Xindex = dsh._get_index(X)  # if X has an index retrieve it
        #        if self.columns_to_use is not None:
        if is_fit:
            Xsubset = self.selector.fit_transform(X)
        else:
            Xsubset = self.selector.transform(X)
        # TODO (maybe): here allow a preprocessing pipeline
        #        if self.has_preprocessing:
        #            if is_fit:
        #                self.preprocessing = self._get_preprocessing()
        #                Xsubset = self.preprocessing.fit_transform(Xsubset)
        #            else:
        #                Xsubset = self.preprocessing.transform(Xsubset)

        # Store columns and shape BEFORE any modification
        if self.selector is not None:
            Xsubset_columns = self.selector.get_feature_names()
        else:
            raise NotImplementedError("should not go there anymore")
            # Xsubset_columns = getattr(Xsubset, "columns", None)

        Xsubset_shape = getattr(Xsubset, "shape", None)
        # TODO: here, use in one way or another a '
        # https://github.com/scikit-learn/scikit-learn/issues/6425

        if is_fit:
            self._expected_type = dsh.get_type(Xsubset)
            self._expected_nbcols = dsh._nbcols(Xsubset)
            self._expected_columns = dsh._get_columns(Xsubset)

        else:
            Xtype = dsh.get_type(Xsubset)
            if Xtype != self._expected_type:
                raise ValueError(
                    "I don't have the correct type as input, expected : %s, got : %s" % (self._expected_type, Xtype)
                )

            nbcols = dsh._nbcols(Xsubset)
            if nbcols != self._expected_nbcols:
                raise ValueError(
                    "I don't have the correct nb of colmns as input, expected : %d, got : %d"
                    % (self._expected_nbcols, nbcols)
                )

            columns = dsh._get_columns(Xsubset)
            expected_columns = getattr(self, "_expected_columns", None)  # to allow pickle compatibility

            if expected_columns is not None and columns is not None and columns != self._expected_columns:
                raise ValueError("I don't have the correct names of columns")

        if self.accepted_input_types is not None and self._expected_type not in self.accepted_input_types:
            Xsubset = dsh.convert_generic(
                Xsubset, mapped_type=self._expected_type, output_type=self.accepted_input_types[0]
            )

        if is_fit:
            self._verif_params()
            self._empty_data = False
            s = getattr(Xsubset, "shape", None)
            if s is not None and len(s) > 1 and s[1] == 0:
                self._empty_data = True

        if self.all_columns_at_once or self._empty_data:

            if is_fit:
                self._model = self._get_model(Xsubset, y)

            ##############################################
            ### Apply the model on ALL columns at ONCE ###
            ##############################################

            if self.work_on_one_column_only:
                Xsubset = dsh.make1dimension(Xsubset)  # will generate an error if 2 dimensions
            else:
                Xsubset = dsh.make2dimensions(Xsubset)

            # Call to underlying model
            Xres = None
            if is_fit and is_transform:
                ##############################
                ###  fit_transform method  ###
                ##############################
                # test if the data to transform actually has some columns

                if not self._empty_data:
                    # normal case
                    Xres = self._model.fit_transform(Xsubset, y, **fit_params)
                else:
                    # It means there is no columns to transform
                    Xres = Xsubset  # don't do anything

            elif is_fit and not is_transform:
                ####################
                ###  fit method  ###
                ####################
                if self.must_transform_to_get_features_name:
                    Xres = self._model.fit_transform(Xsubset, y, **fit_params)
                else:
                    self._model.fit(Xsubset, y, **fit_params)
            else:
                ####################
                ###  transform   ###
                ####################
                if not self._empty_data:
                    Xres = self._model.transform(Xsubset)
                else:
                    Xres = Xsubset

            if is_fit:
                self._columns_informations = {
                    "output_columns": getattr(Xres, "columns", None),  # names of transformed columns if exist
                    "output_shape": getattr(Xres, "shape", None),  # shape of transformed result if exist
                    "input_columns": Xsubset_columns,  # name of input columns
                    "input_shape": Xsubset_shape,  # shape of input data
                }

                self._feature_names_for_transform = self.try_to_find_feature_names_all_at_once(
                    output_columns=self._columns_informations["output_columns"],
                    output_shape=self._columns_informations["output_shape"],
                    input_columns=self._columns_informations["input_columns"],
                    input_shape=self._columns_informations["input_shape"],
                )

                # self.kept_features_names = None  # for now

            if is_transform:
                Xres = dsh.convert_generic(Xres, output_type=self.desired_output_type)
                Xres = dsh._set_index(Xres, Xindex)

        else:
            ########################################
            ### Apply the model COLUMN BY COLUMN ###
            ########################################
            if is_fit:
                self._models = []

            if is_transform or self.must_transform_to_get_features_name:
                all_Xres = []
            else:
                all_Xres = None

            Xsubset = dsh.make2dimensions(Xsubset)

            for j in range(self._expected_nbcols):

                if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame, DataTypes.Serie):
                    Xsubset_j = Xsubset.iloc[:, j]
                else:
                    Xsubset_j = Xsubset[:, j]

                if is_fit:
                    sub_model = self._get_model(Xsubset, y)
                    self._models.append(sub_model)
                else:
                    sub_model = self._models[j]

                if not self.work_on_one_column_only:
                    Xsubset_j = dsh.make2dimensions(Xsubset_j)

                if is_fit and is_transform:
                    # fit_transform method
                    Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)

                    all_Xres.append(Xres_j)

                elif is_fit and not is_transform:
                    # fit method
                    if self.must_transform_to_get_features_name:
                        Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                        all_Xres.append(Xres_j)

                    else:
                        sub_model.fit(Xsubset_j, y, **fit_params)

                elif is_transform:
                    # transform method

                    Xres_j = sub_model.transform(Xsubset_j)
                    all_Xres.append(Xres_j)

            if is_fit:

                self._columns_informations = {
                    "all_output_columns": None
                    if all_Xres is None
                    else [getattr(Xres, "columns", None) for Xres in all_Xres],
                    "all_output_shape": None
                    if all_Xres is None
                    else [getattr(Xres, "shape", None) for Xres in all_Xres],
                    "input_columns": Xsubset_columns,  # name of input columns
                    "input_shape": Xsubset_shape,  # shape of input data
                }

                self._feature_names_for_transform = list(
                    self.try_to_find_feature_names_separate(
                        all_output_columns=self._columns_informations["all_output_columns"],
                        all_output_shape=self._columns_informations["all_output_shape"],
                        input_columns=self._columns_informations["input_columns"],
                        input_shape=self._columns_informations["input_shape"],
                    )
                )

                # self.kept_features_names = None  # for now

            if is_transform:
                Xres = dsh.generic_hstack(all_Xres, output_type=self.desired_output_type)
                Xres = dsh._set_index(Xres, Xindex)

        if is_transform:
            if self._feature_names_for_transform is not None:
                ### FIXME: this doesn't work in transform !!!
                Xres = dsh._set_columns(Xres, self._feature_names_for_transform)

        if is_transform:
            return Xres
        else:
            return self
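
# Simplified, hedged illustration (not the library code) of the
# column-by-column branch above: fit one sub-model per column, keep each
# result 2-D, then hstack, mirroring the all_Xres accumulation and the
# dsh.generic_hstack call.
def _example_column_by_column():
    from sklearn.preprocessing import StandardScaler
    X = np.random.randn(10, 3)
    models, parts = [], []
    for j in range(X.shape[1]):
        sub_model = StandardScaler()
        parts.append(sub_model.fit_transform(X[:, j].reshape(-1, 1)))  # keep 2-D
        models.append(sub_model)
    Xres = np.hstack(parts)
    assert Xres.shape == X.shape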
def check_no_null(df_transformed, df=None):
    df_transformed2 = convert_generic(df_transformed,
                                      output_type=DataTypes.DataFrame)
    assert df_transformed2.isnull().sum().sum() == 0
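
# Hedged usage sketch for check_no_null (illustrative frames; assumes pandas
# as pd and numpy as np are imported):
def _example_check_no_null():
    check_no_null(pd.DataFrame({"a": [1.0, 2.0]}))  # passes: no missing values
    try:
        check_no_null(pd.DataFrame({"a": [1.0, np.nan]}))
    except AssertionError:
        pass  # expected: a null value is present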