def test_sparse_df_int(self):
    """LGBMClassifier must accept a sparse DataFrame holding integer data."""
    # Build an int32 sparse array first, then wrap it into a sparse DataFrame.
    sparse_int_array = convert_generic(X, output_type=DataTypes.SparseArray).astype(np.int32)
    sparse_int_frame = convert_generic(sparse_int_array, output_type=DataTypes.SparseDataFrame)
    model = LGBMClassifier()
    model.fit(sparse_int_frame, y)
def verif_TruncatedSVDWrapperSparseData(use_wrapper):
    """Exercise TruncatedSVD (or its wrapper) on dense, sparse-float and sparse-int inputs.

    The final fit mixes sparse int and sparse float columns, the case that
    is expected to fail for the plain TruncatedSVD.
    """
    klass = TruncatedSVDWrapper if use_wrapper else TruncatedSVD

    np.random.seed(123)
    X1 = np.random.randn(50, 10)
    df1 = convert_generic(X1, output_type=DataTypes.SparseDataFrame)

    # ok : plain numpy array
    klass(n_components=2).fit(X1)

    # ok : dataframe with sparse float columns
    klass(n_components=2).fit(df1)

    # ok : dataframe with sparse int columns
    X2 = np.random.randint(0, 10, (50, 10))
    df2 = convert_generic(X2, output_type=DataTypes.SparseDataFrame)
    klass(n_components=2).fit(df2)

    # fails : mix of sparse int and sparse float columns
    mixed = pd.concat((df1, df2), axis=1)
    mixed.columns = list(range(mixed.shape[1]))
    klass(n_components=2).fit(mixed)
def check_all_numerical(df_transformed, df=None):
    """Assert that every value of `df_transformed` can be cast to float64.

    `df` is unused; it is kept so the signature matches the other check_* helpers.
    """
    mat = convert_generic(df_transformed, output_type=DataTypes.NumpyArray)
    try:
        mat.astype(np.float64)
    except ValueError:
        conversion_failed = True
    else:
        conversion_failed = False
    # Re-raise explicitly as an AssertionError (instead of letting ValueError escape)
    assert not conversion_failed
def test_approx_cross_validation_transformer(x_data_type, shuffle, graph_pipeline, with_groups):
    """Cross-validate a pure transformer: scoring must fail, the transform must round-trip X."""
    if graph_pipeline:
        estimator = GraphPipeline({"ptA": DebugPassThrough(), "ptB": DebugPassThrough()}, edges=[("ptA", "ptB")])
    else:
        estimator = DebugPassThrough()

    X, y = make_classification(n_samples=100, random_state=123)
    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        np.random.seed(123)
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        y = y[ii]
        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    scoring = ["accuracy", "neg_log_loss"]

    ##################
    ### Score only ###
    ##################
    with pytest.raises(Exception):
        cross_validation(estimator, X, y, groups, cv=10, scoring=scoring, verbose=0)
        # shouldn't work since DebugPassThrough can't be scored

    #################
    ### Transform ###
    #################
    cv_res, Xhat = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True, no_scoring=True
    )

    assert type(Xhat) == type(X)
    assert cv_res is None
    assert Xhat.shape == X.shape

    if isinstance(X, pd.DataFrame):
        assert (Xhat.index == X.index).all()
        assert (Xhat.columns == X.columns).all()

    if isinstance(X, pd.DataFrame):
        # BUGFIX: was 10 ** (10 - 10) == 1, which made the closeness check
        # vacuous; use the same 1e-10 tolerance as the array branch below.
        assert np.abs(Xhat - X).max().max() <= 10 ** (-10)
    else:
        assert np.max(np.abs(Xhat - X)) <= 10 ** (-10)
def test_convert_to_array_when_sparse():
    """Round-trip data through a sparse DataFrame back to a numpy array, checking values and dtype."""
    # Homogeneous float32 frame : dtype must survive the round-trip.
    dense = np.zeros((10, 3), dtype=np.float32)
    frame = pd.DataFrame(dense, columns=["A", "B", "C"])
    sparse_frame = convert_generic(frame, output_type=DataTypes.SparseDataFrame)
    recovered = convert_generic(sparse_frame, output_type=DataTypes.NumpyArray)
    assert (recovered == dense).all()
    assert recovered.dtype == dense.dtype

    # Mixed int/float frame : the resulting array is upcast to float64.
    mixed_frame = pd.DataFrame({
        "int_col": np.zeros(10, dtype=np.int64),
        "float_col": np.zeros(10, dtype=np.float64)
    })
    sparse_mixed = convert_generic(mixed_frame, output_type=DataTypes.SparseDataFrame)
    recovered_mixed = convert_generic(sparse_mixed, output_type=DataTypes.NumpyArray)
    assert (recovered_mixed == 0).all()
    assert recovered_mixed.dtype == np.float64
def _convert_sparse(x, sparse): if isinstance(x, pd.Series): if _IS_PD1 and sparse: return convert_to_sparseserie(x) else: return x # nothing, I don't want to test sparse elif isinstance(x, pd.DataFrame): return convert_generic(x, output_type=DataTypes.SparseDataFrame) else: TypeError("This function is for DataFrame or Serie")
def _prepare_target(self, y, klass, conversion_type):
    """ prepare the target so that it can be given to the underlying model to use

    Parameters
    ----------
    y : array
        the original target

    klass : type
        the encoder to use for the target

    conversion_type : DataType
        the output type desired by the target

    Set
    ---
    self._mono_target : bool
        does the original problem have one target or not

    self._target_encoder :
        the encoder used on the target

    Returns
    -------
    y_encoded : array
        the modified target
    """
    self._mono_target = y.ndim == 1
    self._target_dtype = y.dtype

    if isinstance(self.classes, str) and self.classes == "auto":
        categories = "auto"
    else:
        if self._mono_target:
            categories = [self.classes]  # because OrdinalEncoder expects a list
        else:
            if not isinstance(self.classes, list):
                raise TypeError("For multi-target classes should be a list, instead I got %s" % str(type(self.classes)))
            categories = self.classes

    self._target_encoder = klass(categories=categories, dtype=np.int32)

    yd2 = convert_generic(make2dimensions(y), output_type=conversion_type)
    if conversion_type == DataTypes.NumpyArray and yd2.dtype.kind == 'U':
        # BUGFIX: np.object was deprecated in numpy 1.20 and removed in 1.24 ;
        # the builtin `object` is the documented replacement.
        yd2 = yd2.astype(object, copy=False)
    y_encoded = self._target_encoder.fit_transform(yd2)

    return y_encoded
def fit(self, X, y):
    """Fit the underlying classifier on an ordinal-encoded version of the target.

    Raises TypeError if the wrapped estimator is not a sklearn classifier.
    """
    if not is_classifier(self.classifier):
        raise TypeError("classifier should be a sklearn classifier")

    # Encode the target as integers, then hand it to the classifier as a numpy array.
    encoded_target = self._prepare_target(y, klass=_OrdinalOneHotEncoder, conversion_type=DataTypes.DataFrame)
    encoded_target = convert_generic(encoded_target, output_type=DataTypes.NumpyArray)

    self.classifier.fit(X, encoded_target)
    return self
def transform(self, X):
    """Use the wrapped model's predictions as transformed features.

    Classifiers contribute predicted probabilities, regressors their predictions;
    the result is converted to ``self.desired_output_type`` and its columns (when
    present) renamed with ``get_feature_names``.
    """
    if not self._already_fitted:
        raise NotFittedError(
            "This %s instance is not fitted yet. Call 'fit' with "
            "appropriate arguments before using this method." % type(self).__name__)

    if self._is_classifier:
        raw_predictions = self.model.predict_proba(X)
    else:
        raw_predictions = self.model.predict(X)

    result = self._format_predictions(raw_predictions, is_classifier=self._is_classifier, target_info=self._target_info)
    result = convert_generic(result, output_type=self.desired_output_type)

    if hasattr(result, "columns"):
        result.columns = self.get_feature_names()

    return result
def test_sparse_df(self):
    """LGBMClassifier must accept a sparse DataFrame."""
    sparse_frame = convert_generic(X, output_type=DataTypes.SparseDataFrame)
    model = LGBMClassifier()
    model.fit(sparse_frame, y)
def test_sparse(self):
    """LGBMClassifier must accept a sparse array."""
    sparse_array = convert_generic(X, output_type=DataTypes.SparseArray)
    model = LGBMClassifier()
    model.fit(sparse_array, y)
def verif_model(df1, df2, y1, klass, model_kwargs, all_types, is_classifier):
    """ helper function that checks (using asserts) a bunch of things on a model klass

    Parameters
    ----------
    df1 : array like
        data on which model will be trained

    df2 : array like
        data on which model will be tested

    klass : type
        type of the model to test

    model_kwargs : dict
        kwargs to be passed to klass to create a model

    all_types : list of type
        list of input type to test the models on

    is_classifier : boolean
        if True the model is a Classifier otherwise a Regressor
    """
    if not isinstance(all_types, (list, tuple)):
        all_types = (all_types,)

    model0 = klass(**model_kwargs)  # Create an object ...
    model1 = clone(model0)  # then try to clone it

    model2 = klass()  # Create an empty object and then set its params
    model2.set_params(**model_kwargs)

    # Verify types are identical
    assert type(model0) == type(model1)
    assert type(model0) == type(model2)

    assert hasattr(klass, "fit")
    assert hasattr(klass, "predict")
    if is_classifier:
        assert hasattr(klass, "predict_proba")

    # Verify get_params are identical
    params_0 = model0.get_params()
    params_1 = model1.get_params()
    params_2 = model2.get_params()

    rec_assert_equal(params_0, params_1)
    rec_assert_equal(params_0, params_2)

    rec_assert_equal({k: v for k, v in params_0.items() if k in model_kwargs}, model_kwargs)
    rec_assert_equal({k: v for k, v in params_1.items() if k in model_kwargs}, model_kwargs)
    rec_assert_equal({k: v for k, v in params_2.items() if k in model_kwargs}, model_kwargs)

    extended_all_types = extend_all_type(all_types)

    if is_classifier:
        yclasses = list(set(np.unique(y1)))
        nb_classes = len(yclasses)

    for fit_type, additional_conversion_fun in extended_all_types:

        # Convert inputs into several types ..
        df1_conv = convert_generic(df1, output_type=fit_type)
        df2_conv = convert_generic(df2, output_type=fit_type)

        if additional_conversion_fun is not None:
            df1_conv = additional_conversion_fun(df1_conv)
            df2_conv = additional_conversion_fun(df2_conv)

        model_a = klass(**model_kwargs)
        model_a.fit(df1_conv, y=y1)

        y1_hat_a = model_a.predict(df1_conv)  # Other test with an y (might be None or not)
        y2_hat_a = model_a.predict(df2_conv)

        if is_classifier:
            y1_hatproba_a = model_a.predict_proba(df1_conv)
            y2_hatproba_a = model_a.predict_proba(df2_conv)

        params_3 = model_a.get_params()  # Verify that get_params didn't change after fit
        # Rmk : might not be enforced on all transformers
        rec_assert_equal(params_0, params_3)

        assert y1_hat_a is not None  # verify that something was created
        assert y2_hat_a is not None  # verify that something was created

        model_cloned = clone(model_a)  # Clone again ...
        assert_raise_not_fitted(
            model_cloned, df2_conv
        )  # ... and verify that the clone isn't fitted, even if model_a is fitted

        # Same thing but using clone
        model_b = clone(model_a)
        model_b.fit(df1_conv, y=y1)

        y1_hat_b = model_b.predict(df1_conv)
        y2_hat_b = model_b.predict(df2_conv)

        if is_classifier:
            y1_hatproba_b = model_b.predict_proba(df1_conv)
            y2_hatproba_b = model_b.predict_proba(df2_conv)

        # Same thing but with set_params
        model_c = klass()
        model_c.set_params(**model_kwargs)
        model_c.fit(df1_conv, y=y1)

        y1_hat_c = model_c.predict(df1_conv)
        y2_hat_c = model_c.predict(df2_conv)

        if is_classifier:
            y1_hatproba_c = model_c.predict_proba(df1_conv)
            y2_hatproba_c = model_c.predict_proba(df2_conv)

        # check error when called with too few columns
        assert_raise_value_error(model_a, gen_slice(df1_conv, slice(1, None)))
        assert_raise_value_error(model_b, gen_slice(df1_conv, slice(1, None)))
        assert_raise_value_error(model_c, gen_slice(df1_conv, slice(1, None)))

        assert y1_hat_a.shape[0] == df1_conv.shape[0]
        assert y1_hat_b.shape[0] == df1_conv.shape[0]
        assert y1_hat_c.shape[0] == df1_conv.shape[0]

        assert y2_hat_a.shape[0] == df2_conv.shape[0]
        assert y2_hat_b.shape[0] == df2_conv.shape[0]
        assert y2_hat_c.shape[0] == df2_conv.shape[0]

        assert y1_hat_a.ndim == y1.ndim
        assert y1_hat_b.ndim == y1.ndim
        assert y1_hat_c.ndim == y1.ndim

        assert y2_hat_a.ndim == y1.ndim
        assert y2_hat_b.ndim == y1.ndim
        assert y2_hat_c.ndim == y1.ndim

        if is_classifier:
            assert y1_hatproba_a.ndim == 2
            assert y1_hatproba_b.ndim == 2
            assert y1_hatproba_c.ndim == 2
            assert y2_hatproba_a.ndim == 2
            assert y2_hatproba_b.ndim == 2
            assert y2_hatproba_c.ndim == 2

            # BUGFIX: these six comparisons were bare expressions — the
            # missing 'assert' made the class-count checks silent no-ops.
            assert y1_hatproba_a.shape[1] == nb_classes
            assert y1_hatproba_b.shape[1] == nb_classes
            assert y1_hatproba_c.shape[1] == nb_classes
            assert y2_hatproba_a.shape[1] == nb_classes
            assert y2_hatproba_b.shape[1] == nb_classes
            assert y2_hatproba_c.shape[1] == nb_classes

            assert hasattr(model_a, "classes_")
            assert hasattr(model_b, "classes_")
            assert hasattr(model_c, "classes_")

            assert list(set(model_a.classes_)) == list(set(yclasses))
            assert list(set(model_b.classes_)) == list(set(yclasses))
            assert list(set(model_c.classes_)) == list(set(yclasses))

            for f in (check_all_numerical, check_between_01, check_no_null):
                f(y1_hatproba_a)
                f(y1_hatproba_b)
                f(y1_hatproba_c)
                f(y2_hatproba_a)
                f(y2_hatproba_b)
                f(y2_hatproba_c)

        # Verify output types are consistent across the three models
        assert get_type(y1_hat_b) == get_type(y1_hat_a)
        assert get_type(y1_hat_c) == get_type(y1_hat_a)
        assert get_type(y2_hat_a) == get_type(y1_hat_a)
        assert get_type(y2_hat_b) == get_type(y1_hat_a)
        assert get_type(y2_hat_c) == get_type(y1_hat_a)
def check_between_01(df_transformed, df=None):
    """Assert that every value of `df_transformed` lies within [0, 1].

    `df` is unused; it is kept so the signature matches the other check_* helpers.
    """
    values = convert_generic(df_transformed, output_type=DataTypes.NumpyArray)
    assert values.min() >= 0
    assert values.max() <= 1
def verif_model(df1, df2, y1, klass, enc_kwargs, all_types):
    """ helper that tests different things on a model (classifier or regressor):
    cloning, get_params stability, predict/predict_proba shapes, probability
    ranges, classes_ attribute, and error handling on wrong inputs.
    """
    is_multiple_output = y1.ndim > 1 and y1.shape[1] >= 1
    nb_outputs = 1
    if is_multiple_output:
        nb_outputs = y1.shape[1]

    if not isinstance(all_types, (list, tuple)):
        all_types = (all_types,)

    assert hasattr(klass, "fit")
    assert hasattr(klass, "predict")

    encoder0 = klass(**enc_kwargs)  # Create an object ...
    encoder1 = clone(encoder0)  # then try to clone it
    encoder2 = klass()  # Create an empty object and then set its params
    encoder2.set_params(**enc_kwargs)

    if is_classifier(encoder0):
        model_is_classifier = True
        assert hasattr(klass, "predict_proba")
    else:
        model_is_classifier = False
        assert is_regressor(encoder0)

    assert_raise_not_fitted(encoder0, df1)
    assert_raise_not_fitted(encoder1, df1)
    assert_raise_not_fitted(encoder2, df1)

    # Verify types are identical
    assert type(encoder0) == type(encoder1)
    assert type(encoder0) == type(encoder2)

    # Verify get_params are identical
    params_0 = encoder0.get_params()
    params_1 = encoder1.get_params()
    params_2 = encoder2.get_params()

    rec_assert_equal(params_0, params_1)
    rec_assert_equal(params_0, params_2)

    rec_assert_equal({k: v for k, v in params_0.items() if k in enc_kwargs}, enc_kwargs)
    rec_assert_equal({k: v for k, v in params_1.items() if k in enc_kwargs}, enc_kwargs)
    rec_assert_equal({k: v for k, v in params_2.items() if k in enc_kwargs}, enc_kwargs)

    extended_all_types = extend_all_type(all_types)

    def get_values(y):
        # retrieve the underlying numpy array of a DataFrame/Series target
        if hasattr(y, "values"):
            return y.values
        else:
            return y

    y1_np = get_values(y1)

    for fit_type, additional_conversion_fun in extended_all_types:

        # Convert inputs into several types ..
        df1_conv = convert_generic(df1, output_type=fit_type)
        df2_conv = convert_generic(df2, output_type=fit_type)

        if additional_conversion_fun is not None:
            df1_conv = additional_conversion_fun(df1_conv)
            df2_conv = additional_conversion_fun(df2_conv)

        encoder_a = klass(**enc_kwargs)
        y1_hat_a = encoder_a.fit(df1_conv, y=y1).predict(df1_conv)  # Other test with an y (might be None or not)
        y2_hat_a = encoder_a.predict(df2_conv)

        if model_is_classifier:
            y1_hat_proba_a = encoder_a.predict_proba(df1_conv)
            y2_hat_proba_a = encoder_a.predict_proba(df2_conv)

            if is_multiple_output:
                # multi-output : predict_proba returns one array per output
                assert isinstance(y1_hat_proba_a, list)
                assert isinstance(y2_hat_proba_a, list)
                assert len(y1_hat_proba_a) == nb_outputs
                assert len(y2_hat_proba_a) == nb_outputs

                for j in range(nb_outputs):
                    # correct shape
                    assert y1_hat_proba_a[j].shape == (y1.shape[0], len(np.unique(y1_np[:, j])))
                    assert y2_hat_proba_a[j].shape[0] == df2_conv.shape[0]
                    assert y2_hat_proba_a[j].shape[1] == y1_hat_proba_a[j].shape[1]

                    # between 0 and 1
                    assert y1_hat_proba_a[j].min() >= 0
                    assert y1_hat_proba_a[j].max() <= 1
                    assert y2_hat_proba_a[j].min() >= 0
                    assert y2_hat_proba_a[j].max() <= 1

                    # sum = 1
                    assert np.abs(y1_hat_proba_a[j].sum(axis=1) - 1).max() <= 10 ** (-5)
                    assert np.abs(y2_hat_proba_a[j].sum(axis=1) - 1).max() <= 10 ** (-5)
            else:
                # correct shape
                assert y1_hat_proba_a.shape == (y1.shape[0], len(np.unique(y1_np)))
                assert y2_hat_proba_a.shape == (df2_conv.shape[0], len(np.unique(y1_np)))

                # between 0 and 1
                assert y1_hat_proba_a.min() >= 0
                assert y1_hat_proba_a.max() <= 1
                assert y2_hat_proba_a.min() >= 0
                assert y2_hat_proba_a.max() <= 1

                # sum = 1
                assert np.abs(y1_hat_proba_a.sum(axis=1) - 1).max() <= 10 ** (-5)
                assert np.abs(y2_hat_proba_a.sum(axis=1) - 1).max() <= 10 ** (-5)

        assert y1_hat_a is not None  # verify that something was created
        assert y2_hat_a is not None  # verify that something was created

        assert y1_hat_a.shape == y1.shape
        assert y2_hat_a.shape[0] == df2_conv.shape[0]
        assert y2_hat_a.shape[1:] == y1.shape[1:]

        if model_is_classifier:
            assert hasattr(encoder_a, "classes_")
            if is_multiple_output:
                assert len(encoder_a.classes_) == nb_outputs
                for j in range(nb_outputs):
                    assert list(encoder_a.classes_[j]) == list(np.unique(y1_np[:, j]))
            else:
                assert list(encoder_a.classes_) == list(np.unique(y1_np))

        # Verify that get_params didn't change after fit
        # Rmk : might not be enforced on all transformers
        params_3 = encoder_a.get_params()
        rec_assert_equal(params_0, params_3)

        encoder_cloned = clone(encoder_a)  # Clone again ...
        assert_raise_not_fitted(
            encoder_cloned, df2_conv
        )  # ... and verify that the clone isn't fitted, even if encoder_a is fitted

        # Same thing but using ... fit and then ... predict
        encoder_b = klass(**enc_kwargs)
        encoder_b.fit(df1_conv, y=y1)
        y1_hat_b = encoder_b.predict(df1_conv)
        y2_hat_b = encoder_b.predict(df2_conv)

        # BUGFIX: this checked y1_hat_a (already verified above) instead of
        # y1_hat_b — a copy-paste error that left encoder_b's first
        # prediction unchecked.
        assert y1_hat_b is not None
        assert y2_hat_b is not None
        assert y1_hat_b.shape == y1.shape
        assert y2_hat_b.shape[0] == df2_conv.shape[0]
        assert y2_hat_b.shape[1:] == y1.shape[1:]

        # Same thing but using clone
        encoder_c = clone(encoder_a)
        y1_hat_c = encoder_c.fit(df1_conv, y=y1).predict(df1_conv)
        y2_hat_c = encoder_c.predict(df2_conv)

        assert y1_hat_c.shape == y1.shape
        assert y2_hat_c.shape[0] == df2_conv.shape[0]
        assert y2_hat_c.shape[1:] == y1.shape[1:]

        # Same thing but using empty class + set_params
        encoder_d = klass()
        encoder_d.set_params(**enc_kwargs)
        y1_hat_d = encoder_d.fit(df1_conv, y=y1).predict(df1_conv)
        y2_hat_d = encoder_d.predict(df2_conv)

        assert y1_hat_d.shape == y1.shape
        assert y2_hat_d.shape[0] == df2_conv.shape[0]
        assert y2_hat_d.shape[1:] == y1.shape[1:]

        # check error when called with too few columns
        assert_raise_value_error(encoder_a, gen_slice(df1_conv, slice(1, None)))
        assert_raise_value_error(encoder_b, gen_slice(df1_conv, slice(1, None)))
        assert_raise_value_error(encoder_c, gen_slice(df1_conv, slice(1, None)))
        assert_raise_value_error(encoder_d, gen_slice(df1_conv, slice(1, None)))
def test_score_from_params(x_data_type, shuffle, graph_pipeline):
    """Check score_from_params_clustering: scores-only, score+predict and
    predict-only modes, and that the estimator passed in is never fitted in place."""
    np.random.seed(123)
    X = np.random.randn(100, 10)

    X = convert_generic(X, output_type=x_data_type)

    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        # shuffle rows to make sure row order doesn't matter
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    scoring = ["silhouette", "davies_bouldin", "calinski_harabasz"]

    if graph_pipeline:
        estimator = GraphPipeline(
            {"pt": DebugPassThrough(), "lg": KMeans(n_clusters=3, random_state=123)}, edges=[("pt", "lg")]
        )
    else:
        estimator = KMeans(n_clusters=3, random_state=123)

    ##################
    ### Only score ###
    ##################
    res = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0)

    assert isinstance(res, pd.DataFrame)
    assert res.shape[0] == 1
    for s in scoring:
        assert ("test_" + s) in set(res.columns)

    # the original estimator must not have been fitted in place
    with pytest.raises(NotFittedError):
        estimator.predict(X)

    ##########################
    ### Score + Prediction ###
    ##########################
    res, label = score_from_params_clustering(estimator, X, scoring=scoring, verbose=0, return_predict=True)

    assert isinstance(res, pd.DataFrame)
    assert res.shape[0] == 1
    for s in scoring:
        assert ("test_" + s) in set(res.columns)

    assert isinstance(label, np.ndarray)
    # KMeans was asked for 3 clusters
    assert len(np.unique(label)) == 3

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    ####################
    ### Predict only ###
    ####################
    res, label = score_from_params_clustering(
        estimator, X, scoring=scoring, verbose=0, return_predict=True, no_scoring=True
    )

    assert len(np.unique(label)) == 3
    assert res is None  # no_scoring=True -> no score DataFrame

    with pytest.raises(NotFittedError):
        estimator.predict(X)
def test_approx_cross_validation_early_stop(
    add_third_class, x_data_type, y_string_class, shuffle, graph_pipeline, with_groups
):
    """Check the early-stopping machinery of cross_validation: a threshold the
    score can never reach stops after `stopping_round` + 1 folds, a threshold
    of 0 lets all 10 folds run."""
    X, y = make_classification(n_samples=100, random_state=123)
    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    if add_third_class:
        # turn two samples into a third class
        y[0:2] = 2

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        np.random.seed(123)
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        y = y[ii]
        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    if y_string_class:
        y = np.array(["CL_%d" % i for i in y])

    if add_third_class:
        scoring = ["accuracy"]
    else:
        scoring = ["accuracy", "neg_log_loss"]

    if graph_pipeline:
        estimator = GraphPipeline(
            {"pt": DebugPassThrough(), "lg": LogisticRegression(C=1, random_state=123)}, edges=[("pt", "lg")]
        )
    else:
        estimator = LogisticRegression(C=1, random_state=123)

    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        stopping_round=1,
        stopping_threshold=1.01,  # so that accuracy is sure to be below the threshold
    )

    assert isinstance(cv_res, pd.DataFrame)
    # stopped after fold index 1 -> only 2 folds were run
    assert cv_res.shape[0] == 2
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    # early stop -> no out-of-fold prediction is returned
    assert yhat is None

    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        stopping_round=1,
        stopping_threshold=0.0,
    )

    assert isinstance(cv_res, pd.DataFrame)
    # threshold 0 is always reached -> all 10 folds run
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    assert yhat.ndim == 1
    # predictions only contain labels present in y
    assert len(np.setdiff1d(yhat, y)) == 0
def test_cross_validation(add_third_class, x_data_type, y_string_class, shuffle, graph_pipeline, with_groups):
    """Check cross_validation in its four modes: score only, score + proba,
    score + predict, predict only — and that the estimator passed in is never
    fitted in place."""
    X, y = make_classification(n_samples=100, random_state=123)
    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if add_third_class:
        # turn two samples into a third class
        y[0:2] = 2

    if shuffle:
        np.random.seed(123)
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        y = y[ii]
        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    if y_string_class:
        y = np.array(["CL_%d" % i for i in y])

    if add_third_class:
        scoring = ["accuracy"]
    else:
        scoring = ["accuracy", "neg_log_loss"]

    if graph_pipeline:
        estimator = GraphPipeline({"pt": DebugPassThrough(), "lg": LogisticRegression()}, edges=[("pt", "lg")])
    else:
        estimator = LogisticRegression()

    ##################
    ### Only score ###
    ##################
    cv_res = cross_validation(estimator, X, y, groups, cv=10, scoring=scoring, verbose=0)
    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    # the original estimator must not have been fitted in place
    with pytest.raises(NotFittedError):
        estimator.predict(X)

    #####################
    ### Score + Proba ###
    #####################
    cv_res, yhat_proba = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True
    )

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    assert isinstance(yhat_proba, pd.DataFrame)
    if isinstance(X, pd.DataFrame):
        assert (yhat_proba.index == X.index).all()
    # one probability column per class
    assert yhat_proba.shape == (y.shape[0], 2 + 1 * add_third_class)
    assert yhat_proba.min().min() >= 0
    assert yhat_proba.max().max() <= 1
    assert list(yhat_proba.columns) == list(np.sort(np.unique(y)))

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    #######################
    ### Score + Predict ###
    #######################
    cv_res, yhat = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True, method="predict"
    )

    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    for s in scoring:
        assert ("test_" + s) in set(cv_res.columns)
        assert ("train_" + s) in set(cv_res.columns)

    assert yhat.ndim == 1
    # predictions only contain labels present in y
    assert len(np.setdiff1d(yhat, y)) == 0
    assert yhat.shape[0] == y.shape[0]

    with pytest.raises(NotFittedError):
        estimator.predict(X)

    ####################
    ### Predict only ###
    ####################
    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        no_scoring=True,
    )

    assert yhat.shape[0] == y.shape[0]
    assert cv_res is None  # no_scoring=True -> no score DataFrame
    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0

    with pytest.raises(NotFittedError):
        estimator.predict(X)
def verif_encoder_with_data(klass, enc_kwargs, df1, df2, y1, fit_type, additional_conversion_fun, extended_all_types):
    """ verification of the behavior of a transformer on data:
    fit_transform vs fit+transform vs clone vs set_params must all give
    results with consistent shape, type, columns and index, and the
    transformer must reject inputs of the wrong width or wrong type. """
    # Conversion of input into a different type
    df1_conv = convert_generic(df1, output_type=fit_type)
    df2_conv = convert_generic(df2, output_type=fit_type)

    if additional_conversion_fun is not None:
        df1_conv = additional_conversion_fun(df1_conv)
        df2_conv = additional_conversion_fun(df2_conv)

    if y1 is None:
        # also check the call signature without an explicit y
        encoder = klass(**enc_kwargs)
        df1_transformed_a = encoder.fit_transform(df1_conv)  # 1st test without explicitly an y..
        df2_transformed_a = encoder.transform(df2_conv)

    encoder_a = klass(**enc_kwargs)
    params_0 = encoder_a.get_params()

    df1_transformed_a = encoder_a.fit_transform(df1_conv, y=y1)  # Other test with an y (might be None or not)
    df2_transformed_a = encoder_a.transform(df2_conv)

    params_3 = encoder_a.get_params()
    # Rmk : might not be enforced on all transformers
    rec_assert_equal(params_0, params_3)  # Verif that get_params didn't change after fit

    assert df1_transformed_a is not None  # verify that something was created
    assert df2_transformed_a is not None  # verify that something was created

    encoder_cloned = clone(encoder_a)  # Clone again ...

    assert_raise_not_fitted(
        encoder_cloned, df2_conv
    )  # ... and verify that the clone isn't fitted, even if encoder_a is fitted

    # Same thing but using ... fit and then... transformed
    encoder_b = klass(**enc_kwargs)
    encoder_b.fit(df1_conv, y=y1)

    df1_transformed_b = encoder_b.transform(df1_conv)
    df2_transformed_b = encoder_b.transform(df2_conv)

    assert df1_transformed_b is not None
    assert df2_transformed_b is not None

    # Same thing but using clone
    encoder_c = clone(encoder_a)
    df1_transformed_c = encoder_c.fit_transform(df1_conv, y=y1)
    df2_transformed_c = encoder_c.transform(df2_conv)

    # Same thing but using empty class + set_params
    encoder_d = klass()
    encoder_d.set_params(**enc_kwargs)
    df1_transformed_d = encoder_d.fit_transform(df1_conv, y=y1)
    df2_transformed_d = encoder_d.transform(df2_conv)

    # Verif that an error is raised when passed the wrong number of columns
    assert_raise_value_error(encoder_a, gen_slice(df1_conv, slice(1, None)))
    assert_raise_value_error(encoder_b, gen_slice(df1_conv, slice(1, None)))
    assert_raise_value_error(encoder_c, gen_slice(df1_conv, slice(1, None)))
    assert_raise_value_error(encoder_d, gen_slice(df1_conv, slice(1, None)))

    for fit_type2, additional_conversion_fun2 in extended_all_types:
        if fit_type == fit_type2:
            continue

        df1_conv2 = convert_generic(df1_conv, output_type=fit_type2)

        # Verif that if I have a different type than what was present during the fit I'll raise an error
        assert_raise_value_error(encoder_a, df1_conv2)
        assert_raise_value_error(encoder_b, df1_conv2)
        assert_raise_value_error(encoder_c, df1_conv2)
        assert_raise_value_error(encoder_d, df1_conv2)

    # Verif shape
    # Nb of rows ...
    assert df1_transformed_a.shape[0] == df1_conv.shape[0]
    assert df1_transformed_b.shape[0] == df1_conv.shape[0]
    assert df1_transformed_c.shape[0] == df1_conv.shape[0]
    assert df1_transformed_d.shape[0] == df1_conv.shape[0]

    assert df2_transformed_a.shape[0] == df2_conv.shape[0]
    assert df2_transformed_b.shape[0] == df2_conv.shape[0]
    assert df2_transformed_c.shape[0] == df2_conv.shape[0]
    assert df2_transformed_d.shape[0] == df2_conv.shape[0]

    # Nb of columns : all the same
    assert df1_transformed_b.shape[1] == df1_transformed_a.shape[1]
    assert df1_transformed_c.shape[1] == df1_transformed_a.shape[1]
    assert df1_transformed_d.shape[1] == df1_transformed_a.shape[1]

    assert df2_transformed_a.shape[1] == df1_transformed_a.shape[1]
    assert df2_transformed_b.shape[1] == df1_transformed_a.shape[1]
    assert df2_transformed_c.shape[1] == df1_transformed_a.shape[1]
    assert df2_transformed_d.shape[1] == df1_transformed_a.shape[1]

    # Verif type
    assert get_type(df2_transformed_a) == get_type(df1_transformed_a)
    assert get_type(df1_transformed_b) == get_type(df1_transformed_a)
    assert get_type(df2_transformed_b) == get_type(df1_transformed_a)
    assert get_type(df1_transformed_c) == get_type(df1_transformed_a)
    assert get_type(df2_transformed_c) == get_type(df1_transformed_a)
    assert get_type(df1_transformed_d) == get_type(df1_transformed_a)
    assert get_type(df2_transformed_d) == get_type(df1_transformed_a)

    # if 'desired_output_type' present, check output type is what it seems
    if "desired_output_type" in enc_kwargs:
        assert get_type(df1_transformed_a) == enc_kwargs["desired_output_type"]

    if getattr(encoder_a, "desired_output_type", None) is not None:
        assert get_type(df1_transformed_a) == encoder_a.desired_output_type

    # Verif columns
    if get_type(df1_transformed_b) in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
        assert list(df2_transformed_a.columns) == list(df1_transformed_a.columns)
        assert list(df1_transformed_b.columns) == list(df1_transformed_a.columns)
        assert list(df2_transformed_b.columns) == list(df1_transformed_a.columns)
        assert list(df1_transformed_c.columns) == list(df1_transformed_a.columns)
        assert list(df2_transformed_c.columns) == list(df1_transformed_a.columns)
        assert list(df2_transformed_d.columns) == list(df1_transformed_a.columns)
        assert list(df1_transformed_d.columns) == list(df1_transformed_a.columns)

        # feature names advertised by the encoders must match the real columns
        assert encoder_a.get_feature_names() == list(df1_transformed_a.columns)
        assert encoder_b.get_feature_names() == list(df1_transformed_a.columns)
        assert encoder_c.get_feature_names() == list(df1_transformed_a.columns)
        assert encoder_d.get_feature_names() == list(df1_transformed_a.columns)

    # Verif index
    if get_type(df1_transformed_b) in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
        assert (df1_transformed_b.index == df1_transformed_a.index).all()
        assert (df2_transformed_b.index == df2_transformed_a.index).all()
        assert (df1_transformed_c.index == df1_transformed_a.index).all()
        assert (df2_transformed_c.index == df2_transformed_a.index).all()
        assert (df1_transformed_d.index == df1_transformed_a.index).all()
        assert (df2_transformed_d.index == df2_transformed_a.index).all()

        if fit_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            # index of the input must be preserved by the transformation
            assert (df1_transformed_a.index == df1_conv.index).all()
            assert (df2_transformed_a.index == df2_conv.index).all()
def _fit_transform(self, X, y, is_fit, is_transform, fit_params=None):
    """Internal method that handles both ``fit`` and ``transform``.

    The two boolean flags select the behavior:

    * ``is_fit=True, is_transform=True``   -> acts like ``fit_transform``
    * ``is_fit=True, is_transform=False``  -> acts like ``fit``
    * ``is_fit=False, is_transform=True``  -> acts like ``transform``

    Parameters
    ----------
    X : array / DataFrame / sparse structure
        Input data; a column subset is taken via ``self.selector``.
    y : target, passed through to the underlying model(s).
    is_fit : bool
        If True, (re)fit the selector and the wrapped model(s) and store
        the expected input type / nb of columns / column names on ``self``.
    is_transform : bool
        If True, produce and return the transformed result.
    fit_params : dict or None
        Extra keyword arguments forwarded to the wrapped model's
        ``fit`` / ``fit_transform``. Defaults to an empty dict.

    Returns
    -------
    The transformed data when ``is_transform`` is True, otherwise ``self``.

    Raises
    ------
    ValueError
        If ``X`` has 0 rows, or (when transforming) if the input's type,
        number of columns or column names differ from what was seen at fit.
    """
    if fit_params is None:
        fit_params = {}

    if is_fit:
        # "auto" means: let the object pick which columns to work on.
        if isinstance(self.columns_to_use, str) and self.columns_to_use == "auto":
            columns = self._get_default_columns_to_use(X, y)
            self.selector = ColumnsSelector(columns_to_use=columns)
        else:
            self.selector = ColumnsSelector(columns_to_use=self.columns_to_use, regex_match=self.regex_match)

    if hasattr(X, "shape"):
        if X.shape[0] == 0:
            raise ValueError("the X object has 0 rows")

    Xindex = dsh._get_index(X)  # if X has an index retrieve it

    # if self.columns_to_use is not None:
    if is_fit:
        Xsubset = self.selector.fit_transform(X)
    else:
        Xsubset = self.selector.transform(X)

    # TODO (maybe): here allow a preprocessing pipeline
    # if self.has_preprocessing:
    #     if is_fit:
    #         self.preprocessing = self._get_preprocessing()
    #         Xsubset = self.preprocessing.fit_transform(Xsubset)
    #     else:
    #         Xsubset = self.preprocessing.transform(Xsubset)

    # Store columns and shape BEFORE any modification, so that the
    # feature-name reconstruction below sees the original input layout.
    if self.selector is not None:
        Xsubset_columns = self.selector.get_feature_names()
    else:
        raise NotImplementedError("should not go there anymore")
        # Xsubset_columns = getattr(Xsubset, "columns", None)
    Xsubset_shape = getattr(Xsubset, "shape", None)

    # TODO: here use, one way or another, something along the lines of
    # https://github.com/scikit-learn/scikit-learn/issues/6425

    if is_fit:
        # Remember the input signature so transform can validate against it.
        self._expected_type = dsh.get_type(Xsubset)
        self._expected_nbcols = dsh._nbcols(Xsubset)
        self._expected_columns = dsh._get_columns(Xsubset)
    else:
        # Validate that transform-time input matches what was seen at fit.
        Xtype = dsh.get_type(Xsubset)
        if Xtype != self._expected_type:
            raise ValueError(
                "I don't have the correct type as input, expected : %s, got : %s" % (self._expected_type, Xtype)
            )

        nbcols = dsh._nbcols(Xsubset)
        if nbcols != self._expected_nbcols:
            raise ValueError(
                "I don't have the correct nb of colmns as input, expected : %d, got : %d"
                % (self._expected_nbcols, nbcols)
            )

        columns = dsh._get_columns(Xsubset)
        expected_columns = getattr(self, "_expected_columns", None)  # to allow pickle compatibility
        if expected_columns is not None and columns is not None and columns != self._expected_columns:
            raise ValueError("I don't have the correct names of columns")

    # Convert to the first accepted input type if the current one isn't accepted.
    if self.accepted_input_types is not None and self._expected_type not in self.accepted_input_types:
        Xsubset = dsh.convert_generic(
            Xsubset, mapped_type=self._expected_type, output_type=self.accepted_input_types[0]
        )

    if is_fit:
        self._verif_params()
        # _empty_data is True when the selected subset has zero columns;
        # in that case the wrapped model is bypassed and data passes through.
        self._empty_data = False
        s = getattr(Xsubset, "shape", None)
        if s is not None and len(s) > 1 and s[1] == 0:
            self._empty_data = True

    if self.all_columns_at_once or self._empty_data:
        if is_fit:
            self._model = self._get_model(Xsubset, y)

        ##############################################
        ### Apply the model on ALL columns at ONCE ###
        ##############################################

        if self.work_on_one_column_only:
            Xsubset = dsh.make1dimension(Xsubset)  # will generate an error if 2 dimensions
        else:
            Xsubset = dsh.make2dimensions(Xsubset)

        # Call to underlying model
        Xres = None
        if is_fit and is_transform:
            ##############################
            ###  fit_transform method  ###
            ##############################

            # test if the the data to transform actually has some columns
            if not self._empty_data:
                # normal case
                Xres = self._model.fit_transform(Xsubset, y, **fit_params)
            else:
                # It means there is no columns to transform
                Xres = Xsubset  # don't do anything

        elif is_fit and not is_transform:
            ####################
            ###  fit method  ###
            ####################
            # Some models only expose feature names after a transform,
            # hence the fit_transform call even though only fit was asked.
            if self.must_transform_to_get_features_name:
                Xres = self._model.fit_transform(Xsubset, y, **fit_params)
            else:
                self._model.fit(Xsubset, y, **fit_params)
        else:
            ####################
            ###   transform  ###
            ####################
            if not self._empty_data:
                Xres = self._model.transform(Xsubset)
            else:
                Xres = Xsubset

        if is_fit:
            self._columns_informations = {
                "output_columns": getattr(Xres, "columns", None),  # names of transformed columns if exist
                "output_shape": getattr(Xres, "shape", None),  # shape of transformed result if exist
                "input_columns": Xsubset_columns,  # name of input columns
                "input_shape": Xsubset_shape,  # shape of input data
            }

            self._feature_names_for_transform = self.try_to_find_feature_names_all_at_once(
                output_columns=self._columns_informations["output_columns"],
                output_shape=self._columns_informations["output_shape"],
                input_columns=self._columns_informations["input_columns"],
                input_shape=self._columns_informations["input_shape"],
            )

            # self.kept_features_names = None  # for now

        if is_transform:
            # Convert to the desired output type and restore the original index.
            Xres = dsh.convert_generic(Xres, output_type=self.desired_output_type)
            Xres = dsh._set_index(Xres, Xindex)

    else:
        ########################################
        ### Apply the model COLUMN BY COLUMN ###
        ########################################
        if is_fit:
            self._models = []  # one fitted sub-model per column

        # all_Xres collects per-column results only when they are needed
        # (either to build the output or to recover feature names).
        if is_transform or self.must_transform_to_get_features_name:
            all_Xres = []
        else:
            all_Xres = None

        Xsubset = dsh.make2dimensions(Xsubset)
        for j in range(self._expected_nbcols):
            # Column extraction differs between pandas-like and array-like inputs.
            if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame, DataTypes.Serie):
                Xsubset_j = Xsubset.iloc[:, j]
            else:
                Xsubset_j = Xsubset[:, j]

            if is_fit:
                sub_model = self._get_model(Xsubset, y)
                self._models.append(sub_model)
            else:
                sub_model = self._models[j]

            if not self.work_on_one_column_only:
                Xsubset_j = dsh.make2dimensions(Xsubset_j)

            if is_fit and is_transform:
                # fit_transform method
                Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                all_Xres.append(Xres_j)
            elif is_fit and not is_transform:
                # fit method
                if self.must_transform_to_get_features_name:
                    Xres_j = sub_model.fit_transform(Xsubset_j, y, **fit_params)
                    all_Xres.append(Xres_j)
                else:
                    sub_model.fit(Xsubset_j, y, **fit_params)
            elif is_transform:
                # transform method
                Xres_j = sub_model.transform(Xsubset_j)
                all_Xres.append(Xres_j)

        if is_fit:
            self._columns_informations = {
                "all_output_columns": None
                if all_Xres is None
                else [getattr(Xres, "columns", None) for Xres in all_Xres],
                "all_output_shape": None
                if all_Xres is None
                else [getattr(Xres, "shape", None) for Xres in all_Xres],
                "input_columns": Xsubset_columns,  # name of input columns
                "input_shape": Xsubset_shape,  # shape of input data
            }

            self._feature_names_for_transform = list(
                self.try_to_find_feature_names_separate(
                    all_output_columns=self._columns_informations["all_output_columns"],
                    all_output_shape=self._columns_informations["all_output_shape"],
                    input_columns=self._columns_informations["input_columns"],
                    input_shape=self._columns_informations["input_shape"],
                )
            )

            # self.kept_features_names = None  # for now

        if is_transform:
            # Stack the per-column results horizontally and restore the index.
            Xres = dsh.generic_hstack(all_Xres, output_type=self.desired_output_type)
            Xres = dsh._set_index(Xres, Xindex)

    if is_transform:
        if self._feature_names_for_transform is not None:
            ### NOTE(review): original comment said this doesn't work in transform — to investigate
            Xres = dsh._set_columns(Xres, self._feature_names_for_transform)

    if is_transform:
        return Xres
    else:
        return self
def check_no_null(df_transformed, df=None):
    """Assert that the transformed result contains no missing values.

    The result is first converted to a plain DataFrame so that the null
    check works whatever the container type; ``df`` (the original input)
    is accepted for signature compatibility with sibling checkers but is
    not used.
    """
    as_dataframe = convert_generic(df_transformed, output_type=DataTypes.DataFrame)
    total_nulls = as_dataframe.isnull().sum().sum()
    assert total_nulls == 0