def test_cross_validation_with_scorer_object_regressor():
    """A scorer object and its string alias must give the same regression CV scores."""
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = np.random.randn(100)

    frames = []
    # First the scorer object itself, then the equivalent scorer name.
    for scoring in (SCORERS["neg_mean_absolute_error"], "neg_mean_absolute_error"):
        forest = RandomForestRegressor(n_estimators=10, random_state=123)
        frame = cross_validation(forest, X, y, scoring=scoring, cv=10)
        assert frame.shape[0] == 10
        assert isinstance(frame, pd.DataFrame)
        frames.append(frame)

    by_object, by_name = frames
    tolerance = 10 ** (-5)
    # The first two columns must agree between both runs.
    for position in (0, 1):
        gap = np.abs(by_object.iloc[:, position] - by_name.iloc[:, position])
        assert gap.max() <= tolerance
def test_cross_validation_regressor_multi_output(cast_data_frame):
    """Cross-validate a regressor on a two-column target (array or DataFrame)."""
    estimator = RandomForestRegressor(n_estimators=10, random_state=123)
    X, y = make_regression(n_samples=10)

    # Duplicate the target so that it has two output columns.
    y_column = y.reshape((-1, 1))
    yd2 = np.concatenate((y_column, y_column), axis=1)
    if cast_data_frame:
        yd2 = pd.DataFrame(yd2)

    def check_scores(frame):
        assert frame.shape[0] == 2
        assert isinstance(frame, pd.DataFrame)
        assert "test_r2" in frame.columns
        assert "train_r2" in frame.columns

    # Scores only
    check_scores(cross_validation(estimator, X, yd2, cv=2, scoring="r2"))

    # Scores + out-of-fold predictions
    cv_res, yhat = cross_validation(
        estimator, X, yd2, cv=2, scoring="r2", return_predict=True, method="predict"
    )
    check_scores(cv_res)
    assert isinstance(yhat, np.ndarray)
    assert yhat.shape == yd2.shape
def test_cross_validation_with_max_proba_accuracy():
    """Cross-validate with a group-based probability scorer, passed bare and inside a dict."""
    np.random.seed(123)
    cv = GroupKFold(n_splits=4)
    max_proba_scorer = _GroupProbaScorer(score_func=max_proba_group_accuracy, sign=1, kwargs={})

    X = np.random.randn(100, 10)
    y = 1 * (np.random.randn(100) > 0)
    # Four groups of 25 consecutive observations each.
    groups = np.array([label for label in range(4) for _ in range(25)])

    estimator = LogisticRegression(solver="lbfgs", random_state=123)

    # Scorer given directly
    cv_res = cross_validation(estimator, X, y, groups, scoring=max_proba_scorer, cv=cv)
    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape == (4, 6)

    # Scorer given through a named dictionary
    cv_res = cross_validation(estimator, X, y, groups, scoring={"mp_acc": max_proba_scorer}, cv=cv)
    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape == (4, 6)

    assert "train_mp_acc" in cv_res.columns
    assert "test_mp_acc" in cv_res.columns

    # The scores must stay within [0, 1].
    for column in ("train_mp_acc", "test_mp_acc"):
        assert cv_res[column].max() <= 1
        assert cv_res[column].min() >= 0
def test_approx_cross_validation_dummy(approximate_cv):
    """Predictions of ``DummyModel`` must match the first column of X, with or without scoring."""
    X, y = make_classification(n_samples=100, random_state=123)
    tolerance = 10 ** (-5)

    for skip_scoring in (True, False):
        estimator = DummyModel()
        cv_res, yhat = cross_validation(
            estimator,
            X,
            y,
            cv=10,
            no_scoring=skip_scoring,
            return_predict=True,
            method="predict",
            approximate_cv=approximate_cv,
        )
        assert yhat.ndim == 1
        assert np.abs(yhat - X[:, 0]).max() <= tolerance
def test_approx_cross_validation_fit_params(approximate_cv):
    """``fit_params`` must reach the estimator: the call fails without them and works with them."""
    X, y = make_classification(n_samples=100, random_state=123)
    estimator = DummyModelCheckFitParams()

    shared_options = dict(
        cv=10,
        no_scoring=True,
        return_predict=True,
        method="predict",
        approximate_cv=approximate_cv,
    )

    # Without fit_params an AssertionError is expected.
    with pytest.raises(AssertionError):
        cv_res, yhat = cross_validation(estimator, X, y, **shared_options)

    # With fit_params the call must go through.
    cv_res, yhat = cross_validation(estimator, X, y, fit_params={"param": "value"}, **shared_options)
def test_approx_cross_validation_raise_error(approximate_cv):
    """Inconsistent requests to ``cross_validation`` must raise the expected errors."""
    X, y = make_classification(n_samples=100, random_state=123)
    shared_options = dict(cv=10, no_scoring=True, approximate_cv=approximate_cv)

    # no_scoring = True AND return_predict = False => nothing to do ... error
    estimator = DummyModel()
    with pytest.raises(ValueError):
        cv_res, yhat = cross_validation(
            estimator, X, y, return_predict=False, method="predict", **shared_options
        )

    # Requesting method="transform" on a DummyModel must raise an AttributeError
    # (presumably because DummyModel has no `transform` -- TODO confirm).
    estimator = DummyModel()
    with pytest.raises(AttributeError):
        cv_res, yhat = cross_validation(
            estimator, X, y, return_predict=True, method="transform", **shared_options
        )
def test_approx_cross_validation_pass_kwargs():
    """Extra keyword arguments must be forwarded when ``approximate_cv=True``."""
    X, y = make_classification(n_samples=100, random_state=123)
    estimator = DummyModelWithApprox(check_kwargs=True)

    shared_options = dict(
        cv=10,
        no_scoring=True,
        return_predict=True,
        method="predict",
        fit_params={"param": "value"},
        approximate_cv=True,
    )

    # Error because the extra kwargs are not passed.
    with pytest.raises(AssertionError):
        cv_res, yhat = cross_validation(estimator, X, y, **shared_options)

    # Same call with the extra keyword argument: must succeed.
    cv_res, yhat = cross_validation(estimator, X, y, kwargs_param="kwargs_value", **shared_options)
def test_cross_validation_with_scorer_object_classifier():
    """A scorer object and its string alias must give the same classification CV scores.

    Checked for both "accuracy" and "neg_log_loss": the first two columns of
    the two result frames must agree up to 1e-5.
    """
    # Seed the global RNG so that the test data is reproducible
    # (the regressor counterpart of this test does the same).
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = np.array(["A"] * 33 + ["B"] * 33 + ["C"] * 34)

    forest = RandomForestClassifier(n_estimators=10, random_state=123)

    for metric in ("accuracy", "neg_log_loss"):
        # Scorer passed as an object
        result1 = cross_validation(forest, X, y, scoring=SCORERS[metric], cv=10)
        assert result1.shape[0] == 10
        assert isinstance(result1, pd.DataFrame)

        # Scorer passed by name
        result2 = cross_validation(forest, X, y, scoring=metric, cv=10)
        assert result2.shape[0] == 10
        assert isinstance(result2, pd.DataFrame)

        assert np.abs(result1.iloc[:, 0] - result2.iloc[:, 0]).max() <= 10 ** (-5)
        assert np.abs(result1.iloc[:, 1] - result2.iloc[:, 1]).max() <= 10 ** (-5)
def test_approx_cross_validation_transformer(x_data_type, shuffle, graph_pipeline, with_groups):
    """Cross-validate a pass-through transformer and check the output equals the input.

    The estimator is either a bare ``DebugPassThrough`` or a two-node
    ``GraphPipeline`` made of ``DebugPassThrough`` nodes. Scoring it must
    fail; asking only for the cross-validated transformation must return an
    object of the same type, shape (and index/columns for DataFrames) as X,
    with values equal to X up to 1e-10.
    """
    if graph_pipeline:
        estimator = GraphPipeline({"ptA": DebugPassThrough(), "ptB": DebugPassThrough()}, edges=[("ptA", "ptB")])
    else:
        estimator = DebugPassThrough()

    X, y = make_classification(n_samples=100, random_state=123)
    if with_groups:
        groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25)
    else:
        groups = None

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        np.random.seed(123)
        ii = np.arange(X.shape[0])
        np.random.shuffle(ii)
        y = y[ii]

        if isinstance(X, pd.DataFrame):
            X = X.loc[ii, :]
        else:
            X = X[ii, :]

    scoring = ["accuracy", "neg_log_loss"]

    ##################
    ### Score only ###
    ##################
    # shouldn't work since DebugPassThrough can't be scored
    with pytest.raises(Exception):
        cross_validation(estimator, X, y, groups, cv=10, scoring=scoring, verbose=0)

    #################
    ### Transform ###
    #################
    cv_res, Xhat = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True, no_scoring=True
    )

    # Exact type match is intended here (not a subclass check).
    assert type(Xhat) is type(X)
    assert cv_res is None
    assert Xhat.shape == X.shape

    tolerance = 10 ** (-10)
    if isinstance(X, pd.DataFrame):
        assert (Xhat.index == X.index).all()
        assert (Xhat.columns == X.columns).all()
        # Same tolerance as the non-DataFrame branch (was `10 ** (10 - 10)`, i.e. 1).
        assert np.abs(Xhat - X).max().max() <= tolerance
    else:
        assert np.max(np.abs(Xhat - X)) <= tolerance
def test_cross_validation_classifier_multi_output(add_third_class, cast_data_frame, cast_string):
    """Cross-validate a classifier on a two-column target: scores, predictions and probabilities."""
    estimator = RandomForestClassifier(n_estimators=10, random_state=123)
    X, y = make_classification(n_samples=10)

    # Build a two-output target from the same labels.
    y_column = y.reshape((-1, 1))
    yd2 = np.concatenate((y_column, y_column), axis=1)

    if add_third_class:
        yd2[0, 1] = 2

    if cast_string:
        yd2 = yd2.astype("str").astype("object")
        yd2[:, 0] = "cl_a_" + yd2[:, 0]
        yd2[:, 1] = "cl_b_" + yd2[:, 1]

    if cast_data_frame:
        yd2 = pd.DataFrame(yd2)

    def check_scores(frame):
        assert frame.shape[0] == 3
        assert isinstance(frame, pd.DataFrame)
        assert "test_log_loss_patched" in frame.columns
        assert "train_log_loss_patched" in frame.columns

    # Scores only
    check_scores(cross_validation(estimator, X, yd2, cv=3, scoring="log_loss_patched"))

    # Scores + predicted classes
    cv_res, yhat = cross_validation(
        estimator, X, yd2, cv=3, scoring="log_loss_patched", return_predict=True, method="predict"
    )
    check_scores(cv_res)
    assert isinstance(yhat, np.ndarray)
    assert yhat.shape == yd2.shape

    # Scores + predicted probabilities: one frame per output column
    cv_res, yhat_proba = cross_validation(
        estimator, X, yd2, cv=3, scoring="log_loss_patched", return_predict=True, method="predict_proba"
    )
    check_scores(cv_res)
    assert isinstance(yhat_proba, list)
    assert len(yhat_proba) == 2

    for j, proba in enumerate(yhat_proba):
        # Only the second output can hold a third class.
        expected_width = 2 + 1 * (j == 1) * (add_third_class)
        assert proba.shape == (yd2.shape[0], expected_width)
        assert (proba.sum(axis=1) - 1).abs().max() <= 10 ** (-10)
        assert isinstance(proba, pd.DataFrame)
        assert proba.min().min() >= 0
        assert proba.max().max() <= 1

        target_column = yd2.iloc[:, j] if cast_data_frame else yd2[:, j]
        assert list(proba.columns) == list(np.sort(np.unique(target_column)))
def test_cross_validation_few_sample_per_classes(with_groups):
    """Probabilities must have one column per class even when a class has a single sample."""
    np.random.seed(123)
    X = np.random.randn(100, 2)
    # "DD" appears only once.
    y = np.array(["AA"] * 33 + ["BB"] * 33 + ["CC"] * 33 + ["DD"])

    groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25) if with_groups else None

    splitter = StratifiedKFold(n_splits=10)
    logit = LogisticRegression()

    _, yhat_proba = cross_validation(
        logit, X, y, groups=groups, cv=splitter, return_predict=True, no_scoring=True
    )

    row_max = yhat_proba.max(axis=1)
    assert (row_max > 0).all()
    assert yhat_proba.shape == (100, 4)
    assert list(yhat_proba.columns) == ["AA", "BB", "CC", "DD"]
def test_approx_cross_validation_cv(approximate_cv):
    """Cross-validated transform of a pass-through with an explicit CV splitter object."""
    X, y = make_classification()
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=123)
    estimator = DebugPassThrough()

    options = dict(
        groups=None,
        cv=folds,
        verbose=1,
        fit_params={},
        return_predict=True,
        method="transform",
        no_scoring=True,
        stopping_round=None,
        stopping_threshold=None,
        approximate_cv=approximate_cv,
    )
    cv_res, yhat = cross_validation(estimator, X, y, **options)

    # No scoring requested => no score frame, only the transformed data.
    assert cv_res is None
    assert yhat.ndim == 2
    assert yhat.shape == X.shape
def test_approx_cross_validation_pass_to_method():
    """With ``approximate_cv=True`` the call must be handled by ``DummyModelWithApprox``.

    The assertions rely on what the dummy model returns: predictions equal
    to the second column of X, and (when scoring is requested) a result
    containing a "scoring" entry.
    """
    X, y = make_classification(n_samples=100, random_state=123)

    # Predictions only
    estimator = DummyModelWithApprox()
    cv_res, yhat = cross_validation(
        estimator, X, y, cv=10, no_scoring=True, return_predict=True, method="predict", approximate_cv=True
    )
    assert cv_res is None
    assert yhat.ndim == 1
    assert np.abs(yhat - X[:, 1]).max() <= 10 ** (-5)

    # Scores + predictions
    estimator = DummyModelWithApprox()
    cv_res, yhat = cross_validation(
        estimator, X, y, cv=10, no_scoring=False, return_predict=True, method="predict", approximate_cv=True
    )
    assert cv_res is not None
    assert "scoring" in cv_res
    assert yhat.ndim == 1
    assert np.abs(yhat - X[:, 1]).max() <= 10 ** (-5)

    # Scores only: no prediction is returned here, so nothing is asserted on
    # `yhat` (it would only re-check the stale value from the previous call).
    estimator = DummyModelWithApprox()
    cv_res = cross_validation(
        estimator, X, y, cv=10, no_scoring=False, return_predict=False, method="predict", approximate_cv=True
    )
    assert cv_res is not None
    assert "scoring" in cv_res

    # Scores only, with an explicit scoring list that must be passed through
    estimator = DummyModelWithApprox()
    cv_res = cross_validation(
        estimator,
        X,
        y,
        cv=10,
        scoring=["neg_mean_squared_error"],
        no_scoring=False,
        return_predict=False,
        method="predict",
        approximate_cv=True,
    )
    assert cv_res is not None
    assert "scoring" in cv_res
    assert cv_res["scoring"] == ["neg_mean_squared_error"]
def test_cross_validation_time_serie_split():
    """With a ``TimeSeriesSplit`` the scores are returned but the predictions are ``None``."""
    X, y = make_classification(n_samples=100, random_state=123)
    splitter = TimeSeriesSplit(n_splits=10)
    model = RandomForestClassifier(n_estimators=10, random_state=123)

    scores, predictions = cross_validation(model, X, y, cv=splitter, return_predict=True)

    # Predictions can't be returned with this splitter.
    assert predictions is None
    assert len(scores) == 10
    assert isinstance(scores, pd.DataFrame)
def approx_cross_validation(
    self,
    X,
    y,
    groups=None,
    scoring=None,
    cv=None,
    verbose=1,
    fit_params=None,
    return_predict=False,
    method=None,
    no_scoring=False,
    stopping_round=None,
    stopping_threshold=None,
    _save_outsample_predict=False,
    _use_saved_outsample_predict=False,
):
    """Cross-validate the blender of the stacker.

    The folds used to cross-validate the blender are the SAME as the ones
    used to generate the out-of-sample predictions: a single ``cv`` object
    is created and given both to ``get_outsample`` and to
    ``cross_validation``.

    Parameters
    ----------
    X, y : data and target given to ``get_outsample``; ``y`` is also the
        target used to cross-validate the blender.
    groups : optional groups, forwarded to ``get_outsample`` AND to the
        cross-validation of the blender, so that group-based splitters see
        the same groups in both steps.
    scoring, verbose, fit_params, return_predict, method, no_scoring,
    stopping_round, stopping_threshold : forwarded unchanged to
        ``cross_validation`` of the blender.
    cv : anything accepted by ``create_cv``.
    _save_outsample_predict : if True, the freshly computed out-of-sample
        predictions are stored in ``self.all_yhat_pred``.
    _use_saved_outsample_predict : if True, ``self.all_yhat_pred`` is reused
        instead of calling ``get_outsample``.

    Returns
    -------
    Whatever ``cross_validation`` returns for the blender.
    """
    cv = create_cv(cv, y, classifier=self._is_classifier, shuffle=True, random_state=self.random_state)

    if _use_saved_outsample_predict:
        all_yhat_pred = self.all_yhat_pred
    else:
        all_yhat_pred = self.get_outsample(X, y, method=self._method, groups=groups, cv=cv)
        if _save_outsample_predict:
            self.all_yhat_pred = all_yhat_pred

    # `groups` must be forwarded here as well: otherwise a group-based `cv`
    # cannot produce the same folds as the ones used by `get_outsample`.
    return cross_validation(
        self.blender,
        all_yhat_pred,
        y,
        groups=groups,
        scoring=scoring,
        cv=cv,
        verbose=verbose,
        fit_params=fit_params,
        return_predict=return_predict,
        method=method,
        no_scoring=no_scoring,
        stopping_round=stopping_round,
        stopping_threshold=stopping_threshold,
    )
def test_cross_validation_passing_of_groups():
    """The ``groups`` argument must be made available to the estimator."""
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = np.random.randn(100)
    groups = np.random.randint(0, 20, size=100)

    estimator = TransformerFailNoGroups()
    scores, transformed = cross_validation(
        estimator, X, y, groups, cv=10, no_scoring=True, return_predict=True
    )

    # Reaching this point means it didn't fail: the estimator had access to the groups.
    assert scores is None
    assert (transformed == X).all()
def test_RandomTrainTestCv_fail_with_cross_val_predict():
    """``cross_val_predict`` rejects ``RandomTrainTestCv`` while ``cross_validation`` returns Nones."""
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = np.random.randn(100)

    splitter = RandomTrainTestCv(test_size=0.1, random_state=123)
    estimator = DecisionTreeRegressor(max_depth=2, random_state=123)

    # cross_val_predict raises with this splitter ...
    with pytest.raises(ValueError):
        cross_val_predict(estimator, X, y, cv=splitter)

    # ... whereas cross_validation returns a pair of None instead of failing.
    outcome = cross_validation(estimator, X, y, cv=splitter, no_scoring=True, return_predict=True)
    assert outcome == (None, None)
def test_cross_validation_sample_weight():
    """``cross_validation`` must run both with and without ``sample_weight`` in ``fit_params``."""
    X, y = make_classification(n_samples=100, random_state=123)
    sample_weight = np.ones(y.shape[0])

    # (keyword arguments for the direct fit, extra keyword arguments for cross_validation)
    scenarios = [
        ({"sample_weight": sample_weight}, {"fit_params": {"sample_weight": sample_weight}}),
        ({}, {}),
    ]

    for fit_kwargs, cv_kwargs in scenarios:
        estimator = DummyModelCheckSampleWeight()
        estimator.fit(X, y, **fit_kwargs)

        cv_res, yhat = cross_validation(
            estimator, X, y, cv=10, no_scoring=True, return_predict=True, method="predict", **cv_kwargs
        )

        # I just need to check that it works
        assert yhat.shape[0] == y.shape[0]
def approx_cross_validation(self, X, y, groups=None, scoring=None, cv=None, verbose=1, fit_params=None,
                            return_predict=False, method=None, no_scoring=False, stopping_round=None,
                            stopping_threshold=None, nodes_not_to_crossvalidate=None, **kwargs):
    """Approximate cross-validation of the pipeline.

    Everything that can be pre-computed node by node is delegated to
    ``_approx_cross_validation_pre_calculation``. If that step already
    produced the final result it is returned; otherwise a new pipeline is
    built from the remaining nodes and a classical ``cross_validation`` is
    run on it.

    Parameters
    ----------
    X, y, groups : data, target and optional groups.
    scoring, return_predict, method, no_scoring, stopping_round,
    stopping_threshold : forwarded to the pre-calculation step and to the
        final ``cross_validation``.
    cv : anything accepted by ``create_cv``.
    verbose : verbosity; it temporarily replaces ``self.verbose`` for the
        duration of the call.
    fit_params : dict or None, keys of the form ``"<node>__<param>"``
        (split on the first ``"__"``).
    nodes_not_to_crossvalidate : collection of node names or None
        (None is treated as an empty set).
    **kwargs : extra arguments, keys of the form ``"<node>__<param>"``.

    Returns
    -------
    The result of the pre-calculation step when it is finished, otherwise
    the result of ``cross_validation`` on the remaining sub-pipeline.
    """
    ###################
    ### Preparation ###
    ###################
    _orig_verbose = self.verbose
    self.verbose = verbose

    # try/finally: `self.verbose` must be restored even if something raises,
    # otherwise the estimator would keep the temporary verbosity.
    try:
        self._complete_init()

        if nodes_not_to_crossvalidate is None:
            nodes_not_to_crossvalidate = set()

        #################################################################
        ### Prepare the list of nodes that can't be 'cv_transformed' ####
        #################################################################
        nodes_cant_cv_transform = set()
        for node, m in self._models.items():
            cant = True
            if hasattr(m, "can_cv_transform"):
                if m.can_cv_transform():
                    cant = False

            if cant:
                nodes_cant_cv_transform.add(node)

        # verif (this loop was written twice; one pass is enough)
        for node in nodes_cant_cv_transform:
            if node not in self._models:
                raise ValueError(
                    "the node (within nodes_cant_cv_transform) %s isn't in the node of the model" % node)

        cv = create_cv(
            cv,
            y,
            classifier=sklearn.model_selection._validation.is_classifier(self),
            shuffle=True,
            random_state=123)

        # Split fit_params into a 'step-by-step' dictionary
        fit_params_step = {name: {} for name in self.complete_graph.nodes}
        if fit_params is not None:
            for key, value in fit_params.items():
                step, param = key.split("__", 1)
                fit_params_step[step][param] = value

        # Same split for the extra keyword arguments
        kwargs_step = {name: {} for name in self.complete_graph.nodes}
        if kwargs:
            for key, value in kwargs.items():
                step, param = key.split("__", 1)
                kwargs_step[step][param] = value

        ################################
        ### Pre-calculate everything ###
        ################################
        is_finished, data_dico, result = self._approx_cross_validation_pre_calculation(
            X=X,
            y=y,
            groups=groups,
            scoring=scoring,
            cv=cv,
            verbose=verbose,
            fit_params_step=fit_params_step,
            return_predict=return_predict,
            method=method,
            no_scoring=no_scoring,
            stopping_round=stopping_round,
            stopping_threshold=stopping_threshold,
            nodes_not_to_crossvalidate=nodes_not_to_crossvalidate,
            nodes_cant_cv_transform=nodes_cant_cv_transform,
            kwargs_step=kwargs_step,
        )

        if is_finished:
            if verbose:
                print("CV is finished")
            return result

        ###########################################################
        ### Create a new graphpipeline with the remaining nodes ###
        ###########################################################
        new_graph_pipeline, new_data_dtm = self._approx_cross_validation_create_sub_graph_pipeline(
            data_dico, X)

        if verbose:
            print("here is a new GraphPipeline")
            print(new_graph_pipeline)
            print("")
            print("new_data_dtm")
            print(type(new_data_dtm))

        ############################################################################
        ### Now do a 'classical cross-validation' on the remaining GraphPipeline ###
        ############################################################################
        return cross_validation(
            new_graph_pipeline,
            new_data_dtm,
            y,
            groups=groups,
            scoring=scoring,
            cv=cv,
            verbose=verbose,
            fit_params=fit_params,
            return_predict=return_predict,
            method=method,
            no_scoring=no_scoring,
            stopping_round=stopping_round,
            stopping_threshold=stopping_threshold,
            approximate_cv=False,
            **kwargs)
    finally:
        self.verbose = _orig_verbose
def test_approx_cross_validation_early_stop(
    add_third_class, x_data_type, y_string_class, shuffle, graph_pipeline, with_groups
):
    """Check ``stopping_round`` / ``stopping_threshold``: early stop versus full run."""
    X, y = make_classification(n_samples=100, random_state=123)

    groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25) if with_groups else None

    if add_third_class:
        y[0:2] = 2

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if shuffle:
        np.random.seed(123)
        order = np.arange(X.shape[0])
        np.random.shuffle(order)
        y = y[order]
        X = X.loc[order, :] if isinstance(X, pd.DataFrame) else X[order, :]

    if y_string_class:
        y = np.array(["CL_%d" % i for i in y])

    scoring = ["accuracy"] if add_third_class else ["accuracy", "neg_log_loss"]

    if graph_pipeline:
        estimator = GraphPipeline(
            {"pt": DebugPassThrough(), "lg": LogisticRegression(C=1, random_state=123)}, edges=[("pt", "lg")]
        )
    else:
        estimator = LogisticRegression(C=1, random_state=123)

    def check_score_columns(frame):
        names = set(frame.columns)
        for s in scoring:
            assert ("test_" + s) in names
            assert ("train_" + s) in names

    # Threshold of 1.01: the accuracy is sure to be below => the CV stops early.
    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        stopping_round=1,
        stopping_threshold=1.01,
    )
    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 2
    check_score_columns(cv_res)
    assert yhat is None

    # Threshold of 0.0: all 10 folds are run and predictions are returned.
    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        stopping_round=1,
        stopping_threshold=0.0,
    )
    assert isinstance(cv_res, pd.DataFrame)
    assert cv_res.shape[0] == 10
    check_score_columns(cv_res)
    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0
def test_cross_validation(add_third_class, x_data_type, y_string_class, shuffle, graph_pipeline, with_groups):
    """Check ``cross_validation`` on a classifier: scores, probabilities and predictions.

    After each call the original estimator must still be unfitted.
    """
    X, y = make_classification(n_samples=100, random_state=123)

    groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25) if with_groups else None

    X = convert_generic(X, output_type=x_data_type)
    if x_data_type == DataTypes.DataFrame:
        X.columns = ["col_%d" % i for i in range(X.shape[1])]

    if add_third_class:
        y[0:2] = 2

    if shuffle:
        np.random.seed(123)
        order = np.arange(X.shape[0])
        np.random.shuffle(order)
        y = y[order]
        X = X.loc[order, :] if isinstance(X, pd.DataFrame) else X[order, :]

    if y_string_class:
        y = np.array(["CL_%d" % i for i in y])

    scoring = ["accuracy"] if add_third_class else ["accuracy", "neg_log_loss"]

    if graph_pipeline:
        estimator = GraphPipeline({"pt": DebugPassThrough(), "lg": LogisticRegression()}, edges=[("pt", "lg")])
    else:
        estimator = LogisticRegression()

    def check_scores(frame):
        assert isinstance(frame, pd.DataFrame)
        assert frame.shape[0] == 10
        names = set(frame.columns)
        for s in scoring:
            assert ("test_" + s) in names
            assert ("train_" + s) in names

    def check_still_unfitted():
        with pytest.raises(NotFittedError):
            estimator.predict(X)

    ##################
    ### Only score ###
    ##################
    cv_res = cross_validation(estimator, X, y, groups, cv=10, scoring=scoring, verbose=0)
    check_scores(cv_res)
    check_still_unfitted()

    #####################
    ### Score + Proba ###
    #####################
    cv_res, yhat_proba = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True
    )
    check_scores(cv_res)

    assert isinstance(yhat_proba, pd.DataFrame)
    if isinstance(X, pd.DataFrame):
        assert (yhat_proba.index == X.index).all()

    assert yhat_proba.shape == (y.shape[0], 2 + 1 * add_third_class)
    assert yhat_proba.min().min() >= 0
    assert yhat_proba.max().max() <= 1
    assert list(yhat_proba.columns) == list(np.sort(np.unique(y)))
    check_still_unfitted()

    #######################
    ### Score + Predict ###
    #######################
    cv_res, yhat = cross_validation(
        estimator, X, y, groups, cv=10, scoring=scoring, verbose=0, return_predict=True, method="predict"
    )
    check_scores(cv_res)

    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0
    assert yhat.shape[0] == y.shape[0]
    check_still_unfitted()

    ####################
    ### Predict only ###
    ####################
    cv_res, yhat = cross_validation(
        estimator,
        X,
        y,
        groups,
        cv=10,
        scoring=scoring,
        verbose=0,
        return_predict=True,
        method="predict",
        no_scoring=True,
    )
    assert yhat.shape[0] == y.shape[0]
    assert cv_res is None
    assert yhat.ndim == 1
    assert len(np.setdiff1d(yhat, y)) == 0
    check_still_unfitted()
def _approx_cross_validation_pre_calculation(
    self,
    X,
    y,
    groups,
    scoring,
    cv,
    verbose,
    fit_params_step,
    return_predict,
    method,
    no_scoring,
    stopping_round,
    stopping_threshold,
    nodes_not_to_crossvalidate,
    nodes_cant_cv_transform,
    kwargs_step,
):
    """Walk through the nodes of the pipeline and pre-compute everything that can be.

    Returns a triple ``(is_finished, data_dico, result)``:
      * ``(True, data_dico, result)`` when the terminal node was reached with
        available input data: ``result`` is what ``cross_validation``
        returned for that node;
      * ``(False, data_dico, None)`` otherwise.
    ``data_dico`` maps each processed node to its transformed data, or to
    None when that data could not be computed.
    """
    data_dico = {}  # transformed block available at the output of each node

    for node in self._nodes_order:
        # Nodes flagged as 'no concatenation' are not supported here.
        if self.no_concat_nodes is not None and node in self.no_concat_nodes:
            raise NotImplementedError(
                "Approx cross-validation does't work if no concatenation (node %s)" % str(node))

        if self.verbose:
            print("start processing node %s ..." % node)

        model = self._models[node]
        parents = list(self.complete_graph.predecessors(node))
        n_parents = len(parents)

        ### Build the input of the node ###
        if n_parents == 0:
            # No predecessor ==> apply on the original data
            lastX = X
        elif n_parents == 1:
            # One predecessor ==> apply on the data coming out of that node
            lastX = data_dico[parents[0]]
        else:
            # Several predecessors ==> concatenate their outputs in a fixed
            # order: edge number first, then node name.
            edges_number = self._get_edges_number(parents, node)
            parents = sorted(parents, key=lambda p: (edges_number.get(p, -1), p))
            self._all_concat_order[node] = parents

            blocks = [data_dico[parent] for parent in parents]

            if self.verbose:
                print("start aggregation...")

            output_type = guess_output_type(blocks)
            self._all_concat_type[node] = output_type

            if any(block is None for block in blocks):
                # At least one predecessor couldn't be computed
                lastX = None
            else:
                lastX = generic_hstack(blocks, output_type=output_type)

        ### Input not available: this node can't be computed either ###
        if lastX is None:
            if self.verbose:
                print("can't compute node %s because lastX is None" % node)
            data_dico[node] = None
            continue

        ### Last node of the graph: cross-validate no matter what ###
        if node == self._terminal_node:
            result = cross_validation(
                model,
                lastX,
                y,
                groups=groups,
                scoring=scoring,
                cv=cv,
                verbose=verbose,
                fit_params=fit_params_step[node],
                return_predict=return_predict,
                method=method,
                no_scoring=no_scoring,
                stopping_round=stopping_round,
                stopping_threshold=stopping_threshold,
                **kwargs_step[node])
            # Rmk: the columns about fit time only account for the time spent in this last node
            return True, data_dico, result

        ### Intermediate node ###
        to_crossvalidate = node not in nodes_not_to_crossvalidate

        if to_crossvalidate and node not in nodes_cant_cv_transform:
            # The node should be cross-validated and 'cv_transform' can be used
            if self.verbose:
                print("do crossvalidation on %s" % node)

            _, data_dico[node] = cross_validation(
                model,
                lastX,
                y,
                groups=groups,
                cv=cv,
                verbose=verbose,
                fit_params=fit_params_step[node],
                return_predict=True,
                method="transform",
                no_scoring=True,
                stopping_round=None,
                stopping_threshold=None,
                **kwargs_step[node])

        elif to_crossvalidate:
            # The node should be cross-validated but 'cv_transform' can't be used
            if self.verbose:
                print("can't do node %s" % node)
            data_dico[node] = None

        else:
            # Node that shouldn't be cross-validated: plain fit_transform on a clone
            if self.verbose:
                print("skip crossvalidation on %s" % node)

            cloned_model = clone(model)
            if groups is not None and function_has_named_argument(cloned_model.fit_transform, "groups"):
                data_dico[node] = cloned_model.fit_transform(lastX, y, groups, **fit_params_step[node])
            else:
                data_dico[node] = cloned_model.fit_transform(lastX, y, **fit_params_step[node])

    return False, data_dico, None  # None: no result yet
def test_cross_validation0(with_groups):
    """Basic checks of ``cross_validation`` on a regressor and on a classifier.

    Checks the columns of the result frame, the shape of the returned
    predictions, and that the estimator passed in is still unfitted after
    each call.
    """
    np.random.seed(123)
    X = np.random.randn(100, 10)
    y = np.random.randn(100)

    groups = np.array([0] * 25 + [1] * 25 + [2] * 25 + [3] * 25) if with_groups else None

    bookkeeping_columns = ["fit_time", "score_time", "n_test_samples", "fold_nb"]
    regression_columns = [
        "test_neg_mean_squared_error",
        "test_r2",
        "train_neg_mean_squared_error",
        "train_r2",
    ] + bookkeeping_columns
    classification_columns = [
        "test_accuracy",
        "test_neg_log_loss",
        "train_accuracy",
        "train_neg_log_loss",
    ] + bookkeeping_columns

    def check_unfitted(model):
        with pytest.raises(sklearn.exceptions.NotFittedError):
            model.predict(X)

    def check_frame(frame, expected_columns):
        assert isinstance(frame, pd.DataFrame)
        assert list(frame.columns) == expected_columns
        assert len(frame) == 10

    ### Regressor: scores only ###
    forest = RandomForestRegressor(n_estimators=10)
    result = cross_validation(forest, X, y, groups=groups, scoring=["neg_mean_squared_error", "r2"], cv=10)
    check_unfitted(forest)
    check_frame(result, regression_columns)

    ### Regressor: scores + predictions ###
    forest = RandomForestRegressor(n_estimators=10, random_state=123)
    result, yhat = cross_validation(
        forest, X, y, groups, scoring=["neg_mean_squared_error", "r2"], cv=10, return_predict=True
    )
    check_unfitted(forest)
    check_frame(result, regression_columns)
    assert yhat.shape == (100,)

    ### Classifier: scores only ###
    X = np.random.randn(100, 10)
    y = np.array(["A"] * 33 + ["B"] * 33 + ["C"] * 34)

    forest = RandomForestClassifier(n_estimators=10, random_state=123)
    result = cross_validation(forest, X, y, groups, scoring=["accuracy", "neg_log_loss"], cv=10)
    check_unfitted(forest)
    check_frame(result, classification_columns)

    ### Classifier: scores + predicted classes ###
    forest = RandomForestClassifier(random_state=123, n_estimators=10)
    result, yhat = cross_validation(
        forest, X, y, groups, scoring=["accuracy", "neg_log_loss"], cv=10, return_predict=True, method="predict"
    )
    check_unfitted(forest)
    assert yhat.shape == (100,)
    assert set(np.unique(yhat)) == set(("A", "B", "C"))

    ### Classifier: scores + predicted probabilities ###
    forest = RandomForestClassifier(random_state=123, n_estimators=10)
    result, yhat = cross_validation(
        forest, X, y, groups, scoring=["accuracy", "neg_log_loss"], cv=10, return_predict=True, method="predict_proba"
    )
    check_unfitted(forest)
    assert yhat.shape == (100, 3)
    assert isinstance(yhat, pd.DataFrame)
    assert list(yhat.columns) == ["A", "B", "C"]