def test_create_cv():
    """Check what ``create_cv`` returns for an integer ``cv`` and for pre-built splitters."""
    X = np.random.randn(20, 3)

    # Binary target flagged as a classification problem: stratified folds are expected.
    y = np.array([0] * 10 + [1] * 10)
    cv1 = create_cv(cv=10, y=y, classifier=True)
    assert type(cv1).__name__ == "StratifiedKFold"
    assert len(list(cv1.split(X, y))) == 10

    # An object that is already a splitter must be handed back as the very same object.
    cv1b = create_cv(cv1)
    assert cv1b is cv1

    # Continuous target without the ``classifier`` flag: plain KFold is expected.
    y2 = np.random.randn(20)
    cv2 = create_cv(cv=10, y=y2)
    assert type(cv2).__name__ == "KFold"
    # Split against y2, i.e. the target this splitter was created for.
    assert len(list(cv2.split(X, y2))) == 10

    class PersonalizedCV(object):
        """Minimal user-defined object that only exposes a ``split`` method."""

        def __init__(self):
            pass

        def split(self, X, y, groups=None):
            pass

    # A custom object with a ``split`` method must also come back unchanged.
    cv = PersonalizedCV()
    cv_res = create_cv(cv)
    assert cv is cv_res
def get_outsample(self, X, y, method, groups=None, cv=None):
    """Compute cross-validated ('outsample') predictions for every model in ``self.models``.

    Parameters
    ----------
    X, y
        Data and target, forwarded to ``cross_val_predict``.
    method : str
        Forwarded as the ``method`` argument of ``cross_val_predict``.
    groups : optional
        Forwarded to ``cross_val_predict``.
    cv : optional
        When None, ``self._cv`` is used as is. Otherwise the value goes through
        ``create_cv`` (with ``shuffle=True`` and the instance's ``random_state``).

    Returns
    -------
    The per-model predictions, each passed through ``maketwodimensions`` and
    concatenated along axis 1.
    """
    if cv is None:
        folds = self._cv
    else:
        folds = create_cv(
            cv,
            y,
            random_state=self.random_state,
            classifier=self._is_classifier,
            shuffle=True,
        )

    # One block of columns per model, all produced with the same folds.
    per_model_predictions = [
        maketwodimensions(
            cross_val_predict(model, X, y, groups=groups, cv=folds, method=method)
        )
        for model in self.models
    ]

    return np.concatenate(per_model_predictions, axis=1)
def approx_cross_validation(
    self,
    X,
    y,
    groups=None,
    scoring=None,
    cv=None,
    verbose=1,
    fit_params=None,
    return_predict=False,
    method=None,
    no_scoring=False,
    stopping_round=None,
    stopping_threshold=None,
    _save_outsample_predict=False,
    _use_saved_outsample_predict=False,
):
    """Cross-validate the blender on the outsample predictions of the base models.

    The folds used to cross-validate the blender are the same ones used to
    generate the outsample predictions: a single ``cv`` object is created here
    and handed to both ``get_outsample`` and ``cross_validation``.

    ``_use_saved_outsample_predict`` reuses ``self.all_yhat_pred`` instead of
    recomputing the predictions; ``_save_outsample_predict`` stores the
    predictions used by this call on ``self.all_yhat_pred``.

    Returns whatever ``cross_validation`` returns for the blender.
    """
    folds = create_cv(
        cv,
        y,
        classifier=self._is_classifier,
        shuffle=True,
        random_state=self.random_state,
    )

    if _use_saved_outsample_predict:
        outsample_predictions = self.all_yhat_pred
    else:
        outsample_predictions = self.get_outsample(
            X, y, method=self._method, groups=groups, cv=folds
        )

    if _save_outsample_predict:
        self.all_yhat_pred = outsample_predictions

    # Options forwarded untouched to the generic cross-validation routine.
    blender_cv_options = dict(
        scoring=scoring,
        cv=folds,
        verbose=verbose,
        fit_params=fit_params,
        return_predict=return_predict,
        method=method,
        no_scoring=no_scoring,
        stopping_round=stopping_round,
        stopping_threshold=stopping_threshold,
    )
    return cross_validation(self.blender, outsample_predictions, y, **blender_cv_options)
def fit_transform(self, X, y, groups=None):
    """Fit the wrapped model and return its predictions on ``X``.

    When ``self.cv`` is None, this is ``self.fit(X, y).transform(X)``.
    Otherwise the returned predictions come from ``cross_val_predict``
    ("predict_proba" for a classifier, "predict" for a regressor); the
    instance is then fitted on the full data and the cross-validated
    predictions are passed through ``self._format_predictions``.

    Raises
    ------
    ValueError
        If ``self.model`` is neither a classifier nor a regressor.
    """
    self._already_fitted = True

    model_is_classifier = bool(is_classifier(self.model))
    if not model_is_classifier and not is_regressor(self.model):
        raise ValueError("model should either be a Classifier or a Regressor")
    self._is_classifier = model_is_classifier

    # No CV in that case: plain fit followed by transform.
    if self.cv is None:
        self._cv = None
        return self.fit(X, y).transform(X)

    self._cv = create_cv(
        self.cv,
        y,
        random_state=self.random_state,
        classifier=self._is_classifier,
        shuffle=True,
    )

    predict_method = "predict_proba" if self._is_classifier else "predict"
    cv_predictions = cross_val_predict(
        self.model, X, y, groups=groups, cv=self._cv, method=predict_method
    )

    # Fit on the full data before formatting; self._target_info is read afterwards.
    self.fit(X, y)

    return self._format_predictions(
        cv_predictions,
        is_classifier=self._is_classifier,
        target_info=self._target_info,
    )
def fit_transform(self, X, y):
    """Fit the encoder on ``(X, y)`` and return the encoded version of ``X``.

    ``y`` is first normalised to a ``pd.Series`` and that Series is what every
    later step receives (``fit``, ``_fit_aggregat``, ``create_cv`` and the
    fold splitting), so both branches below see the same kind of target.

    * ``self.cv is None``: the aggregates are fitted on the full data (with
      ``self.noise_level``) and applied to the same data.
    * otherwise: for each fold, the aggregates are fitted on the train part and
      applied to the test part; the per-fold results are concatenated and
      re-ordered to follow ``X.index``.

    Raises
    ------
    ValueError
        If ``y`` is None.
    """
    if y is None:
        raise ValueError("I need a value for 'y'")

    sy = y if isinstance(y, pd.Series) else pd.Series(y)

    self.fit(X, sy)
    X = get_rid_of_categories(X)

    if self.cv is None:
        # No cross-validation: fit and apply the aggregates on the same rows.
        target_aggregat, target_aggregat_global = self._fit_aggregat(
            X, sy, noise_level=self.noise_level
        )
        all_results = self._transform_aggregat(X, target_aggregat, target_aggregat_global)
    else:
        cv = create_cv(self.cv, y=sy, classifier=not self.is_regression, random_state=123)

        fold_results = []
        for train, test in cv.split(X, sy):
            target_aggregat, target_aggregat_global = self._fit_aggregat(
                X.iloc[train, :], sy.iloc[train], noise_level=self.noise_level
            )
            fold_results.append(
                self._transform_aggregat(
                    X.iloc[test, :], target_aggregat, target_aggregat_global
                )
            )

        # Folds come back in split order: put the rows back in the order of X.
        all_results = pd.concat(fold_results, axis=0)
        all_results = all_results.loc[X.index, :]

    # Internal sanity checks on the shape and index of the output.
    assert len(all_results) == len(X)
    assert (all_results.index == X.index).all()
    assert all_results.shape[1] == len(self.get_feature_names())

    return all_results
def fit(self, X, y, groups=None):
    """Fit the blender on outsample predictions, then refit every base model.

    Steps:
      1. build ``self._cv`` from ``self.cv``;
      2. get the outsample predictions of the base models (``get_outsample``
         is called without ``cv``, so it uses ``self._cv``);
      3. fit ``self.blender`` on those predictions;
      4. fit each model of ``self.models`` on the full ``(X, y)``.

    Returns ``self``.
    """
    self._cv = create_cv(
        self.cv,
        y,
        classifier=self._is_classifier,
        random_state=self.random_state,
    )

    outsample_predictions = self.get_outsample(X, y, method=self._method, groups=groups)

    # One row of predictions is expected per observation of y.
    expected_rows = y.shape[0]
    assert outsample_predictions.shape[0] == expected_rows

    # Fit the blender on the outsample predictions.
    self.blender.fit(outsample_predictions, y)

    # Refit each base model on the full data.
    for base_model in self.models:
        base_model.fit(X, y)

    return self
def approx_cross_validation(
    self,
    X,
    y,
    groups=None,
    scoring=None,
    cv=None,
    verbose=1,
    fit_params=None,
    return_predict=False,
    method=None,
    no_scoring=False,
    stopping_round=None,
    stopping_threshold=None,
    nodes_not_to_crossvalidate=None,
    **kwargs
):
    """Approximate cross-validation of the pipeline.

    The work is done in three stages:

    1. ``_approx_cross_validation_pre_calculation`` is run on the whole graph.
       If it reports that everything is finished, its result is returned.
    2. Otherwise a new pipeline holding the remaining nodes is created with
       ``_approx_cross_validation_create_sub_graph_pipeline``.
    3. A regular ``cross_validation`` (``approximate_cv=False``) is run on
       that new pipeline with the same ``cv`` object.

    ``fit_params`` and ``kwargs`` keys are expected in the ``"<node>__<param>"``
    form and are split into one dictionary per node of ``self.complete_graph``.

    ``self.verbose`` is set to ``verbose`` for the duration of the call and is
    restored afterwards, including when an exception is raised.

    Returns
    -------
    The result of the pre-calculation when it is sufficient, otherwise the
    result of ``cross_validation`` on the remaining pipeline.
    """
    _orig_verbose = self.verbose
    self.verbose = verbose
    try:
        self._complete_init()

        if nodes_not_to_crossvalidate is None:
            nodes_not_to_crossvalidate = set()

        # Nodes whose model has no 'can_cv_transform' method, or whose method
        # returns a falsy value. Every entry is a key of self._models by construction.
        nodes_cant_cv_transform = {
            node
            for node, m in self._models.items()
            if not (hasattr(m, "can_cv_transform") and m.can_cv_transform())
        }

        cv = create_cv(
            cv,
            y,
            classifier=sklearn.model_selection._validation.is_classifier(self),
            shuffle=True,
            random_state=123,
        )

        # Split fit_params into a 'step-by-step' dictionary.
        fit_params_step = {name: {} for name in self.complete_graph.nodes}
        if fit_params is not None:
            for key, value in fit_params.items():
                step, param = key.split("__", 1)
                fit_params_step[step][param] = value

        # Same treatment for the extra keyword arguments.
        kwargs_step = {name: {} for name in self.complete_graph.nodes}
        if kwargs:
            for key, value in kwargs.items():
                step, param = key.split("__", 1)
                kwargs_step[step][param] = value

        # Pre-calculate everything that can be.
        is_finished, data_dico, result = self._approx_cross_validation_pre_calculation(
            X=X,
            y=y,
            groups=groups,
            scoring=scoring,
            cv=cv,
            verbose=verbose,
            fit_params_step=fit_params_step,
            return_predict=return_predict,
            method=method,
            no_scoring=no_scoring,
            stopping_round=stopping_round,
            stopping_threshold=stopping_threshold,
            nodes_not_to_crossvalidate=nodes_not_to_crossvalidate,
            nodes_cant_cv_transform=nodes_cant_cv_transform,
            kwargs_step=kwargs_step,
        )

        if is_finished:
            if verbose:
                print("CV is finished")
            return result

        # Create a new graph pipeline with the remaining nodes.
        new_graph_pipeline, new_data_dtm = self._approx_cross_validation_create_sub_graph_pipeline(
            data_dico, X
        )

        if verbose:
            print("here is a new GraphPipeline")
            print(new_graph_pipeline)
            print("")
            print("new_data_dtm")
            print(type(new_data_dtm))

        # Classical cross-validation on the remaining pipeline, with the same folds.
        return cross_validation(
            new_graph_pipeline,
            new_data_dtm,
            y,
            groups=groups,
            scoring=scoring,
            cv=cv,
            verbose=verbose,
            fit_params=fit_params,
            return_predict=return_predict,
            method=method,
            no_scoring=no_scoring,
            stopping_round=stopping_round,
            stopping_threshold=stopping_threshold,
            approximate_cv=False,
            **kwargs
        )
    finally:
        # Always put the caller's verbosity back, on success and on error.
        self.verbose = _orig_verbose
def approx_cross_validation(
    self,
    X,
    y,
    groups=None,
    scoring=None,
    cv=None,
    verbose=1,
    fit_params=None,
    return_predict=False,
    method="transform",
    no_scoring=True,
    stopping_round=None,
    stopping_threshold=None,
):
    """Return cross-validated predictions of the wrapped model, formatted as a transformer output.

    The predictions come from ``cross_val_predict`` ("predict_proba" for a
    classifier, "predict" for a regressor) and are passed through
    ``self._format_predictions``.

    Returns
    -------
    tuple
        ``(None, result)``; the first item is None because a transformer is
        not scored.

    Raises
    ------
    ValueError
        If the model is neither a classifier nor a regressor, if ``cv`` is
        None, if ``no_scoring`` is falsy, or if ``method`` is not "transform".
    """
    if is_classifier(self.model):
        _is_classifier = True
    elif is_regressor(self.model):
        _is_classifier = False
    else:
        raise ValueError("model should either be a Classifier or a Regressor")

    # An explicit cv is required here; there is no fallback on the instance's own cv.
    if cv is None:
        raise ValueError("I need a cv do cross-validate")

    # Pass the boolean computed above, not the `is_classifier` function itself:
    # a function object is always truthy and would flag every model as a classifier.
    cv = create_cv(cv, y, random_state=123, classifier=_is_classifier, shuffle=True)
    target_info = self._get_target_info(y, _is_classifier)

    if not no_scoring:
        raise ValueError("no scoring should be True for a transformer")

    if method != "transform":
        raise ValueError("method should be 'transform' for a transformer")

    predict_method = "predict_proba" if _is_classifier else "predict"
    predictions = cross_val_predict(
        self.model, X, y, groups=groups, cv=cv, method=predict_method
    )

    result = self._format_predictions(
        predictions, is_classifier=_is_classifier, target_info=target_info
    )

    # None: no scoring, this is a transformer.
    return None, result
def approx_cross_validation(
    self,
    X,
    y,
    groups=None,
    scoring=None,
    cv=None,
    verbose=1,
    fit_params=None,
    return_predict=False,
    method="transform",
    no_scoring=True,
    stopping_round=None,
    stopping_threshold=None,
):
    """Return cross-validated predictions of the wrapped model as a 2-D output.

    For a classifier, "predict_proba" is used; when ``y`` holds exactly two
    distinct values only column 1 of the probabilities is kept. For a
    regressor, "predict" is used. The output always goes through
    ``maketwodimensions``.

    Returns
    -------
    tuple
        ``(None, predictions)``; the first item is None because a transformer
        is not scored.

    Raises
    ------
    ValueError
        If the model is neither a classifier nor a regressor, if ``cv`` is
        None, if ``no_scoring`` is falsy, or if ``method`` is not "transform".
    """
    model_is_classifier = bool(is_classifier(self.model))
    if not model_is_classifier and not is_regressor(self.model):
        raise ValueError("model should either be a Classifier or a Regressor")

    # An explicit cv is required here; there is no fallback on the instance's own cv.
    if cv is None:
        raise ValueError("I need a cv do cross-validate")

    folds = create_cv(cv, y, random_state=123, classifier=model_is_classifier, shuffle=True)

    if not no_scoring:
        raise ValueError("no scoring should be True for a transformer")

    if method != "transform":
        raise ValueError("method should be 'transform' for a transformer")

    keep_second_column_only = False
    if model_is_classifier:
        predict_method = "predict_proba"
        # With exactly two distinct target values, only column 1 is kept.
        keep_second_column_only = len(np.unique(y)) == 2
    else:
        predict_method = "predict"

    raw_predictions = cross_val_predict(
        self.model, X, y, groups=groups, cv=folds, method=predict_method
    )
    if keep_second_column_only:
        raw_predictions = raw_predictions[:, 1]

    # None: no scoring, this is a transformer.
    return None, maketwodimensions(raw_predictions)
def fit_transform(self, X, y, groups=None):
    """Fit the wrapped model and return its predictions on ``X`` as a 2-D output.

    When ``self.cv`` is None, this is ``self.fit(X, y).transform(X)``.
    Otherwise the returned predictions come from ``cross_val_predict``, the
    model is then fitted on the full data, and ``self._feature_names`` is
    built from the model class name:

    * classifier with two distinct target values: one name, based on ``classes_[1]``;
    * other classifiers: one name per entry of ``classes_``;
    * regressor: a single ``"<ModelClass>__target"`` name.

    ``self.columns_prefix``, when not None, is prepended to every name.

    Raises
    ------
    ValueError
        If ``self.model`` is neither a classifier nor a regressor.
    """
    self._already_fitted = True

    model_is_classifier = bool(is_classifier(self.model))
    if not model_is_classifier and not is_regressor(self.model):
        raise ValueError("model should either be a Classifier or a Regressor")
    self._is_classifier = model_is_classifier

    # No CV in that case: plain fit followed by transform.
    if self.cv is None:
        self._cv = None
        return self.fit(X, y).transform(X)

    self._cv = create_cv(
        self.cv,
        y,
        random_state=self.random_state,
        classifier=self._is_classifier,
        shuffle=True,
    )

    keep_second_column_only = False
    if self._is_classifier:
        self._nby = len(np.unique(y))
        keep_second_column_only = self._nby == 2
        predict_method = "predict_proba"
    else:
        predict_method = "predict"

    raw_predictions = cross_val_predict(
        self.model, X, y, groups=groups, cv=self._cv, method=predict_method
    )
    if keep_second_column_only:
        # With exactly two distinct target values, only column 1 is kept.
        raw_predictions = raw_predictions[:, 1]
    all_yhat_pred = maketwodimensions(raw_predictions)

    # Refit on the full data; classes_ is read from the fitted model below.
    self.model.fit(X, y)

    model_name = self.model.__class__.__name__
    if not self._is_classifier:
        # Regression model
        names = ["%s__target" % model_name]
    elif self._nby == 2:
        names = ["%s__%s" % (model_name, self.model.classes_[1])]
    else:
        names = ["%s__%s" % (model_name, c) for c in self.model.classes_]

    if self.columns_prefix is not None:
        names = ["%s__%s" % (self.columns_prefix, c) for c in names]

    self._feature_names = names

    # Only outputs that expose a 'columns' attribute get labelled.
    if hasattr(all_yhat_pred, "columns"):
        all_yhat_pred.columns = self.get_feature_names()

    return all_yhat_pred