def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params=None): if verbose > 1: if parameters is None: msg = "no parameters to be set" else: msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in parameters.items())) print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) if num_samples(train) == 0 or num_samples(test) == 0: raise RuntimeError( 'Cross validation error in fit_estimator. The total data set ' 'contains %d elements, which were split into a training set ' 'of %d elements and a test set of %d elements. Unfortunately, ' 'you can\'t have a %s set with 0 elements.' % ( num_samples(X), num_samples(train), num_samples(test), 'training' if num_samples(train) == 0 else 'test')) # adjust length of sample weights n_samples = num_samples(X) fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, np.asarray(v)[train] if hasattr(v, '__len__') and len(v) == n_samples else v) for k, v in fit_params.items()]) if parameters is not None: estimator.set_params(**parameters) # fit and score start_time = time.time() X_train, y_train = _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) if y_train is None: estimator.fit(X_train, **fit_params) else: estimator.fit(X_train, y_train, **fit_params) test_score = _score(estimator, X_test, y_test, scorer) train_score = _score(estimator, X_train, y_train, scorer) scoring_time = time.time() - start_time msmbuilder_api = is_msmbuilder_estimator(estimator) n_samples_test = num_samples(X_test, is_nested=msmbuilder_api) n_samples_train = num_samples(X_train, is_nested=msmbuilder_api) if verbose > 2: msg += ", score=%f" % test_score if verbose > 1: end_msg = "%s -%s" % (msg, short_format_time(scoring_time)) print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) return (test_score, n_samples_test, train_score, n_samples_train, scoring_time)
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params=None): if verbose > 1: if parameters is None: msg = "no parameters to be set" else: msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in parameters.items())) print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) # adjust length of sample weights n_samples = _num_samples(X) fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, np.asarray(v)[train] if hasattr(v, '__len__') and len(v) == n_samples else v) for k, v in fit_params.items()]) if parameters is not None: estimator.set_params(**parameters) # fit and score start_time = time.time() X_train, y_train = _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) if y_train is None: estimator.fit(X_train, **fit_params) else: estimator.fit(X_train, y_train, **fit_params) test_score = _score(estimator, X_test, y_test, scorer) train_score = _score(estimator, X_train, y_train, scorer) scoring_time = time.time() - start_time mixtape_api = _is_mixtape_estimator(estimator) n_samples_test = _num_samples(X_test, mixtape_api=mixtape_api) n_samples_train = _num_samples(X_train, mixtape_api=mixtape_api) if verbose > 2: msg += ", score=%f" % test_score if verbose > 1: end_msg = "%s -%s" % (msg, short_format_time(scoring_time)) print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) return (test_score, n_samples_test, train_score, n_samples_train, scoring_time)
def _extended_fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, error_score='raise', extraOut="auto"): if verbose > 1: if parameters is None: msg = "no parameters to be set" else: msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in parameters.items())) print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, _index_param_value(X, v, train)) for k, v in fit_params.items()]) if parameters is not None: estimator.set_params(**parameters) start_time = time.time() X_train, y_train = _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) try: if y_train is None: estimator.fit(X_train, **fit_params) else: estimator.fit(X_train, y_train, **fit_params) except Exception as e: if error_score == 'raise': raise elif isinstance(error_score, numbers.Number): test_score = error_score if return_train_score: train_score = error_score warnings.warn("Classifier fit failed. The score on this train-test" " partition for these parameters will be set to %f. " "Details: \n%r" % (error_score, e), FitFailedWarning) else: raise ValueError("error_score must be the string 'raise' or a" " numeric value. (Hint: if using 'raise', please" " make sure that it has been spelled correctly.)" ) else: test_score = _score(estimator, X_test, y_test, scorer) if return_train_score: train_score = _score(estimator, X_train, y_train, scorer) scoring_time = time.time() - start_time if verbose > 2: msg += ", score=%f" % test_score if verbose > 1: end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) ret = [train_score] if return_train_score else [] ret.extend([test_score, _num_samples(X_test), scoring_time]) if return_parameters: ret.append(parameters) # Add additional return values extraRVs = {} if extraOut != None: extraRVs["counts"] = {"train":train.shape[0], "test":test.shape[0]} if "estimator" in extraOut: extraRVs["estimator"] = estimator if extraOut == "auto" or "predictions" in extraOut: assert test.shape[0] == X_test.shape[0] probabilities = estimator.predict_proba(X_test) probabilityByIndex = {} for exampleIndex, prediction in zip(test, probabilities): probabilityByIndex[exampleIndex] = prediction extraRVs["probabilities"] = probabilityByIndex if (extraOut == "auto" or "importances" in extraOut) and hasattr(estimator, "feature_importances_"): extraRVs["importances"] = estimator.feature_importances_ ret.append(extraRVs) return ret
def fit(self, X, y): """Fit the RFA model and automatically tune the number of selected features. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] Training vector, where `n_samples` is the number of samples and `n_features` is the total number of features. y : array-like, shape = [n_samples] Target values (integers for classification, real numbers for regression). """ X, y = check_X_y(X, y, "csr") if self.estimator_params is not None: warnings.warn("The parameter 'estimator_params' is deprecated as of version 0.16 " "and will be removed in 0.18. The parameter is no longer " "necessary because the value is set via the estimator initialisation " "or set_params function." , DeprecationWarning) # Initialization rfa = RFA(estimator=self.estimator, n_features_to_select=1, step=self.step, estimator_params=self.estimator_params, verbose=self.verbose - 1) cv = check_cv(self.cv, X, y, is_classifier(self.estimator)) scorer = check_scoring(self.estimator, scoring=self.scoring) scores = np.zeros(X.shape[1]) n_features_to_select_by_rank = np.zeros(X.shape[1]) # Cross-validation for n, (train, test) in enumerate(cv): X_train, y_train = _safe_split(self.estimator, X, y, train) X_test, y_test = _safe_split(self.estimator, X, y, test, train) # Compute a full ranking of the features # ranking_ contains the same set of values for all CV folds, # but perhaps reordered ranking_ = rfa.fit(X_train, y_train).ranking_ # Score each subset of features for k in range(0, np.max(ranking_)): indices = np.where(ranking_ <= k + 1)[0] estimator = clone(self.estimator) estimator.fit(X_train[:, indices], y_train) score = _score(estimator, X_test[:, indices], y_test, scorer) if self.verbose > 0: print("Finished fold with %d / %d feature ranks, score=%f" % (k + 1, np.max(ranking_), score)) scores[k] += score # n_features_to_select_by_rank[k] is being overwritten # multiple times, but by the same value n_features_to_select_by_rank[k] = indices.size # Select the best upper bound for feature rank. It's OK to use the # last ranking_, as np.max(ranking_) is the same over all CV folds. scores = scores[:np.max(ranking_)] k = np.argmax(scores) # Re-execute an elimination with best_k over the whole set rfa = RFA(estimator=self.estimator, n_features_to_select=n_features_to_select_by_rank[k], step=self.step, estimator_params=self.estimator_params) rfa.fit(X, y) # Set final attributes self.support_ = rfa.support_ self.n_features_ = rfa.n_features_ self.ranking_ = rfa.ranking_ self.estimator_ = clone(self.estimator) if self.estimator_params: self.estimator_.set_params(**self.estimator_params) self.estimator_.fit(self.transform(X), y) # Fixing a normalization error, n is equal to len(cv) - 1 # here, the scores are normalized by len(cv) self.grid_scores_ = scores / len(cv) return self
def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, scoring, return_train_score=False, return_parameters=False, error_score='raise'): """ Fit estimator and compute scores for a given dataset split. This overrides the behavior of _fit_and_score method in cross_validation.py. Note that a new argument, scoring, has been added to the function. Parameters ---------- estimator : estimator object implementing 'fit' The object to use to fit the data. X : array-like of shape at least 2D The data to fit. y : array-like, optional, default: None The target variable to try to predict in the case of supervised learning. scorer : callable A scorer callable object / function with signature ``scorer(estimator, X, y)``. train : array-like, shape (n_train_samples,) Indices of training samples. test : array-like, shape (n_test_samples,) Indices of test samples. verbose : integer The verbosity level. error_score : 'raise' (default) or numeric Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. parameters : dict or None Parameters to be set on the estimator. fit_params : dict or None Parameters that will be passed to ``estimator.fit``. scoring: string The name of the scoring function used in cross_val_score. Default is accuracy. return_train_score : boolean, optional, default: False Compute and return score on training set. return_parameters : boolean, optional, default: False Return parameters that has been used for the estimator. Returns ------- train_score : float, optional Score on training set, returned only if `return_train_score` is `True`. test_score : float Score on test set. n_test_samples : int Number of test samples. scoring_time : float Time spent for fitting and scoring in seconds. parameters : dict or None, optional The parameters that have been evaluated. """ if verbose > 1: if parameters is None: msg = "no parameters to be set" else: msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in parameters.items())) print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, _index_param_value(X, v, train)) for k, v in fit_params.items()]) if parameters is not None: estimator.set_params(**parameters) start_time = time.time() x_train, y_train = _safe_split(estimator, X, y, train) x_test, y_test = _safe_split(estimator, X, y, test, train) try: if y_train is None: b = estimator.fit(x_train, **fit_params) else: b = estimator.fit(x_train, y_train, **fit_params) except Exception as e: if error_score == 'raise': raise elif isinstance(error_score, numbers.Number): test_score = error_score if return_train_score: train_score = error_score warnings.warn( "Classifier fit failed. The score on this train-test" " partition for these parameters will be set to %f. " "Details: \n%r" % (error_score, e), FitFailedWarning) else: raise ValueError("error_score must be the string 'raise' or a" " numeric value. (Hint: if using 'raise', please" " make sure that it has been spelled correctly.)") else: test_score = _score(estimator, x_test, y_test, scorer) if return_train_score: train_score = _score(estimator, x_train, y_train, scorer) # Addition to original scikit code: # Create FitEvents for each estimator fit. fit_event = FitEvent(b, estimator, x_train) ModelDbSyncer.Syncer.instance.add_to_buffer(fit_event) scoring_time = time.time() - start_time if verbose > 2: msg += ", score=%f" % test_score if verbose > 1: end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) ret = [train_score] if return_train_score else [] ret.extend([test_score, _num_samples(x_test), scoring_time]) # Addition to original scikit code: # Create MetricEvents for each estimator. metric_event = MetricEvent(x_test, estimator, "", "", scoring, test_score) ModelDbSyncer.Syncer.instance.add_to_buffer(metric_event) if return_parameters: ret.append(parameters) return ret
def _extended_fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, error_score='raise', extraOut="auto"): if verbose > 1: if parameters is None: msg = "no parameters to be set" else: msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in parameters.items())) print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, _index_param_value(X, v, train)) for k, v in fit_params.items()]) if parameters is not None: estimator.set_params(**parameters) start_time = time.time() X_train, y_train = _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) try: if y_train is None: estimator.fit(X_train, **fit_params) else: estimator.fit(X_train, y_train, **fit_params) except Exception as e: if error_score == 'raise': raise elif isinstance(error_score, numbers.Number): test_score = error_score if return_train_score: train_score = error_score warnings.warn( "Classifier fit failed. The score on this train-test" " partition for these parameters will be set to %f. " "Details: \n%r" % (error_score, e), FitFailedWarning) else: raise ValueError("error_score must be the string 'raise' or a" " numeric value. (Hint: if using 'raise', please" " make sure that it has been spelled correctly.)") else: test_score = _score(estimator, X_test, y_test, scorer) if return_train_score: train_score = _score(estimator, X_train, y_train, scorer) scoring_time = time.time() - start_time if verbose > 2: msg += " , n=" + str(X_test.shape[0]) + ", score=%f" % test_score if verbose > 1: end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) ret = [train_score] if return_train_score else [] ret.extend([test_score, _num_samples(X_test), scoring_time]) if return_parameters: ret.append(parameters) # Add additional return values extraRVs = {} if extraOut != None: extraRVs["counts"] = {"train": train.shape[0], "test": test.shape[0]} if "estimator" in extraOut: extraRVs["estimator"] = estimator if extraOut == "auto" or "predictions" in extraOut: assert test.shape[0] == X_test.shape[0] probabilities = estimator.predict_proba(X_test) probabilityByIndex = {} for exampleIndex, prediction in zip(test, probabilities): probabilityByIndex[exampleIndex] = prediction extraRVs["probabilities"] = probabilityByIndex if (extraOut == "auto" or "importances" in extraOut) and hasattr( estimator, "feature_importances_"): extraRVs["importances"] = estimator.feature_importances_ ret.append(extraRVs) return ret
def _fit_and_score(estimator, X, y, scorer, train, test, cv, verbose, parameters, fit_params, return_train_score=False, return_parameters=False): """Fit estimator and compute scores for a given dataset split. Parameters ---------- estimator : estimator object implementing 'fit' The object to use to fit the data. X : array-like of shape at least 2D The data to fit. y : array-like, optional, default: None The target variable to try to predict in the case of supervised learning. scoring : callable A scorer callable object / function with signature ``scorer(estimator, X, y)``. train : array-like, shape = (n_train_samples,) Indices of training samples. test : array-like, shape = (n_test_samples,) Indices of test samples. verbose : integer The verbosity level. parameters : dict or None Parameters to be set on the estimator. fit_params : dict or None Parameters that will be passed to ``estimator.fit``. return_train_score : boolean, optional, default: False Compute and return score on training set. return_parameters : boolean, optional, default: False Return parameters that has been used for the estimator. Returns ------- train_score : float, optional Score on training set, returned only if `return_train_score` is `True`. test_score : float Score on test set. n_test_samples : int Number of test samples. scoring_time : float Time spent for fitting and scoring in seconds. parameters : dict or None, optional The parameters that have been evaluated. """ if verbose > 1: if parameters is None: msg = "no parameters to be set" else: msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in parameters.items())) print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) # Adjust lenght of sample weights n_samples = _num_samples(X) fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, np.asarray(v)[train] if hasattr(v, '__len__') and len(v) == n_samples else v) for k, v in fit_params.items()]) if cv is not None: fit_params["cv"] = cv if parameters is not None: estimator.set_params(**parameters) start_time = time.time() X_train, y_train = _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) if y_train is None: estimator.fit(X_train, **fit_params) else: estimator.fit(X_train, y_train, **fit_params) test_score = _score(estimator, X_test, y_test, scorer) if return_train_score: train_score = _score(estimator, X_train, y_train, scorer) scoring_time = time.time() - start_time if verbose > 2: msg += ", score=%f" % test_score if verbose > 1: end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) ret = [train_score] if return_train_score else [] ret.extend([test_score, _num_samples(X_test), scoring_time]) if return_parameters: ret.append(parameters) return ret
def fit_and_score_n_support(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, return_n_support=True, error_score='raise'): """Fit estimator and compute scores for a given dataset split. Parameters ---------- estimator : estimator object implementing 'fit' The object to use to fit the data. X : array-like of shape at least 2D The data to fit. y : array-like, optional, default: None The target variable to try to predict in the case of supervised learning. scorer : callable A scorer callable object / function with signature ``scorer(estimator, X, y)``. train : array-like, shape (n_train_samples,) Indices of training samples. test : array-like, shape (n_test_samples,) Indices of test samples. verbose : integer The verbosity level. error_score : 'raise' (default) or numeric Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. parameters : dict or None Parameters to be set on the estimator. fit_params : dict or None Parameters that will be passed to ``estimator.fit``. return_train_score : boolean, optional, default: False Compute and return score on training set. return_parameters : boolean, optional, default: False Return parameters that has been used for the estimator. Returns ------- train_score : float, optional Score on training set, returned only if `return_train_score` is `True`. test_score : float Score on test set. n_test_samples : int Number of test samples. scoring_time : float Time spent for fitting and scoring in seconds. parameters : dict or None, optional The parameters that have been evaluated. """ if verbose > 1: if parameters is None: msg = "no parameters to be set" else: msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in parameters.items())) print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, _index_param_value(X, v, train)) for k, v in fit_params.items()]) if parameters is not None: estimator.set_params(**parameters) start_time = time.time() X_train, y_train = _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) try: if y_train is None: estimator.fit(X_train, **fit_params) else: estimator.fit(X_train, y_train, **fit_params) except Exception as e: if error_score == 'raise': raise elif isinstance(error_score, numbers.Number): test_score = error_score if return_train_score: train_score = error_score warnings.warn("Classifier fit failed. The score on this train-test" " partition for these parameters will be set to %f. " "Details: \n%r" % (error_score, e), FitFailedWarning) else: raise ValueError("error_score must be the string 'raise' or a" " numeric value. (Hint: if using 'raise', please" " make sure that it has been spelled correctly.)" ) else: test_score = _score(estimator, X_test, y_test, scorer) if return_train_score: train_score = _score(estimator, X_train, y_train, scorer) scoring_time = time.time() - start_time if verbose > 2: msg += ", score=%f" % test_score if verbose > 1: end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) ret = [train_score] if return_train_score else [] ret.extend([test_score, _num_samples(X_test), scoring_time]) if return_parameters: ret.append(parameters) if return_n_support: ret.append(estimator.n_support_) return ret