def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, get_scorer('roc_auc'), clf, X_test, y_test)
def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('neg_log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    with pytest.raises(ValueError, match="multiclass format is not supported"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # test error is raised with a single class present in model
    # (predict_proba shape is not suitable for binary auc)
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, np.zeros_like(y_train))
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # for proba scorers
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('neg_log_loss')(clf, X_test, y_test)
def __init__(self, estimator, k_features=1,
             forward=True, floating=False,
             verbose=0, scoring=None,
             cv=5, skip_if_stuck=True,
             n_jobs=1,
             pre_dispatch='2*n_jobs',
             clone_estimator=True):
    self.estimator = estimator
    self.k_features = k_features
    self.forward = forward
    self.floating = floating
    self.pre_dispatch = pre_dispatch
    self.scoring = scoring
    if isinstance(scoring, str):
        self.scorer = get_scorer(scoring)
    else:
        self.scorer = scoring
    self.skip_if_stuck = skip_if_stuck
    self.cv = cv
    self.n_jobs = n_jobs
    self.verbose = verbose
    self.named_est = {key: value for key, value in
                      _name_estimators([self.estimator])}
    self.clone_estimator = clone_estimator
    if self.clone_estimator:
        self.est_ = clone(self.estimator)
    else:
        self.est_ = self.estimator
    self.fitted = False
    self.subsets_ = {}
    self.interrupted_ = False

    # don't mess with this unless testing
    self._TESTING_INTERRUPT_MODE = False
def __init__(self, estimator, min_features=1, max_features=1,
             print_progress=True, scoring='accuracy',
             cv=5, n_jobs=1,
             pre_dispatch='2*n_jobs',
             clone_estimator=True):
    self.estimator = estimator
    self.min_features = min_features
    self.max_features = max_features
    self.pre_dispatch = pre_dispatch
    self.scoring = scoring
    self.scorer = get_scorer(scoring)
    self.cv = cv
    self.print_progress = print_progress
    self.n_jobs = n_jobs
    self.named_est = {key: value for key, value in
                      _name_estimators([self.estimator])}
    self.clone_estimator = clone_estimator
    if self.clone_estimator:
        self.est_ = clone(self.estimator)
    else:
        self.est_ = self.estimator
    self.fitted = False
    self.interrupted_ = False

    # don't mess with this unless testing
    self._TESTING_INTERRUPT_MODE = False
def clf_bias_var(clf, X, y, n_replicas):
    roc_auc_scorer = get_scorer("roc_auc")
    # roc_auc_scorer(clf, X_test, y_test)
    auc_scores = []
    error_scores = []
    counts = np.zeros(X.shape[0], dtype=np.float64)
    sum_preds = np.zeros(X.shape[0], dtype=np.float64)
    for it in range(n_replicas):  # range instead of Python-2-only xrange
        # generate train sets and test sets (bootstrap resampling)
        train_indices = np.random.randint(X.shape[0], size=X.shape[0])
        # get test sets: the out-of-bag samples
        in_train = np.unique(train_indices)
        mask = np.ones(X.shape[0], dtype=bool)  # plain bool; np.bool is removed in recent numpy
        mask[in_train] = False
        test_indices = np.arange(X.shape[0])[mask]

        clf.fit(X[train_indices], y[train_indices])

        auc_scores.append(roc_auc_scorer(clf, X[test_indices], y[test_indices]))
        error_scores.append(zero_one_loss(y[test_indices],
                                          clf.predict(X[test_indices])))

        preds = clf.predict(X)
        for index in test_indices:
            counts[index] += 1
            sum_preds[index] += preds[index]

    test_mask = (counts > 0)  # indices of samples that have been tested

    # print('counts mean: {}'.format(np.mean(counts)))
    # print('counts standard derivation: {}'.format(np.std(counts)))

    bias, var = bias_var(y[test_mask], sum_preds[test_mask],
                         counts[test_mask], n_replicas)

    return auc_scores, error_scores, bias, var
def _make_scorer(scoring):
    """Make scorer.

    Parameters
    ----------
    scoring : str | callable
        If str, must be compatible with sklearn's get_scorer.
        If callable, function with signature ``score_func(y, y_pred,
        **kwargs)``.

    Returns
    -------
    scorer : callable | None
        The scorer.
    """
    from sklearn.metrics import make_scorer, get_scorer

    # If scoring is None (default), the predictions are internally
    # generated by estimator.score(). Else, we must first get the
    # predictions based on the scorer.
    if scoring is None:
        return None
    elif isinstance(scoring, str):
        return get_scorer(scoring)
    else:
        return make_scorer(scoring)
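
# Illustrative usage of _make_scorer above (not from the source); the metric
# choices are assumptions, picked only to exercise the three branches.
from sklearn.metrics import accuracy_score

scorer_none = _make_scorer(None)          # None -> caller falls back to estimator.score()
scorer_str = _make_scorer('accuracy')     # string -> sklearn's registered scorer
scorer_fn = _make_scorer(accuracy_score)  # callable -> wrapped via make_scorer
# Both non-None results are called as scorer(estimator, X, y).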
def __init__(self, estimator, k_features,
             forward=True, floating=False,
             print_progress=True, scoring='accuracy',
             cv=5, skip_if_stuck=True,
             n_jobs=1,
             pre_dispatch='2*n_jobs',
             clone_estimator=True):
    self.estimator = estimator
    self.k_features = k_features
    self.forward = forward
    self.floating = floating
    self.pre_dispatch = pre_dispatch
    self.scoring = scoring
    self.scorer = get_scorer(scoring)
    self.skip_if_stuck = skip_if_stuck
    self.cv = cv
    self.print_progress = print_progress
    self.n_jobs = n_jobs
    self.named_est = {key: value for key, value in
                      _name_estimators([self.estimator])}
    self.clone_estimator = clone_estimator
    if self.clone_estimator:
        self.est_ = clone(self.estimator)
    else:
        self.est_ = self.estimator
    self.fitted = False
def __init__(self, n_jobs=-1, offset_scale=1.0, n_buckets=2,
             initial_params=None, minimizer='BFGS', basinhopping=False,
             scoring='accuracy'):
    from numpy import array
    self.n_jobs = int(n_jobs)
    self.offset_scale = float(offset_scale)
    self.n_buckets = int(n_buckets)
    if initial_params is None:
        # self.initial_offsets_ = [-0.5] * self.n_buckets
        pass
    else:
        self.params = array(initial_params)
        # assert(len(self.initial_offsets_) == self.n_buckets)
        pass
    self.minimizer = minimizer
    self.basinhopping = basinhopping
    from sklearn.metrics import get_scorer
    self.scoring = get_scorer(scoring)
    pass
def test_rfecv():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # All the noisy variable were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer("accuracy")
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))

    # Same as the first two tests, but with step=2
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    rfecv.fit(X, y)
    assert_equal(len(rfecv.grid_scores_), 6)
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)
def test_classification_scores():
    # Test classification scorers.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LinearSVC(random_state=0)
    clf.fit(X_train, y_train)

    for prefix, metric in [('f1', f1_score), ('precision', precision_score),
                           ('recall', recall_score),
                           ('jaccard', jaccard_score)]:

        score1 = get_scorer('%s_weighted' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=None,
                        average='weighted')
        assert_almost_equal(score1, score2)

        score1 = get_scorer('%s_macro' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=None,
                        average='macro')
        assert_almost_equal(score1, score2)

        score1 = get_scorer('%s_micro' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=None,
                        average='micro')
        assert_almost_equal(score1, score2)

        score1 = get_scorer('%s' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=1)
        assert_almost_equal(score1, score2)

    # test fbeta score that takes an argument
    scorer = make_scorer(fbeta_score, beta=2)
    score1 = scorer(clf, X_test, y_test)
    score2 = fbeta_score(y_test, clf.predict(X_test), beta=2)
    assert_almost_equal(score1, score2)

    # test that custom scorer can be pickled
    unpickled_scorer = pickle.loads(pickle.dumps(scorer))
    score3 = unpickled_scorer(clf, X_test, y_test)
    assert_almost_equal(score1, score3)

    # smoke test the repr:
    repr(fbeta_score)
def test_supervised_cluster_scorers():
    # Test clustering scorers against gold standard labeling.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    km = KMeans(n_clusters=3)
    km.fit(X_train)
    for name in CLUSTER_SCORERS:
        score1 = get_scorer(name)(km, X_test, y_test)
        score2 = getattr(cluster_module, name)(y_test, km.predict(X_test))
        assert_almost_equal(score1, score2)
def test_regression_scorers():
    # Test regression scorers.
    diabetes = load_diabetes()
    X, y = diabetes.data, diabetes.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = Ridge()
    clf.fit(X_train, y_train)
    score1 = get_scorer('r2')(clf, X_test, y_test)
    score2 = r2_score(y_test, clf.predict(X_test))
    assert_almost_equal(score1, score2)
def test_unsupervised_scorers():
    # Test clustering scorers against gold standard labeling.
    # We don't have any real unsupervised Scorers yet.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    km = KMeans(n_clusters=3)
    km.fit(X_train)
    score1 = get_scorer('adjusted_rand_score')(km, X_test, y_test)
    score2 = adjusted_rand_score(y_test, km.predict(X_test))
    assert_almost_equal(score1, score2)
def test_thresholded_scorers_multilabel_indicator_data():
    """Test that the scorers work with multilabel-indicator format
    for multilabel and multi-output multi-class classifiers.
    """
    X, y = make_multilabel_classification(return_indicator=True,
                                          allow_unlabeled=False,
                                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Multi-output multi-class predict_proba
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    y_proba = clf.predict_proba(X_test)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, np.vstack(p[:, -1] for p in y_proba).T)
    assert_almost_equal(score1, score2)

    # Multi-output multi-class decision_function
    # TODO Is there any yet?
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    clf._predict_proba = clf.predict_proba
    clf.predict_proba = None
    clf.decision_function = lambda X: [p[:, 1] for p in clf._predict_proba(X)]
    y_proba = clf.decision_function(X_test)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, np.vstack(p for p in y_proba).T)
    assert_almost_equal(score1, score2)

    # Multilabel predict_proba
    clf = OneVsRestClassifier(DecisionTreeClassifier())
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test))
    assert_almost_equal(score1, score2)

    # Multilabel decision function
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    assert_almost_equal(score1, score2)
def test_classification_scores():
    """Test classification scorers."""
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LinearSVC(random_state=0)
    clf.fit(X_train, y_train)

    for prefix, metric in [('f1', f1_score), ('precision', precision_score),
                           ('recall', recall_score)]:

        score1 = get_scorer('%s_weighted' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=None,
                        average='weighted')
        assert_almost_equal(score1, score2)

        score1 = get_scorer('%s_macro' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=None,
                        average='macro')
        assert_almost_equal(score1, score2)

        score1 = get_scorer('%s_micro' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=None,
                        average='micro')
        assert_almost_equal(score1, score2)

        score1 = get_scorer('%s' % prefix)(clf, X_test, y_test)
        score2 = metric(y_test, clf.predict(X_test), pos_label=1)
        assert_almost_equal(score1, score2)

    # test fbeta score that takes an argument
    scorer = make_scorer(fbeta_score, beta=2)
    score1 = scorer(clf, X_test, y_test)
    score2 = fbeta_score(y_test, clf.predict(X_test), beta=2,
                         average='weighted')
    assert_almost_equal(score1, score2)

    # test that custom scorer can be pickled
    unpickled_scorer = pickle.loads(pickle.dumps(scorer))
    score3 = unpickled_scorer(clf, X_test, y_test)
    assert_almost_equal(score1, score3)

    # smoke test the repr:
    repr(fbeta_score)
def evaluate(self, dataset, pipelines):
    if not self.is_valid(dataset):
        raise AssertionError("Dataset is not appropriate for evaluation")
    for subject in dataset.subject_list:
        # check if we already have result for this subject/pipeline
        # we might need a better granularity, if we query the DB
        run_pipes = self.results.not_yet_computed(pipelines, dataset, subject)
        if len(run_pipes) == 0:
            continue

        # get the data
        X, y, metadata = self.paradigm.get_data(
            dataset, [subject], self.return_epochs
        )
        le = LabelEncoder()
        y = y if self.mne_labels else le.fit_transform(y)
        groups = metadata.session.values
        scorer = get_scorer(self.paradigm.scoring)

        for name, clf in run_pipes.items():
            # we want to store results per session
            cv = LeaveOneGroupOut()
            for train, test in cv.split(X, y, groups):
                t_start = time()
                if isinstance(X, BaseEpochs):
                    cvclf = clone(clf)
                    cvclf.fit(X[train], y[train])
                    score = scorer(cvclf, X[test], y[test])
                else:
                    result = _fit_and_score(
                        clone(clf),
                        X,
                        y,
                        scorer,
                        train,
                        test,
                        verbose=False,
                        parameters=None,
                        fit_params=None,
                        error_score=self.error_score,
                    )
                    score = result["test_scores"]

                duration = time() - t_start
                nchan = X.info["nchan"] if isinstance(X, BaseEpochs) else X.shape[1]
                res = {
                    "time": duration,
                    "dataset": dataset,
                    "subject": subject,
                    "session": groups[test][0],
                    "score": score,
                    "n_samples": len(train),
                    "n_channels": nchan,
                    "pipeline": name,
                }

                yield res
def test_classification_scorer_sample_weight():
    # Test that classification scorers support sample_weight or raise sensible
    # errors

    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0], random_state=0)
    split = train_test_split(X, y, y_ml, random_state=0)
    X_train, X_test, y_train, y_test, y_ml_train, y_ml_test = split

    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    estimator = _make_estimators(X_train, y_train, y_ml_train)

    for name in get_scorer_names():
        scorer = get_scorer(name)
        if name in REGRESSION_SCORERS:
            # skip the regression scores
            continue
        if name == "top_k_accuracy":
            # in the binary case k > 1 will always lead to a perfect score
            scorer._kwargs = {"k": 1}
        if name in MULTILABEL_ONLY_SCORERS:
            target = y_ml_test
        else:
            target = y_test
        try:
            weighted = scorer(
                estimator[name], X_test, target, sample_weight=sample_weight
            )
            ignored = scorer(estimator[name], X_test[10:], target[10:])
            unweighted = scorer(estimator[name], X_test, target)
            assert weighted != unweighted, (
                f"scorer {name} behaves identically when called with "
                f"sample weights: {weighted} vs {unweighted}"
            )
            assert_almost_equal(
                weighted,
                ignored,
                err_msg=(
                    f"scorer {name} behaves differently "
                    "when ignoring samples and setting "
                    f"sample_weight to 0: {weighted} vs {ignored}"
                ),
            )

        except TypeError as e:
            assert "sample_weight" in str(e), (
                f"scorer {name} raises unhelpful exception when called "
                f"with sample weights: {str(e)}"
            )
def get_metric(metric, gib=True, needs_proba=False, needs_threshold=False):
    """Get the right metric depending on the input type.

    Parameters
    ----------
    metric: str or callable
        Metric as a string, function or scorer.

    gib: bool, optional (default=True)
        Whether the metric is a score function or a loss function,
        i.e. if True, a higher score is better and if False, lower is
        better. Will be ignored if the metric is a string or a scorer.

    needs_proba: bool, optional (default=False)
        Whether the metric function requires probability estimates of
        a classifier. Is ignored if the metric is a string or a scorer.

    needs_threshold: bool, optional (default=False)
        Whether the metric function takes a continuous decision
        certainty. Is ignored if the metric is a string or a scorer.

    Returns
    -------
    scorer: callable
        Scorer object.

    """
    def get_scorer_name(scorer):
        """Return the name of the provided scorer."""
        for key, value in SCORERS.items():
            if scorer.__dict__ == value.__dict__:
                return key

    if isinstance(metric, str):
        if metric.lower() in METRIC_ACRONYMS:
            metric = METRIC_ACRONYMS[metric.lower()]
        elif metric not in SCORERS:
            raise ValueError("Unknown value for the metric parameter, got "
                             f"{metric}. Try one of: {', '.join(SCORERS)}.")
        metric = get_scorer(metric)
        metric.name = get_scorer_name(metric)

    elif hasattr(metric, "_score_func"):  # Provided metric is scoring
        metric.name = get_scorer_name(metric)

    else:  # Metric is a function with signature metric(y, y_pred)
        metric = make_scorer(
            score_func=metric,
            greater_is_better=gib,
            needs_proba=needs_proba,
            needs_threshold=needs_threshold,
        )
        metric.name = metric._score_func.__name__

    return metric
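
# Hedged usage sketch for get_metric above (not from the source). Only the plain
# callable path is exercised here, since the string and scorer paths rely on the
# module's own SCORERS / METRIC_ACRONYMS mappings.
from sklearn.metrics import mean_absolute_error

mae_scorer = get_metric(mean_absolute_error, gib=False)  # lower MAE is better
print(mae_scorer.name)  # name attached by get_metric: 'mean_absolute_error'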
def test_multiclass_roc_no_proba_scorer_errors(scorer_name):
    # Perceptron has no predict_proba
    scorer = get_scorer(scorer_name)
    X, y = make_classification(n_classes=3, n_informative=3, n_samples=20,
                               random_state=0)
    lr = Perceptron().fit(X, y)
    msg = "'Perceptron' object has no attribute 'predict_proba'"
    with pytest.raises(AttributeError, match=msg):
        scorer(lr, X, y)
def create_feature_reselection_experiment(maker=None, **user_kwargs):
    exp_kwargs = dict(scorer=get_scorer('neg_median_absolute_error'),
                      drift_detection=False,
                      feature_reselection=True,
                      feature_reselection_estimator_size=10,
                      feature_reselection_strategy='quantile',
                      feature_reselection_threshold=0.1,
                      feature_reselection_quantile=0.5,
                      feature_reselection_number=None)
    return _create_experiment(exp_kwargs, maker=maker, need_test=True,
                              user_kwargs=user_kwargs)
def inhome_permutation_importance(estimator, feature_groups, X, y,
                                  scoring='f1_macro', n_repeats=10,
                                  random_state=23):
    result = {'score_difference': np.zeros((len(feature_groups), n_repeats)),
              'feature_group_names': np.zeros(len(feature_groups), dtype='O')}

    X_train_original, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.2, stratify=y, random_state=random_state)
    feature_list_indices = {col: idx for idx, col
                            in enumerate(X_train_original.columns)}
    scorer = get_scorer(scoring)

    for i, (feature_group_name, feature_list) in enumerate(feature_groups.items()):
        # print(feature_group_name)
        score_difference = []
        for j in range(n_repeats):
            np.random.seed(random_state + j)
            X_train_permuted = X_train_original.copy()

            # permute feature values in selected columns
            for col in feature_list:
                # print(col)
                col_idx = feature_list_indices[col]
                permuted_indices = np.random.permutation(X_train_original.shape[0])
                # col = pd.DataFrame(np.random.uniform(low=-1.0, high=1.0,
                #     size=X_train_original.shape[0]))  # fill with random values from U(-1, 1)
                col = X_train_permuted.iloc[permuted_indices, col_idx]  # permute present values
                col.index = X_train_permuted.index
                X_train_permuted.iloc[:, col_idx] = col
            # X_train_permuted = X_train_permuted.drop(columns=feature_list)

            # train model using OLD data matrix X_train_original and evaluate
            est_original = estimator.fit(X_train_original, y_train)
            score_original = scorer(est_original, X_test, y_test)

            # train model using NEW data matrix X_train_permuted and evaluate
            est_permuted = estimator.fit(X_train_permuted, y_train)
            score_permuted = scorer(est_permuted, X_test, y_test)

            result['score_difference'][i, j] = score_original - score_permuted
        result['feature_group_names'][i] = feature_group_name

    return result
def pack_score(y_test_true_all, y_test_predict_all, scoring):
    if scoring == 'neg_root_mean_squared_error':
        return np.sqrt(np.mean((y_test_true_all - y_test_predict_all)**2))
    scorer = get_scorer(scoring)
    scorer_func = scorer._score_func
    score = scorer_func(y_test_true_all, y_test_predict_all)
    return score
def test_multiclass_roc_proba_scorer(scorer_name, metric):
    scorer = get_scorer(scorer_name)
    X, y = make_classification(n_classes=3, n_informative=3, n_samples=20,
                               random_state=0)
    lr = LogisticRegression(multi_class="multinomial").fit(X, y)
    y_proba = lr.predict_proba(X)
    expected_score = metric(y, y_proba)

    assert scorer(lr, X, y) == pytest.approx(expected_score)
def test_classification_binary_scores(scorer_name, metric):
    # check consistency between score and scorer for scores supporting
    # binary classification.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LinearSVC(random_state=0)
    clf.fit(X_train, y_train)

    score = get_scorer(scorer_name)(clf, X_test, y_test)
    expected_score = metric(y_test, clf.predict(X_test))
    assert_almost_equal(score, expected_score)
def run_grid_search(estimator, param_grid, metric, X, y, X_test, y_test,
                    seed, profile):
    _train_test_iter = StratifiedKFold(y, n_folds=5, shuffle=True,
                                       random_state=seed)
    inner_cv_func = lambda zx, zy: StratifiedShuffleSplit(
        zy, n_iter=10, test_size=0.2, random_state=seed)

    if metric == 'avgprec':
        scoring_func = get_scorer("average_precision")
    else:
        scoring_func = get_scorer("roc_auc")

    _grid_search = NestedGridSearchCV(estimator, param_grid, scoring_func,
                                      cv=_train_test_iter,
                                      inner_cv=inner_cv_func,
                                      profile=profile)
    _grid_search.fit(X, y, X_test=X_test, y_test=y_test)

    return _grid_search
def __init__(self, estimator, k_features=1,
             forward=True, floating=False,
             verbose=0, scoring=None,
             cv=5, n_jobs=1,
             pre_dispatch='2*n_jobs',
             clone_estimator=True):
    self.estimator = estimator
    self.k_features = k_features
    self.forward = forward
    self.floating = floating
    self.pre_dispatch = pre_dispatch
    # Want to raise meaningful error message if a
    # cross-validation generator is inputted
    if isinstance(cv, types.GeneratorType):
        err_msg = ('Input cv is a generator object, which is not '
                   'supported. Instead please input an iterable yielding '
                   'train, test splits. This can usually be done by '
                   'passing a cross-validation generator to the '
                   'built-in list function. I.e. cv=list(<cv-generator>)')
        raise TypeError(err_msg)
    self.cv = cv
    self.n_jobs = n_jobs
    self.verbose = verbose
    self.named_est = {key: value for key, value in
                      _name_estimators([self.estimator])}
    self.clone_estimator = clone_estimator

    if self.clone_estimator:
        self.est_ = clone(self.estimator)
    else:
        self.est_ = self.estimator

    self.scoring = scoring
    if scoring is None:
        if self.est_._estimator_type == 'classifier':
            scoring = 'accuracy'
        elif self.est_._estimator_type == 'regressor':
            scoring = 'r2'
        else:
            raise AttributeError('Estimator must '
                                 'be a Classifier or Regressor.')
    if isinstance(scoring, str):
        self.scorer = get_scorer(scoring)
    else:
        self.scorer = scoring

    self.fitted = False
    self.subsets_ = {}
    self.interrupted_ = False

    # don't mess with this unless testing
    self._TESTING_INTERRUPT_MODE = False
def __init__(self, estimator, k_features='best',
             forward=True, floating=False,
             print_progress=False, verbose=1, scoring=None,
             cv=5, skip_if_stuck=True, n_jobs=1,
             pre_dispatch='2*n_jobs',
             clone_estimator=True):
    if print_progress:
        warnings.warn(
            "The print_progress parameter "
            "has been deprecated in "
            "0.4.3 and will be removed in 0.5.0. "
            "Please use the verbose parameter instead.",
            DeprecationWarning)
        if verbose == 0:
            verbose = 1

    self.estimator = estimator
    self.k_features = k_features
    self.forward = forward
    self.floating = floating
    self.pre_dispatch = pre_dispatch
    self.scoring = scoring
    if isinstance(scoring, str):
        self.scorer = get_scorer(scoring)
    else:
        self.scorer = scoring
    self.skip_if_stuck = skip_if_stuck
    self.cv = cv
    self.n_jobs = n_jobs
    self.verbose = verbose
    self.named_est = {
        key: value for key, value in _name_estimators([self.estimator])
    }
    self.clone_estimator = clone_estimator
    if self.clone_estimator:
        self.est_ = clone(self.estimator)
    else:
        self.est_ = self.estimator
    self.fitted = False
    self.subsets_ = {}
    self.interrupted_ = False

    # don't mess with this unless testing
    self._TESTING_INTERRUPT_MODE = False
def test_get_feature_shap_values_per_fold(X, y):
    clf = DecisionTreeClassifier(max_depth=1)
    shap_values, train_score, test_score = ShapRFECV._get_feature_shap_values_per_fold(
        X, y, clf, train_index=[2, 3, 4, 5, 6, 7], val_index=[0, 1],
        scorer=get_scorer('roc_auc'))
    assert test_score == 1
    assert train_score > 0.9
    assert shap_values.shape == (2, 3)
def score_explicit(self, clf, X_train, y_train, X_test, y_test):
    scorer = get_scorer(self.paradigm.scoring)
    t_start = time()
    try:
        model = clf.fit(X_train, y_train)
        score = _score(model, X_test, y_test, scorer)
    except ValueError as e:
        if self.error_score == "raise":
            raise e
        score = self.error_score
    duration = time() - t_start
    return score, duration
def test_get_feature_shap_values_per_fold(X, y):
    """
    Test ShapRFECV's per-fold SHAP value computation.
    """
    clf = DecisionTreeClassifier(max_depth=1)
    shap_elimination = ShapRFECV(clf)
    shap_values, train_score, test_score = shap_elimination._get_feature_shap_values_per_fold(
        X, y, clf, train_index=[2, 3, 4, 5, 6, 7], val_index=[0, 1],
        scorer=get_scorer("roc_auc")
    )
    assert test_score == 1
    assert train_score > 0.9
    assert shap_values.shape == (2, 3)
def _get_scorer_from_string(self, scoring):
    if scoring == 'my_scorer':
        if not self.kernel:
            myfunc = importlib.import_module(
                'modules.myfuncs.%s' % self.configs['fit'].get('myfunc'))
        method_name = 'get_my_scorer'
        if not self.kernel:
            method_name = 'myfunc.%s' % method_name
        scorer = eval(method_name)()
    else:
        scorer = get_scorer(scoring)
    return scorer
def _test_ridge_loo(filter_):
    # test that can work with both dense or sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    fit_intercept = filter_ == DENSE_FILTER
    ridge_gcv = _RidgeGCV(fit_intercept=fit_intercept)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret.append(alpha_)

    # check that we get same best alpha with custom loss_func
    f = ignore_warnings
    scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv2.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with custom score_func
    func = lambda x, y: -mean_squared_error(x, y)
    scoring = make_scorer(func)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv3.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with a scorer
    scorer = get_scorer('neg_mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert ridge_gcv4.alpha_ == pytest.approx(alpha_)

    # check that we get same best alpha with sample weights
    if filter_ == DENSE_FILTER:
        ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                      sample_weight=np.ones(n_samples))
        assert ridge_gcv.alpha_ == pytest.approx(alpha_)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_allclose(np.vstack((y_pred, y_pred)).T, Y_pred, rtol=1e-5)

    return ret
def y_randomization_test(yx, model_dicts, crys_prop, scoring='f1_macro',
                         n_repeats=25, savefig=False):
    fig, ax = plt.subplots(nrows=len(model_dicts[crys_prop]), ncols=1,
                           figsize=(6.5, len(model_dicts[crys_prop]) * 3),
                           sharex=True)
    scorer = get_scorer(scoring)

    for i, (model_name, model) in enumerate(model_dicts[crys_prop].items()):
        X = yx.iloc[:, 1:].copy()
        y = yx.iloc[:, 0].astype(int).copy()

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, test_size=0.2, random_state=23)
        m = copy(model)
        y_initial_score = scorer(m.fit(X_train, y_train), X_test, y_test)

        y_randomized_scores = []
        for jj in range(n_repeats):
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, stratify=y, test_size=0.2, random_state=23 + jj)
            m = copy(model)
            np.random.seed(92)
            np.random.shuffle(y_train)
            score_y_randomized = scorer(m.fit(X_train, y_train), X_test, y_test)
            y_randomized_scores.append(score_y_randomized)

        # y_initial_score = train_test_estimate(X, y, copy(model), scorer)
        # y_randomized_scores = []
        # for _ in range(n_repeats):
        #     np.random.seed(92)
        #     np.random.shuffle(y)
        #     score_y_randomized = train_test_estimate(X, y, copy(model), scorer)
        #     y_randomized_scores.append(score_y_randomized)

        ax[i].hist(y_randomized_scores, bins=15, color='blue', label='y-randomized')
        ax[i].axvline(y_initial_score, color='red', lw=4, label='y-initial')
        ax[i].set_title('%s model' % (model_name))
        ax[i].legend(loc='upper left')
        ax[i].set_xlim(0.0, 1.0)

    ax[len(model_dicts[crys_prop]) - 1].set_xlabel(scoring)
    fig.suptitle(f'$\Delta${crys_prop} prediction', y=.995, fontsize=20)
    fig.tight_layout(rect=[0, 0, 1, 0.975])
    if savefig:
        fig.savefig(f'y_randomization_test_{crys_prop}.png', dpi=dpi)
    plt.show()
def test_rfecv():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # All the noisy variable were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer('accuracy')
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))
def custom_paired_ttest_cv(estimator1, estimator2, X, y,
                           cv=10,
                           scoring=None,
                           shuffle=False,
                           random_seed=None):
    kf = StratifiedKFold(random_state=random_seed, n_splits=cv, shuffle=True)

    if scoring is None:
        if estimator1._estimator_type == 'classifier':
            scoring = 'accuracy'
        elif estimator1._estimator_type == 'regressor':
            scoring = 'r2'
        else:
            raise AttributeError('Estimator must '
                                 'be a Classifier or Regressor.')
    if isinstance(scoring, str):
        scorer = get_scorer(scoring)
    else:
        scorer = scoring

    score_diff = []

    # this is probably wrong :(
    for train_index, test_index in kf.split(X, y):
        ##### THIS IS WHERE IT BECOMES "CUSTOM"
        if isinstance(X, pd.DataFrame):
            X_train = X.iloc[train_index]
            X_test = X.iloc[test_index]
        else:
            X_train = [X[i] for i in train_index]
            X_test = [X[i] for i in test_index]
        #####
        y_train, y_test = y[train_index], y[test_index]

        estimator1.fit(X_train, y_train)
        estimator2.fit(X_train, y_train)

        est1_score = scorer(estimator1, X_test, y_test)
        est2_score = scorer(estimator2, X_test, y_test)
        score_diff.append(est1_score - est2_score)

    avg_diff = np.mean(score_diff)

    numerator = avg_diff * np.sqrt(cv)
    denominator = np.sqrt(sum([(diff - avg_diff)**2 for diff in score_diff])
                          / (cv - 1))
    t_stat = numerator / denominator

    pvalue = stats.t.sf(np.abs(t_stat), cv - 1) * 2.
    return float(t_stat), float(pvalue)
def fit_and_dump(_x, _y, args):
    data = _x.copy()
    _x = categorical_to_numeric(_x)
    _y = _y[args.event].cat.codes.values

    model = create_estimator(data, _y, args.seed)

    if args.metric == 'avgprec':
        scoring_func = get_scorer("average_precision")
    else:
        scoring_func = get_scorer("roc_auc")
    model.set_params(scorer=scoring_func)

    print("Number of base estimators: %d" % len(model.base_estimators))

    print("Purging MongoDB cv_scores database")
    client = MongoClient(mongodb_host)
    db = client.ensemble_selection_classification
    db.cv_scores.remove({})

    print("Fitting %r" % model)
    _create_directories(args.models_dir, model.base_estimators)

    return model.fit(_x.values, _y)
def train(self, X, y, X_test):
    # searcher = MCTSSearcher(self.search_space_fn, use_meta_learner=self.use_meta_learner,
    #                         max_node_space=10, candidates_size=10,
    #                         optimize_direction=OptimizeDirection.Maximize)
    searcher = EvolutionSearcher(
        self.search_space_fn,
        optimize_direction=self.optimize_direction,
        population_size=30,
        sample_size=10,
        regularized=True,
        candidates_size=10,
        use_meta_learner=self.use_meta_learner)
    # searcher = RandomSearcher(lambda: search_space_general(early_stopping_rounds=20, verbose=0),
    #                           optimize_direction=OptimizeDirection.Maximize)

    es = EarlyStoppingCallback(self.earlystop_rounds, self.optimize_direction,
                               time_limit=self.time_limit,
                               expected_reward=self.expected_reward)

    hk = HyperGBM(searcher, reward_metric=self.reward_metric,
                  cache_dir=f'hypergbm_cache', clear_cache=True,
                  callbacks=[es, SummaryCallback()])

    log_callback = ConsoleCallback()
    self.experiment = CompeteExperiment(
        hk, X, y, X_test=X_test,
        eval_size=self.eval_size,
        train_test_split_strategy=self.train_test_split_strategy,
        cv=self.cv, num_folds=self.num_folds,
        callbacks=[],
        scorer=get_scorer(self.scorer),
        drop_feature_with_collinearity=self.drop_feature_with_collinearity,
        drift_detection=True,
        n_est_feature_importance=5,
        importance_threshold=1e-5,
        two_stage_importance_selection=self.two_stage_importance_selection,
        ensemble_size=self.ensemble_size,
        pseudo_labeling=self.pseudo_labeling,
        pseudo_labeling_proba_threshold=self.pseudo_labeling_proba_threshold,
        pseudo_labeling_resplit=self.pseudo_labeling_resplit,
        retrain_on_wholedata=self.retrain_on_wholedata,
    )
    self.estimator = self.experiment.run(use_cache=self.use_cache,
                                         max_trials=self.max_trials)
def score(self, X, y):
    """Score each estimator/data slice couple.

    Parameters
    ----------
    X : array, shape (n_samples, nd_features, n_estimators)
        The input samples. For each data slice, the corresponding estimator
        scores the prediction: e.g. [estimators[ii].score(X[..., ii], y)
        for ii in range(n_estimators)]
        The feature dimension can be multidimensional e.g.
        X.shape = (n_samples, n_features_1, n_features_2, n_estimators)
    y : array, shape (n_samples,) | (n_samples, n_targets)
        The target values.

    Returns
    -------
    score : array, shape (n_samples, n_estimators)
        Score for each estimator / data slice couple.
    """
    from sklearn.metrics import make_scorer, get_scorer
    self._check_Xy(X)
    if X.shape[-1] != len(self.estimators_):
        raise ValueError('The number of estimators does not match '
                         'X.shape[-1]')

    # If scoring is None (default), the predictions are internally
    # generated by estimator.score(). Else, we must first get the
    # predictions based on the scorer.
    if not isinstance(self.scoring, str):
        scoring_ = (make_scorer(self.scoring) if self.scoring is not None
                    else self.scoring)
    elif self.scoring is not None:
        scoring_ = get_scorer(self.scoring)

    # For predictions/transforms the parallelization is across the data and
    # not across the estimators to avoid memory load.
    parallel, p_func, n_jobs = parallel_func(_sl_score, self.n_jobs)
    n_jobs = min(n_jobs, X.shape[-1])
    X_splits = np.array_split(X, n_jobs, axis=-1)
    est_splits = np.array_split(self.estimators_, n_jobs)
    # score each chunk of estimators on its matching data split (x, not the full X)
    score = parallel(p_func(est, scoring_, x, y)
                     for (est, x) in zip(est_splits, X_splits))

    if n_jobs > 1:
        score = np.concatenate(score, axis=0)
    else:
        score = score[0]
    return score
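
# Hedged standalone sketch (not from the source) of the scoring resolution used in
# score() above: strings go through get_scorer, callables through make_scorer, and
# None falls back to estimator.score(). The helper name is hypothetical.
from sklearn.metrics import make_scorer, get_scorer, accuracy_score

def _resolve_scoring(scoring):
    if scoring is None:
        return None                  # caller falls back to estimator.score()
    if isinstance(scoring, str):
        return get_scorer(scoring)   # e.g. 'roc_auc', 'accuracy'
    return make_scorer(scoring)      # plain metric like accuracy_score

print(_resolve_scoring('accuracy'), _resolve_scoring(accuracy_score))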
def __init__(self, estimator, k_features=1,
             forward=True, floating=False,
             verbose=0, scoring=None,
             cv=5, n_jobs=1,
             pre_dispatch='2*n_jobs',
             clone_estimator=True):
    self.estimator = estimator
    self.k_features = k_features
    self.forward = forward
    self.floating = floating
    self.pre_dispatch = pre_dispatch
    # Want to raise meaningful error message if a
    # cross-validation generator is inputted
    if isinstance(cv, types.GeneratorType):
        err_msg = ('Input cv is a generator object, which is not '
                   'supported. Instead please input an iterable yielding '
                   'train, test splits. This can usually be done by '
                   'passing a cross-validation generator to the '
                   'built-in list function. I.e. cv=list(<cv-generator>)')
        raise TypeError(err_msg)
    self.cv = cv
    self.n_jobs = n_jobs
    self.verbose = verbose
    self.clone_estimator = clone_estimator

    if self.clone_estimator:
        self.est_ = clone(self.estimator)
    else:
        self.est_ = self.estimator

    self.scoring = scoring
    if scoring is None:
        if self.est_._estimator_type == 'classifier':
            scoring = 'accuracy'
        elif self.est_._estimator_type == 'regressor':
            scoring = 'r2'
        else:
            raise AttributeError('Estimator must '
                                 'be a Classifier or Regressor.')
    if isinstance(scoring, str):
        self.scorer = get_scorer(scoring)
    else:
        self.scorer = scoring

    self.fitted = False
    self.subsets_ = {}
    self.interrupted_ = False

    # don't mess with this unless testing
    self._TESTING_INTERRUPT_MODE = False
def evaluate(self, X, y, X_vald=None, y_vald=None):
    clf = RandomForestClassifier(n_estimators=32, max_depth=3,
                                 n_jobs=-1)  # used as base classifier
    if X_vald is None:
        return cross_val_score(clf, X, y, scoring=self._metric,
                               cv=self._cv_folds, n_jobs=-1).mean()
    else:
        clf.fit(X, y)
        sk = get_scorer(self._metric)
        return sk(clf, X_vald, y_vald)
def __init__(self, n_jobs=-1, offset_scale=1.0, n_buckets=2,
             initial_offsets=None, scoring='accuracy'):
    self.n_jobs = int(n_jobs)
    self.offset_scale = float(offset_scale)
    self.n_buckets = int(n_buckets)
    if initial_offsets is None:
        self.initial_offsets_ = [-0.5] * self.n_buckets
        pass
    else:
        self.initial_offsets_ = list(initial_offsets)
        assert(len(self.initial_offsets_) == self.n_buckets)
        pass
    from sklearn.metrics import get_scorer
    self.scoring = get_scorer(scoring)
    pass
def __init__(self, estimator, k_features=1,
             forward=True, floating=False,
             verbose=0, scoring=None,
             cv=5, skip_if_stuck=True,
             n_jobs=1,
             pre_dispatch='2*n_jobs',
             clone_estimator=True):
    self.estimator = estimator
    self.k_features = k_features
    self.forward = forward
    self.floating = floating
    self.pre_dispatch = pre_dispatch
    self.skip_if_stuck = skip_if_stuck
    self.cv = cv
    self.n_jobs = n_jobs
    self.verbose = verbose
    self.named_est = {key: value for key, value in
                      _name_estimators([self.estimator])}
    self.clone_estimator = clone_estimator
    if self.clone_estimator:
        self.est_ = clone(self.estimator)
    else:
        self.est_ = self.estimator

    self.scoring = scoring
    if scoring is None:
        if self.est_._estimator_type == 'classifier':
            scoring = 'accuracy'
        elif self.est_._estimator_type == 'regressor':
            scoring = 'r2'
        else:
            raise AttributeError('Estimator must '
                                 'be a Classifier or Regressor.')
    if isinstance(scoring, str):
        self.scorer = get_scorer(scoring)
    else:
        self.scorer = scoring

    self.fitted = False
    self.subsets_ = {}
    self.interrupted_ = False

    # don't mess with this unless testing
    self._TESTING_INTERRUPT_MODE = False
def __init__(self, estimator, k_features=1,
             forward=True, floating=False,
             verbose=0, scoring=None,
             cv=5, n_jobs=1,
             pre_dispatch='2*n_jobs',
             clone_estimator=True):
    self.estimator = estimator
    self.k_features = k_features
    self.forward = forward
    self.floating = floating
    self.pre_dispatch = pre_dispatch
    self.cv = cv
    self.n_jobs = n_jobs
    self.verbose = verbose
    self.named_est = {key: value for key, value in
                      _name_estimators([self.estimator])}
    self.clone_estimator = clone_estimator
    if self.clone_estimator:
        self.est_ = clone(self.estimator)
    else:
        self.est_ = self.estimator

    self.scoring = scoring
    if scoring is None:
        if self.est_._estimator_type == 'classifier':
            scoring = 'accuracy'
        elif self.est_._estimator_type == 'regressor':
            scoring = 'r2'
        else:
            raise AttributeError('Estimator must '
                                 'be a Classifier or Regressor.')
    if isinstance(scoring, str):
        self.scorer = get_scorer(scoring)
    else:
        self.scorer = scoring

    self.fitted = False
    self.subsets_ = {}
    self.interrupted_ = False

    # don't mess with this unless testing
    self._TESTING_INTERRUPT_MODE = False
def __init__(self, estimator, k_features,
             forward=True, floating=False,
             print_progress=True, scoring='accuracy',
             cv=5, skip_if_stuck=True,
             n_jobs=1,
             pre_dispatch='2*n_jobs'):
    self.estimator = clone(estimator)
    self.k_features = k_features
    self.forward = forward
    self.floating = floating
    self.pre_dispatch = pre_dispatch
    self.scoring = scoring
    self.scorer = get_scorer(scoring)
    self.skip_if_stuck = skip_if_stuck
    self.cv = cv
    self.print_progress = print_progress
    self.n_jobs = n_jobs
def test_deprecated_names():
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)

    for name in ('mean_absolute_error', 'mean_squared_error',
                 'median_absolute_error', 'log_loss'):
        warning_msg = "Scoring method %s was renamed to" % name
        for scorer in (get_scorer(name), SCORERS[name]):
            assert_warns_message(DeprecationWarning,
                                 warning_msg, scorer,
                                 clf, X, y)

        assert_warns_message(DeprecationWarning,
                             warning_msg, cross_val_score,
                             clf, X, y, scoring=name)
def __init__(self, estimator, min_features=1, max_features=1,
             print_progress=True, scoring='accuracy',
             cv=5, n_jobs=1,
             pre_dispatch='2*n_jobs',
             clone_estimator=True):
    self.estimator = estimator
    self.min_features = min_features
    self.max_features = max_features
    self.pre_dispatch = pre_dispatch
    self.scoring = scoring
    self.scorer = get_scorer(scoring)
    self.cv = cv
    self.print_progress = print_progress
    self.n_jobs = n_jobs
    self.named_est = {key: value for key, value in
                      _name_estimators([self.estimator])}
    self.clone_estimator = clone_estimator
    if self.clone_estimator:
        self.est_ = clone(self.estimator)
    else:
        self.est_ = self.estimator
    self.fitted = False
def test_rfecv():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # All the noisy variable were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer('accuracy')
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))

    # In the event of cross validation score ties, the expected behavior of
    # RFECV is to return the FEWEST features that maximize the CV score.
    # Because test_scorer always returns 1.0 in this example, RFECV should
    # reduce the dimensionality to a single feature (i.e. n_features_ = 1)
    assert_equal(rfecv.n_features_, 1)

    # Same as the first two tests, but with step=2
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    rfecv.fit(X, y)
    assert_equal(len(rfecv.grid_scores_), 6)
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Verifying that steps < 1 don't blow up.
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=.2, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)
def paired_ttest_5x2cv(estimator1, estimator2, X, y,
                       scoring=None,
                       random_seed=None):
    """
    Implements the 5x2cv paired t test proposed
    by Dietterich (1998)
    to compare the performance of two models.

    Parameters
    ----------
    estimator1 : scikit-learn classifier or regressor

    estimator2 : scikit-learn classifier or regressor

    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    scoring : str, callable, or None (default: None)
        If None (default), uses 'accuracy' for sklearn classifiers
        and 'r2' for sklearn regressors.
        If str, uses a sklearn scoring metric string identifier, for example
        {accuracy, f1, precision, recall, roc_auc} for classifiers,
        {'mean_absolute_error', 'mean_squared_error'/'neg_mean_squared_error',
        'median_absolute_error', 'r2'} for regressors.
        If a callable object or function is provided, it has to conform with
        sklearn's signature ``scorer(estimator, X, y)``; see
        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
        for more information.

    random_seed : int or None (default: None)
        Random seed for creating the test/train splits.

    Returns
    ----------
    t : float
        The t-statistic

    pvalue : float
        Two-tailed p-value.
        If the chosen significance level is larger
        than the p-value, we reject the null hypothesis
        and accept that there are significant differences
        in the two compared models.

    """
    rng = np.random.RandomState(random_seed)

    if scoring is None:
        if estimator1._estimator_type == 'classifier':
            scoring = 'accuracy'
        elif estimator1._estimator_type == 'regressor':
            scoring = 'r2'
        else:
            raise AttributeError('Estimator must '
                                 'be a Classifier or Regressor.')
    if isinstance(scoring, str):
        scorer = get_scorer(scoring)
    else:
        scorer = scoring

    variance_sum = 0.
    first_diff = None

    def score_diff(X_1, X_2, y_1, y_2):
        estimator1.fit(X_1, y_1)
        estimator2.fit(X_1, y_1)
        est1_score = scorer(estimator1, X_2, y_2)
        est2_score = scorer(estimator2, X_2, y_2)
        score_diff = est1_score - est2_score
        return score_diff

    for i in range(5):
        randint = rng.randint(low=0, high=32767)

        X_1, X_2, y_1, y_2 = \
            train_test_split(X, y, test_size=0.5,
                             random_state=randint)

        score_diff_1 = score_diff(X_1, X_2, y_1, y_2)
        score_diff_2 = score_diff(X_2, X_1, y_2, y_1)
        score_mean = (score_diff_1 + score_diff_2) / 2.
        score_var = ((score_diff_1 - score_mean)**2 +
                     (score_diff_2 - score_mean)**2)
        variance_sum += score_var
        if first_diff is None:
            first_diff = score_diff_1

    numerator = first_diff
    denominator = np.sqrt(1/5. * variance_sum)
    t_stat = numerator / denominator

    pvalue = stats.t.sf(np.abs(t_stat), 5)*2.
    return float(t_stat), float(pvalue)
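
# Hedged usage sketch for paired_ttest_5x2cv above (not from the source); the
# dataset and estimators are illustrative assumptions.
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = load_breast_cancer(return_X_y=True)
t, p = paired_ttest_5x2cv(LogisticRegression(max_iter=5000),
                          DecisionTreeClassifier(random_state=0),
                          X_demo, y_demo, scoring='accuracy', random_seed=1)
print('t=%.3f, p=%.3f' % (t, p))  # small p suggests a real performance difference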
def tune(X, y, estimator, param_grid):
    gcv = GridSearchCV(estimator, param_grid, refit=True,
                       scoring=get_scorer('roc_auc'),
                       n_jobs=-1, verbose=5)
    gcv.fit(X, y)
    return gcv.best_estimator_
def paired_ttest_resampled(estimator1, estimator2, X, y,
                           num_rounds=30, test_size=0.3,
                           scoring=None,
                           random_seed=None):
    """
    Implements the resampled paired t test procedure
    to compare the performance of two models
    (also called k-hold-out paired t test).

    Parameters
    ----------
    estimator1 : scikit-learn classifier or regressor

    estimator2 : scikit-learn classifier or regressor

    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    num_rounds : int (default: 30)
        Number of resampling iterations
        (i.e., train/test splits)

    test_size : float or int (default: 0.3)
        If float, should be between 0.0 and 1.0 and
        represent the proportion of the dataset to use
        as a test set.
        If int, represents the absolute number of test samples.

    scoring : str, callable, or None (default: None)
        If None (default), uses 'accuracy' for sklearn classifiers
        and 'r2' for sklearn regressors.
        If str, uses a sklearn scoring metric string identifier, for example
        {accuracy, f1, precision, recall, roc_auc} for classifiers,
        {'mean_absolute_error', 'mean_squared_error'/'neg_mean_squared_error',
        'median_absolute_error', 'r2'} for regressors.
        If a callable object or function is provided, it has to conform with
        sklearn's signature ``scorer(estimator, X, y)``; see
        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
        for more information.

    random_seed : int or None (default: None)
        Random seed for creating the test/train splits.

    Returns
    ----------
    t : float
        The t-statistic

    pvalue : float
        Two-tailed p-value.
        If the chosen significance level is larger
        than the p-value, we reject the null hypothesis
        and accept that there are significant differences
        in the two compared models.

    """
    if (not isinstance(test_size, int) and
            not isinstance(test_size, float)):
        raise ValueError('test_size must be of '
                         'type int or float. Got %s.' % type(test_size))

    rng = np.random.RandomState(random_seed)

    if scoring is None:
        if estimator1._estimator_type == 'classifier':
            scoring = 'accuracy'
        elif estimator1._estimator_type == 'regressor':
            scoring = 'r2'
        else:
            raise AttributeError('Estimator must '
                                 'be a Classifier or Regressor.')
    if isinstance(scoring, str):
        scorer = get_scorer(scoring)
    else:
        scorer = scoring

    score_diff = []
    for i in range(num_rounds):
        randint = rng.randint(low=0, high=32767)

        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=test_size,
                             random_state=randint)

        estimator1.fit(X_train, y_train)
        estimator2.fit(X_train, y_train)

        est1_score = scorer(estimator1, X_test, y_test)
        est2_score = scorer(estimator2, X_test, y_test)
        score_diff.append(est1_score - est2_score)

    avg_diff = np.mean(score_diff)

    numerator = avg_diff * np.sqrt(num_rounds)
    denominator = np.sqrt(sum([(diff - avg_diff)**2 for diff in score_diff])
                          / (num_rounds - 1))
    t_stat = numerator / denominator

    pvalue = stats.t.sf(np.abs(t_stat), num_rounds - 1)*2.
    return float(t_stat), float(pvalue)
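
# Hedged usage sketch for paired_ttest_resampled above (not from the source); the
# regression dataset and estimators are illustrative assumptions. Dietterich (1998)
# reports elevated Type I error for this resampled test, which is what motivated
# the 5x2cv variants in this collection.
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor

X_demo, y_demo = load_diabetes(return_X_y=True)
t, p = paired_ttest_resampled(Ridge(), DecisionTreeRegressor(random_state=0),
                              X_demo, y_demo, num_rounds=30, test_size=0.3,
                              scoring='r2', random_seed=1)
print('t=%.3f, p=%.3f' % (t, p))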
def combined_ftest_5x2cv(estimator1, estimator2, X, y,
                         scoring=None,
                         random_seed=None):
    """
    Implements the 5x2cv combined F test proposed
    by Alpaydin 1999,
    to compare the performance of two models.

    Parameters
    ----------
    estimator1 : scikit-learn classifier or regressor

    estimator2 : scikit-learn classifier or regressor

    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    scoring : str, callable, or None (default: None)
        If None (default), uses 'accuracy' for sklearn classifiers
        and 'r2' for sklearn regressors.
        If str, uses a sklearn scoring metric string identifier, for example
        {accuracy, f1, precision, recall, roc_auc} for classifiers,
        {'mean_absolute_error', 'mean_squared_error'/'neg_mean_squared_error',
        'median_absolute_error', 'r2'} for regressors.
        If a callable object or function is provided, it has to conform with
        sklearn's signature ``scorer(estimator, X, y)``; see
        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
        for more information.

    random_seed : int or None (default: None)
        Random seed for creating the test/train splits.

    Returns
    ----------
    f : float
        The F-statistic

    pvalue : float
        Two-tailed p-value.
        If the chosen significance level is larger
        than the p-value, we reject the null hypothesis
        and accept that there are significant differences
        in the two compared models.

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/evaluate/combined_ftest_5x2cv/

    """
    rng = np.random.RandomState(random_seed)

    if scoring is None:
        if estimator1._estimator_type == 'classifier':
            scoring = 'accuracy'
        elif estimator1._estimator_type == 'regressor':
            scoring = 'r2'
        else:
            raise AttributeError('Estimator must '
                                 'be a Classifier or Regressor.')
    if isinstance(scoring, str):
        scorer = get_scorer(scoring)
    else:
        scorer = scoring

    variances = []
    differences = []

    def score_diff(X_1, X_2, y_1, y_2):
        estimator1.fit(X_1, y_1)
        estimator2.fit(X_1, y_1)
        est1_score = scorer(estimator1, X_2, y_2)
        est2_score = scorer(estimator2, X_2, y_2)
        score_diff = est1_score - est2_score
        return score_diff

    for i in range(5):
        randint = rng.randint(low=0, high=32767)

        X_1, X_2, y_1, y_2 = \
            train_test_split(X, y, test_size=0.5,
                             random_state=randint)

        score_diff_1 = score_diff(X_1, X_2, y_1, y_2)
        score_diff_2 = score_diff(X_2, X_1, y_2, y_1)
        score_mean = (score_diff_1 + score_diff_2) / 2.
        score_var = ((score_diff_1 - score_mean)**2 +
                     (score_diff_2 - score_mean)**2)
        differences.extend([score_diff_1**2, score_diff_2**2])
        variances.append(score_var)

    numerator = sum(differences)
    denominator = 2*(sum(variances))
    f_stat = numerator / denominator

    p_value = scipy.stats.f.sf(f_stat, 10, 5)
    return float(f_stat), float(p_value)
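
# Hedged usage sketch for combined_ftest_5x2cv above (not from the source); the
# call signature mirrors paired_ttest_5x2cv, but an F statistic with (10, 5)
# degrees of freedom is returned instead of a t statistic. The dataset and
# estimators are illustrative assumptions.
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

X_demo, y_demo = load_breast_cancer(return_X_y=True)
f, p = combined_ftest_5x2cv(LogisticRegression(max_iter=5000),
                            KNeighborsClassifier(),
                            X_demo, y_demo, scoring='accuracy', random_seed=1)
print('F=%.3f, p=%.3f' % (f, p))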
def test_SearchLight():
    """Test _SearchLight"""
    from sklearn.linear_model import Ridge, LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.metrics import roc_auc_score, get_scorer, make_scorer

    X, y = make_data()
    n_epochs, _, n_time = X.shape
    # init
    assert_raises(ValueError, _SearchLight, 'foo')
    sl = _SearchLight(Ridge())
    sl = _SearchLight(LogisticRegression())
    # fit
    assert_equal(sl.__repr__()[:14], '<_SearchLight(')
    sl.fit(X, y)
    assert_equal(sl.__repr__()[-28:], ', fitted with 10 estimators>')
    assert_raises(ValueError, sl.fit, X[1:], y)
    assert_raises(ValueError, sl.fit, X[:, :, 0], y)

    # transforms
    assert_raises(ValueError, sl.predict, X[:, :, :2])
    y_pred = sl.predict(X)
    assert_true(y_pred.dtype == int)
    assert_array_equal(y_pred.shape, [n_epochs, n_time])
    y_proba = sl.predict_proba(X)
    assert_true(y_proba.dtype == float)
    assert_array_equal(y_proba.shape, [n_epochs, n_time, 2])

    # score
    score = sl.score(X, y)
    assert_array_equal(score.shape, [n_time])
    assert_true(np.sum(np.abs(score)) != 0)
    assert_true(score.dtype == float)

    sl = _SearchLight(LogisticRegression())
    assert_equal(sl.scoring, None)

    # Scoring method
    for err, scoring in [(ValueError, 'foo'), (TypeError, 999)]:
        sl = _SearchLight(LogisticRegression(), scoring=scoring)
        sl.fit(X, y)
        assert_raises(err, sl.score, X, y)

    # Check sklearn's roc_auc fix: scikit-learn/scikit-learn#6874
    # -- 3 class problem
    sl = _SearchLight(LogisticRegression(random_state=0), scoring='roc_auc')
    y = np.arange(len(X)) % 3
    sl.fit(X, y)
    assert_raises(ValueError, sl.score, X, y)
    # -- 2 class problem not in [0, 1]
    y = np.arange(len(X)) % 2 + 1
    sl.fit(X, y)
    score = sl.score(X, y)
    assert_array_equal(score, [roc_auc_score(y - 1, _y_pred - 1)
                               for _y_pred in sl.decision_function(X).T])

    y = np.arange(len(X)) % 2
    for method, scoring in [
            ('predict_proba', 'roc_auc'), ('predict', roc_auc_score)]:
        sl1 = _SearchLight(LogisticRegression(), scoring=scoring)
        sl1.fit(X, y)
        np.random.seed(0)
        X = np.random.randn(*X.shape)  # randomize X to avoid AUCs in [0, 1]
        score_sl = sl1.score(X, y)
        assert_array_equal(score_sl.shape, [n_time])
        assert_true(score_sl.dtype == float)

        # Check that scoring was applied adequately
        if isinstance(scoring, str):
            scoring = get_scorer(scoring)
        else:
            scoring = make_scorer(scoring)
        score_manual = [scoring(est, x, y) for est, x in zip(
                        sl1.estimators_, X.transpose(2, 0, 1))]
        assert_array_equal(score_manual, score_sl)

    # n_jobs
    sl = _SearchLight(LogisticRegression(random_state=0), n_jobs=1,
                      scoring='roc_auc')
    score_1job = sl.fit(X, y).score(X, y)
    sl.n_jobs = 2
    score_njobs = sl.fit(X, y).score(X, y)
    assert_array_equal(score_1job, score_njobs)
    sl.predict(X)

    # n_jobs > n_estimators
    sl.fit(X[..., [0]], y)
    sl.predict(X[..., [0]])

    # pipeline
    class _LogRegTransformer(LogisticRegression):
        # XXX needs transformer in pipeline to get first proba only
        def transform(self, X):
            return super(_LogRegTransformer, self).predict_proba(X)[..., 1]

    pipe = make_pipeline(_SearchLight(_LogRegTransformer()),
                         LogisticRegression())
    pipe.fit(X, y)
    pipe.predict(X)

    # n-dimensional feature space
    X = np.random.rand(10, 3, 4, 2)
    y = np.arange(10) % 2
    y_preds = list()
    for n_jobs in [1, 2]:
        pipe = _SearchLight(
            make_pipeline(Vectorizer(), LogisticRegression()),
            n_jobs=n_jobs)
        y_preds.append(pipe.fit(X, y).predict(X))
        features_shape = pipe.estimators_[0].steps[0][1].features_shape_
        assert_array_equal(features_shape, [3, 4])
    assert_array_equal(y_preds[0], y_preds[1])
def _test_ridge_loo(filter_):
    # test that can work with both dense or sparse matrices
    n_samples = X_diabetes.shape[0]

    ret = []

    ridge_gcv = _RidgeGCV(fit_intercept=False)
    ridge = Ridge(alpha=1.0, fit_intercept=False)

    # generalized cross-validation (efficient leave-one-out)
    decomp = ridge_gcv._pre_compute(X_diabetes, y_diabetes)
    errors, c = ridge_gcv._errors(1.0, y_diabetes, *decomp)
    values, c = ridge_gcv._values(1.0, y_diabetes, *decomp)

    # brute-force leave-one-out: remove one example at a time
    errors2 = []
    values2 = []
    for i in range(n_samples):
        sel = np.arange(n_samples) != i
        X_new = X_diabetes[sel]
        y_new = y_diabetes[sel]
        ridge.fit(X_new, y_new)
        value = ridge.predict([X_diabetes[i]])[0]
        error = (y_diabetes[i] - value) ** 2
        errors2.append(error)
        values2.append(value)

    # check that efficient and brute-force LOO give same results
    assert_almost_equal(errors, errors2)
    assert_almost_equal(values, values2)

    # generalized cross-validation (efficient leave-one-out,
    # SVD variation)
    decomp = ridge_gcv._pre_compute_svd(X_diabetes, y_diabetes)
    errors3, c = ridge_gcv._errors_svd(ridge.alpha, y_diabetes, *decomp)
    values3, c = ridge_gcv._values_svd(ridge.alpha, y_diabetes, *decomp)

    # check that efficient and SVD efficient LOO give same results
    assert_almost_equal(errors, errors3)
    assert_almost_equal(values, values3)

    # check best alpha
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    alpha_ = ridge_gcv.alpha_
    ret.append(alpha_)

    # check that we get same best alpha with custom loss_func
    f = ignore_warnings
    scoring = make_scorer(mean_squared_error, greater_is_better=False)
    ridge_gcv2 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv2.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv2.alpha_, alpha_)

    # check that we get same best alpha with custom score_func
    func = lambda x, y: -mean_squared_error(x, y)
    scoring = make_scorer(func)
    ridge_gcv3 = RidgeCV(fit_intercept=False, scoring=scoring)
    f(ridge_gcv3.fit)(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv3.alpha_, alpha_)

    # check that we get same best alpha with a scorer
    scorer = get_scorer('mean_squared_error')
    ridge_gcv4 = RidgeCV(fit_intercept=False, scoring=scorer)
    ridge_gcv4.fit(filter_(X_diabetes), y_diabetes)
    assert_equal(ridge_gcv4.alpha_, alpha_)

    # check that we get same best alpha with sample weights
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes,
                  sample_weight=np.ones(n_samples))
    assert_equal(ridge_gcv.alpha_, alpha_)

    # simulate several responses
    Y = np.vstack((y_diabetes, y_diabetes)).T

    ridge_gcv.fit(filter_(X_diabetes), Y)
    Y_pred = ridge_gcv.predict(filter_(X_diabetes))
    ridge_gcv.fit(filter_(X_diabetes), y_diabetes)
    y_pred = ridge_gcv.predict(filter_(X_diabetes))

    assert_array_almost_equal(np.vstack((y_pred, y_pred)).T,
                              Y_pred, decimal=5)

    return ret
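# Hedged sketch (not part of the original test suite): the same efficient
# leave-one-out idea can be illustrated with public sklearn APIs only, by
# comparing RidgeCV's built-in LOO alpha selection against an explicit
# LeaveOneOut grid search. The helper name `_demo_ridge_loo_equivalence`,
# the dataset, and the alpha grid are illustrative assumptions; exact
# agreement is expected up to floating-point ties.
def _demo_ridge_loo_equivalence():
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import Ridge, RidgeCV
    from sklearn.model_selection import GridSearchCV, LeaveOneOut

    X, y = load_diabetes(return_X_y=True)
    alphas = [0.1, 1.0, 10.0]

    # Efficient leave-one-out (generalized cross-validation) inside RidgeCV.
    ridge_cv = RidgeCV(alphas=alphas, fit_intercept=False).fit(X, y)

    # Brute-force leave-one-out over the same alpha grid.
    grid = GridSearchCV(Ridge(fit_intercept=False),
                        param_grid={'alpha': alphas},
                        cv=LeaveOneOut(),
                        scoring='neg_mean_squared_error').fit(X, y)

    # Both procedures minimize the mean squared LOO error, so they should
    # select the same alpha (barring ties).
    assert grid.best_params_['alpha'] == ridge_cv.alpha_
    return ridge_cv.alpha_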
def paired_ttest_kfold_cv(estimator1, estimator2, X, y,
                          cv=10,
                          scoring=None,
                          shuffle=False,
                          random_seed=None):
    """
    Implements the k-fold paired t test procedure
    to compare the performance of two models.

    Parameters
    ----------
    estimator1 : scikit-learn classifier or regressor

    estimator2 : scikit-learn classifier or regressor

    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    cv : int (default: 10)
        Number of splits and iterations for the
        cross-validation procedure

    scoring : str, callable, or None (default: None)
        If None (default), uses 'accuracy' for sklearn classifiers
        and 'r2' for sklearn regressors.
        If str, uses a sklearn scoring metric string identifier, for example
        {accuracy, f1, precision, recall, roc_auc} for classifiers,
        {'mean_absolute_error', 'mean_squared_error'/'neg_mean_squared_error',
        'median_absolute_error', 'r2'} for regressors.
        If a callable object or function is provided, it has to conform to
        sklearn's signature ``scorer(estimator, X, y)``; see
        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
        for more information.

    shuffle : bool (default: False)
        Whether to shuffle the dataset for generating
        the k-fold splits.

    random_seed : int or None (default: None)
        Random seed for shuffling the dataset
        for generating the k-fold splits.
        Ignored if shuffle=False.

    Returns
    -------
    t : float
        The t-statistic

    pvalue : float
        Two-tailed p-value.
        If the chosen significance level is larger
        than the p-value, we reject the null hypothesis
        and accept that there are significant differences
        in the two compared models.

    """
    # random_state is only relevant when shuffle=True; pass None otherwise
    # to stay compatible with newer scikit-learn versions.
    kf = KFold(n_splits=cv,
               random_state=random_seed if shuffle else None,
               shuffle=shuffle)

    if scoring is None:
        if estimator1._estimator_type == 'classifier':
            scoring = 'accuracy'
        elif estimator1._estimator_type == 'regressor':
            scoring = 'r2'
        else:
            raise AttributeError('Estimator must '
                                 'be a Classifier or Regressor.')
    if isinstance(scoring, str):
        scorer = get_scorer(scoring)
    else:
        scorer = scoring

    score_diff = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        estimator1.fit(X_train, y_train)
        estimator2.fit(X_train, y_train)

        est1_score = scorer(estimator1, X_test, y_test)
        est2_score = scorer(estimator2, X_test, y_test)
        score_diff.append(est1_score - est2_score)

    avg_diff = np.mean(score_diff)

    numerator = avg_diff * np.sqrt(cv)
    denominator = np.sqrt(sum([(diff - avg_diff)**2 for diff in score_diff])
                          / (cv - 1))
    t_stat = numerator / denominator

    pvalue = stats.t.sf(np.abs(t_stat), cv - 1) * 2.
    return float(t_stat), float(pvalue)
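# Hedged usage sketch (not part of the original module): shows how
# paired_ttest_kfold_cv could be called. The helper name
# `_demo_paired_ttest_kfold_cv`, the synthetic dataset, and the estimators
# are illustrative assumptions only.
def _demo_paired_ttest_kfold_cv():
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier

    X_demo, y_demo = make_classification(n_samples=500, random_state=0)
    t, p = paired_ttest_kfold_cv(LogisticRegression(random_state=0),
                                 DecisionTreeClassifier(random_state=0),
                                 X_demo, y_demo,
                                 cv=10,
                                 shuffle=True,
                                 random_seed=1)
    # With k=10 folds, t is compared against a t distribution with
    # k-1 = 9 degrees of freedom; reject the null hypothesis of equal
    # performance if p < alpha (e.g. 0.05).
    return t, p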