def check_classifier_ratio(clf, method, cv):
    # Passing distributions directly
    p0 = Normal(mu=0.0)
    p1 = Normal(mu=0.1)

    ratio = ClassifierRatio(
        CalibratedClassifierCV(base_estimator=clf, method=method, cv=cv))
    ratio.fit(numerator=p0, denominator=p1, n_samples=10000)

    reals = np.linspace(-1, 1, num=100).reshape(-1, 1)
    assert ratio.score(reals, p0.pdf(reals) / p1.pdf(reals)) > -0.1
    assert np.mean(
        np.abs(np.log(ratio.predict(reals)) -
               ratio.predict(reals, log=True))) < 0.01

    # Passing X, y only
    X = np.vstack((p0.rvs(5000), p1.rvs(5000)))
    y = np.zeros(10000, dtype=int)
    y[5000:] = 1

    ratio = ClassifierRatio(
        CalibratedClassifierCV(base_estimator=clf, method=method, cv=cv))
    ratio.fit(X=X, y=y)

    reals = np.linspace(-1, 1, num=100).reshape(-1, 1)
    assert ratio.score(reals, p0.pdf(reals) / p1.pdf(reals)) > -0.1
    assert np.mean(
        np.abs(np.log(ratio.predict(reals)) -
               ratio.predict(reals, log=True))) < 0.01
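# A minimal driver for the check above -- a sketch, not part of the original
# tests. It assumes the carl-style names already used here (ClassifierRatio,
# CalibratedClassifierCV, Normal); the choice of LogisticRegression with
# "isotonic" calibration and cv=3 is illustrative only.
from sklearn.linear_model import LogisticRegression


def test_classifier_ratio_logistic():
    check_classifier_ratio(LogisticRegression(), method="isotonic", cv=3)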
def calibrated_predict(
    self,
    X: np.ndarray,
    theta: np.ndarray,
    n_samples_per_theta: int,
    simulator_func: Callable,
    calibration_params: Dict,
    log=True,
    return_calibrated_model=False,
):
    # Wrap the already-fitted classifier in a calibration layer; cv='prefit'
    # leaves the underlying classifier untouched and only fits the calibrator.
    cal_clf = CalibratedClassifierCV(base_estimator=self.clf, cv='prefit',
                                     **calibration_params)
    cal_model = self.__class__(theta_0=self.theta_0, clf=cal_clf)

    # Draw a fresh calibration set from the simulator at the requested theta.
    calibration_ds = SinglyParameterizedRatioDataset.from_simulator(
        simulator_func=simulator_func,
        theta_0=self.theta_0,
        theta_1_iterator=SingleParamIterator(theta),
        n_samples_per_theta=n_samples_per_theta,
    )
    cal_model.fit(X=calibration_ds.x,
                  theta_1s=calibration_ds.theta_1s,
                  y=calibration_ds.y)

    # Predict the (log-)ratio at theta for every row of X.
    theta_1s = stack_repeat(theta, len(X))
    pred = cal_model.predict(X=X, theta_1s=theta_1s, log=log)

    if return_calibrated_model:
        return pred, cal_model
    else:
        return pred
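# Hypothetical call site for calibrated_predict, shown only to illustrate the
# argument shapes. `model`, `x_obs` and `simulate` are placeholder names, and
# the calibration_params key assumes a method accepted by the surrounding
# CalibratedClassifierCV (e.g. 'isotonic'); adjust to the actual codebase.
log_r, cal_model = model.calibrated_predict(
    X=x_obs,
    theta=np.array([0.05]),
    n_samples_per_theta=10000,
    simulator_func=simulate,
    calibration_params=dict(method='isotonic'),
    log=True,
    return_calibrated_model=True,
)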
def test_classifier_ratio_identity():
    p = Normal(mu=0.0)

    ratio = ClassifierRatio(
        CalibratedClassifierCV(base_estimator=ElasticNetCV()))
    ratio.fit(numerator=p, denominator=p, n_samples=10000)

    reals = np.linspace(-0.5, 1.0, num=100).reshape(-1, 1)
    assert ratio.score(reals, p.pdf(reals) / p.pdf(reals)) == 0.0
    assert_array_almost_equal(ratio.predict(reals), np.ones(len(reals)))
    assert_array_almost_equal(ratio.predict(reals, log=True),
                              np.zeros(len(reals)))
def test_decomposed_ratio_identity():
    components = [Normal(mu=0.0), Normal(mu=0.25), Normal(mu=0.5)]
    p = Mixture(components=components, weights=[0.45, 0.1, 0.45])

    ratio = DecomposedRatio(
        ClassifierRatio(CalibratedClassifierCV(base_estimator=ElasticNetCV())))
    ratio.fit(numerator=p, denominator=p, n_samples=10000)

    reals = np.linspace(-0.5, 1.0, num=100).reshape(-1, 1)
    assert ratio.score(reals, p.pdf(reals) / p.pdf(reals)) == 0.0
    assert_array_almost_equal(ratio.predict(reals), np.ones(len(reals)))
    assert_array_almost_equal(ratio.predict(reals, log=True),
                              np.zeros(len(reals)))
def test_decomposed_ratio():
    components = [Normal(mu=0.0), Normal(mu=0.25), Normal(mu=0.5)]
    p0 = Mixture(components=components, weights=[0.45, 0.1, 0.45])
    p1 = Mixture(components=[components[0]] + [components[2]])

    ratio = DecomposedRatio(
        ClassifierRatio(CalibratedClassifierCV(base_estimator=ElasticNetCV())))
    ratio.fit(numerator=p0, denominator=p1, n_samples=10000)

    reals = np.linspace(-0.5, 1.0, num=100).reshape(-1, 1)
    assert ratio.score(reals, p0.pdf(reals) / p1.pdf(reals)) > -0.1
    assert np.mean(np.abs(np.log(ratio.predict(reals)) -
                          ratio.predict(reals, log=True))) < 0.01
def make_ratio(num):
    # Build a labelled training set: numerator samples (label 0) stacked on
    # top of denominator samples (label 1).
    X_num = Xs_s[num]
    X_den = X1_s
    X = np.vstack((X_num, X_den))
    y = np.zeros(len(X_num) + len(X_den), dtype=int)
    y[len(X_num):] = 1

    clf = ExtraTreesClassifier(n_estimators=100, min_samples_split=20,
                               random_state=0, n_jobs=-1)
    # clf = KerasClassifier(make_model_join, nb_epoch=50, verbose=0)
    cv = StratifiedShuffleSplit(n_iter=3, test_size=0.5, random_state=1)

    ratio = ClassifierRatio(
        base_estimator=CalibratedClassifierCV(clf, cv=cv, bins=20),
        random_state=0)
    ratio.fit(X, y)

    # Report the log-loss of the first calibrated classifier on the first
    # 30% of the stacked data.
    n_eval = int(len(X) * 0.3)
    print('Loss {0} : {1}'.format(
        num,
        log_loss(y[:n_eval],
                 ratio.classifier_.classifiers_[0].predict(X[:n_eval]))))

    return ratio
def check_calibration(method):
    # Adapted from sklearn/tests/test_calibration.py
    # Authors: Alexandre Gramfort
    # License: BSD 3 clause
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)
    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train = X[:n_samples], y[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes
    clf = MultinomialNB().fit(X_train, y_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    pc_clf = CalibratedClassifierCV(clf, cv=y.size + 1)
    assert_raises(ValueError, pc_clf.fit, X, y)

    pc_clf = CalibratedClassifierCV(clf, method=method, cv=2)
    # Note that this fit overwrites the fit on the entire training set
    pc_clf.fit(X_train, y_train)
    prob_pos_pc_clf = pc_clf.predict_proba(X_test)[:, 1]

    # Check that brier score has improved after calibration
    assert_greater(brier_score_loss(y_test, prob_pos_clf),
                   brier_score_loss(y_test, prob_pos_pc_clf))

    # Check invariance against relabeling [0, 1] -> [1, 2]
    pc_clf.fit(X_train, y_train + 1)
    prob_pos_pc_clf_relabeled = pc_clf.predict_proba(X_test)[:, 1]
    assert_array_almost_equal(prob_pos_pc_clf, prob_pos_pc_clf_relabeled)

    # Check invariance against relabeling [0, 1] -> [-1, 1]
    pc_clf.fit(X_train, 2 * y_train - 1)
    prob_pos_pc_clf_relabeled = pc_clf.predict_proba(X_test)[:, 1]
    assert_array_almost_equal(prob_pos_pc_clf, prob_pos_pc_clf_relabeled)

    # Check invariance against relabeling [0, 1] -> [1, 0]
    pc_clf.fit(X_train, (y_train + 1) % 2)
    prob_pos_pc_clf_relabeled = pc_clf.predict_proba(X_test)[:, 1]

    if method == "sigmoid":
        assert_array_almost_equal(prob_pos_pc_clf,
                                  1 - prob_pos_pc_clf_relabeled)
    else:
        # Isotonic calibration is not invariant against relabeling
        # but should improve in both cases
        assert_greater(
            brier_score_loss(y_test, prob_pos_clf),
            brier_score_loss((y_test + 1) % 2, prob_pos_pc_clf_relabeled))
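# A sketch of a parametrized driver for the check above; the set of
# calibration methods listed here is illustrative, not exhaustive.
import pytest


@pytest.mark.parametrize("method", ["sigmoid", "isotonic"])
def test_calibration(method):
    check_calibration(method)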