def _train_local_classifier(
        estimator,
        samples,
        similarity,
        predict_proba,
        expand_factor=10,
        test_size=0.3,
        random_state=None,
):
    # type: (Any, Any, np.ndarray, Callable[[Any], np.ndarray], int, float, Any) -> Dict[str, float]
    """
    Fit ``estimator`` so that it mimics ``predict_proba`` on ``samples``
    and report how well it does on a held-out part of the data.

    The target probabilities are obtained by calling
    ``predict_proba(samples)``. Samples, similarities and target
    probabilities are split with ``train_test_split`` (``test_size`` and
    the random state are forwarded to it). The estimator is fitted on the
    train part via ``fit_proba`` using the train similarities as
    ``sample_weight``; ``expand_factor`` is passed through to
    ``fit_proba`` unchanged.

    Returns a dict with two entries computed on the test part, both
    weighted by the test similarities:

    * ``'mean_KL_divergence'`` - result of ``mean_kl_divergence`` between
      the estimator's predicted probabilities and the target ones;
    * ``'score'`` - result of ``score_with_sample_weight`` against the
      argmax of the target probabilities.
    """
    rng = check_random_state(random_state)
    target_proba = predict_proba(samples)

    split = train_test_split(
        samples,
        similarity,
        target_proba,
        test_size=test_size,
        random_state=rng,
    )
    (train_samples, test_samples,
     train_weights, test_weights,
     train_proba, test_proba) = split

    # XXX: the original lime code does not use a probabilistic classifier
    # here; it builds several regression models which try to output
    # probabilities.
    #
    # XXX: probability information helps because it can be hard to get
    # enough examples of every class automatically; that is why the
    # classifier is fitted to reproduce the probabilities themselves,
    # not only the best answer.

    # TODO: feature selection.
    # Ideally it should be supported as a Pipeline (i.e. the user should
    # be able to configure it).
    fit_proba(
        estimator,
        train_samples,
        train_proba,
        expand_factor=expand_factor,
        sample_weight=train_weights,
        random_state=rng,
    )

    predicted_proba = estimator.predict_proba(test_samples)

    # Both metrics are weighted by similarity of the held-out samples.
    kl_divergence = mean_kl_divergence(
        predicted_proba,
        test_proba,
        sample_weight=test_weights,
    )
    weighted_score = score_with_sample_weight(
        estimator,
        test_samples,
        test_proba.argmax(axis=1),
        sample_weight=test_weights,
    )
    return {
        'mean_KL_divergence': kl_divergence,
        'score': weighted_score,
    }
def test_fit_proba():
    """
    Compare ``fit_proba`` against plain ``fit`` on a tiny 2-class dataset.

    Checked properties:

    * fitting on probabilities (``expand_factor=200``) gives a noticeably
      lower mean absolute error than fitting on argmax labels;
    * a large ``sample_weight`` on one sample pulls its prediction closer
      to the target probability;
    * ``expand_factor=None`` gives the same predictions as ``clf.fit``
      on argmax labels;
    * a sparse feature matrix gives the same predictions as a dense one.
    """
    features = np.array([
        [0.0, 0.8],
        [0.0, 0.5],
        [1.0, 0.1],
        [0.9, 0.2],
        [0.7, 0.3],
    ])
    target_proba = np.array([
        [0.0, 1.0],
        [0.1, 0.9],
        [1.0, 0.0],
        [0.55, 0.45],
        [0.4, 0.6],
    ])
    hard_labels = target_proba.argmax(axis=1)

    def new_classifier():
        # every scenario starts from an identically configured model
        return LogisticRegression(C=10, random_state=42)

    def positive_proba(model):
        # predicted probability of class 1 for the (dense) features
        return model.predict_proba(features)[:, 1]

    # baseline: fit on binary labels
    baseline = new_classifier()
    baseline.fit(features, hard_labels)
    pred_baseline = positive_proba(baseline)
    mae_baseline = mean_absolute_error(target_proba[:, 1], pred_baseline)
    print(pred_baseline, mae_baseline)

    # fit on probabilities
    soft = new_classifier()
    fit_proba(soft, features, target_proba,
              expand_factor=200, random_state=42)
    pred_soft = positive_proba(soft)
    mae_soft = mean_absolute_error(target_proba[:, 1], pred_soft)
    print(pred_soft, mae_soft)
    assert mae_soft * 1.2 < mae_baseline

    # put almost all of the weight on the sample at index 3,
    # so that it is predicted really well
    weights = np.array([0.1, 0.1, 0.1, 10.0, 0.1])
    weighted = new_classifier()
    fit_proba(weighted, features, target_proba,
              expand_factor=200,
              sample_weight=weights,
              random_state=42)
    pred_weighted = positive_proba(weighted)
    print(pred_weighted)

    expected = target_proba[3][1]
    weighted_error = abs(pred_weighted[3] - expected)
    assert weighted_error * 1.5 < abs(pred_soft[3] - expected)
    assert weighted_error * 1.5 < abs(pred_baseline[3] - expected)

    # without expand_factor it is just clf.fit
    not_expanded = new_classifier()
    fit_proba(not_expanded, features, target_proba,
              expand_factor=None, random_state=42)
    pred_not_expanded = positive_proba(not_expanded)
    assert np.allclose(pred_baseline, pred_not_expanded)

    # it should work the same with sparse data
    sparse_features = sp.csr_matrix(features)
    from_sparse = new_classifier()
    fit_proba(from_sparse, sparse_features, target_proba,
              expand_factor=200, random_state=42)
    pred_from_sparse = positive_proba(from_sparse)
    assert np.allclose(pred_soft, pred_from_sparse)
def _train_local_classifier(estimator,
                            samples,
                            similarity,  # type: np.ndarray
                            y_proba,  # type: np.ndarray
                            expand_factor=10,  # type: int
                            test_size=0.3,  # type: float
                            random_state=None,
                            ):
    # type: (...) -> Dict[str, float]
    """
    Fit ``estimator`` to reproduce ``y_proba`` on ``samples`` and report
    how well it does on a held-out part of the data.

    ``samples``, ``similarity`` and ``y_proba`` are split with
    ``train_test_split`` (``test_size`` and the random state are forwarded
    to it). The estimator is fitted on the train part via ``fit_proba``,
    with the train similarities used as ``sample_weight``;
    ``expand_factor`` is passed through to ``fit_proba`` unchanged.

    If the estimator's predicted probability matrix for the test part has
    a different shape than the corresponding slice of ``y_proba``, the
    prediction is passed through ``fix_multiclass_predict_proba`` together
    with ``estimator.classes_`` and ``np.arange(y_proba.shape[1])`` before
    the metrics are computed.

    Returns a dict with two entries computed on the test part, both
    weighted by the test similarities:

    * ``'mean_KL_divergence'`` - result of ``mean_kl_divergence`` between
      predicted and target probabilities;
    * ``'score'`` - result of ``score_with_sample_weight`` against the
      argmax of the target probabilities.

    Raises ``ValueError`` if the shapes don't match and the estimator has
    no ``classes_`` attribute.
    """
    rng = check_random_state(random_state)

    (X_train, X_test,
     similarity_train, similarity_test,
     y_proba_train, y_proba_test) = train_test_split(samples,
                                                     similarity,
                                                     y_proba,
                                                     test_size=test_size,
                                                     random_state=rng)

    # XXX: in the original lime code instead of a probabilistic classifier
    # they build several regression models which try to output probabilities.
    #
    # XXX: Probability information is helpful because it could be hard
    # to get enough examples of all classes automatically, so we're fitting
    # classifier to produce the same probabilities, not only the same
    # best answer.

    # TODO: feature selection
    # Ideally, it should be supported as a Pipeline (i.e. user should
    # be able to configure it).
    fit_proba(estimator, X_train, y_proba_train,
              expand_factor=expand_factor,
              sample_weight=similarity_train,
              random_state=rng)

    y_proba_test_pred = estimator.predict_proba(X_test)
    if y_proba_test_pred.shape != y_proba_test.shape:
        # Sometimes generated training labels may contain only a subset of
        # target classes; it means it could happen that dimensions
        # of predicted probability matrices don't match.
        #
        # XXX: the fix is not complete; to explain predictions
        # of the fitted estimator one still has to take care of target_names.
        if not hasattr(estimator, 'classes_'):
            # Adjacent string literals are concatenated verbatim, so each
            # piece must carry its own trailing space.
            raise ValueError("Result dimensions don't match and estimator "
                             "doesn't provide 'classes_' attribute; can't "
                             "figure out how are columns related.")
        seen_classes = estimator.classes_
        complete_classes = np.arange(y_proba.shape[1])
        y_proba_test_pred = fix_multiclass_predict_proba(
            y_proba=y_proba_test_pred,
            seen_classes=seen_classes,
            complete_classes=complete_classes
        )

    return {
        'mean_KL_divergence': mean_kl_divergence(
            y_proba_test_pred,
            y_proba_test,
            sample_weight=similarity_test
        ),
        'score': score_with_sample_weight(estimator,
                                          X_test,
                                          y_proba_test.argmax(axis=1),
                                          sample_weight=similarity_test)
    }