Exemple #1
0
def _train_local_classifier(estimator, samples, similarity, predict_proba,
                            expand_factor=10, test_size=0.3,
                            random_state=None):
    # type: (Any, Any, np.ndarray, Callable[[Any], np.ndarray], int, float, Any) -> Dict[str, float]
    """
    Fit ``estimator`` to mimic ``predict_proba`` on ``samples`` and report
    how well it does on a held-out part of the data.

    Target probabilities are obtained by calling ``predict_proba(samples)``.
    Samples, similarities and target probabilities are then split with
    ``train_test_split`` (``test_size`` controls the held-out share);
    ``estimator`` is fitted on the train part via ``fit_proba``, using the
    similarities as sample weights.

    Returns a dict with two similarity-weighted metrics computed on the
    held-out part: ``'mean_KL_divergence'`` between predicted and target
    probabilities, and ``'score'`` of the estimator against the most
    probable target class.
    """
    rng = check_random_state(random_state)
    target_proba = predict_proba(samples)

    split = train_test_split(
        samples,
        similarity,
        target_proba,
        test_size=test_size,
        random_state=rng,
    )
    (train_X, holdout_X,
     train_weight, holdout_weight,
     train_target, holdout_target) = split

    # NOTE: the original LIME code does not use a probabilistic classifier
    # here; it builds several regression models which try to output
    # probabilities.
    #
    # NOTE: the classifier is fitted to reproduce the probabilities, not just
    # the top answer, because it may be hard to automatically generate enough
    # examples of every class.

    # TODO: feature selection. Ideally it should be supported as a Pipeline,
    # i.e. it should be user-configurable.
    fit_proba(
        estimator,
        train_X,
        train_target,
        expand_factor=expand_factor,
        sample_weight=train_weight,
        random_state=rng,
    )

    holdout_pred = estimator.predict_proba(holdout_X)

    kl_divergence = mean_kl_divergence(
        holdout_pred,
        holdout_target,
        sample_weight=holdout_weight,
    )
    weighted_score = score_with_sample_weight(
        estimator,
        holdout_X,
        holdout_target.argmax(axis=1),
        sample_weight=holdout_weight,
    )
    return {
        'mean_KL_divergence': kl_divergence,
        'score': weighted_score,
    }
Exemple #2
0
def test_fit_proba():
    """
    Check that ``fit_proba`` fits target probabilities better than a plain
    ``fit`` on hard labels, honours ``sample_weight``, degrades to a plain
    ``fit`` when ``expand_factor`` is None, and accepts sparse input.
    """
    X = np.array([
        [0.0, 0.8],
        [0.0, 0.5],
        [1.0, 0.1],
        [0.9, 0.2],
        [0.7, 0.3],
    ])
    y_proba = np.array([
        [0.0, 1.0],
        [0.1, 0.9],
        [1.0, 0.0],
        [0.55, 0.45],
        [0.4, 0.6],
    ])
    hard_labels = y_proba.argmax(axis=1)

    def new_model():
        # Every scenario starts from an identically configured estimator.
        return LogisticRegression(C=10, random_state=42)

    def positive_proba(model):
        # Predicted probability of class 1 for each (dense) row of X.
        return model.predict_proba(X)[:, 1]

    # Baseline: an ordinary fit on the argmax labels.
    baseline = new_model()
    baseline.fit(X, hard_labels)
    y_pred = positive_proba(baseline)
    mae = mean_absolute_error(y_proba[:, 1], y_pred)
    print(y_pred, mae)

    # Fitting on the probabilities themselves should be clearly better.
    proba_model = new_model()
    fit_proba(proba_model, X, y_proba, expand_factor=200, random_state=42)
    y_pred2 = positive_proba(proba_model)
    mae2 = mean_absolute_error(y_proba[:, 1], y_pred2)
    print(y_pred2, mae2)

    assert mae2 * 1.2 < mae

    # Put almost all of the weight on the row at index 3; its prediction
    # should get noticeably closer to the target than in both fits above.
    sample_weight = np.array([0.1, 0.1, 0.1, 10.0, 0.1])
    weighted_model = new_model()
    fit_proba(weighted_model, X, y_proba,
              expand_factor=200,
              sample_weight=sample_weight,
              random_state=42)
    y_pred3 = positive_proba(weighted_model)
    print(y_pred3)

    target = y_proba[3][1]
    weighted_error = abs(y_pred3[3] - target)
    assert weighted_error * 1.5 < abs(y_pred2[3] - target)
    assert weighted_error * 1.5 < abs(y_pred[3] - target)

    # With expand_factor=None the result should match the plain fit.
    plain_model = new_model()
    fit_proba(plain_model, X, y_proba, expand_factor=None, random_state=42)
    y_pred_plain = positive_proba(plain_model)
    assert np.allclose(y_pred, y_pred_plain)

    # Sparse training data should give the same result as dense data.
    X_sparse = sp.csr_matrix(X)
    sparse_model = new_model()
    fit_proba(sparse_model, X_sparse, y_proba,
              expand_factor=200, random_state=42)
    y_pred_sparse = positive_proba(sparse_model)
    assert np.allclose(y_pred2, y_pred_sparse)
Exemple #3
0
def _train_local_classifier(estimator,
                            samples,
                            similarity,        # type: np.ndarray
                            y_proba,           # type: np.ndarray
                            expand_factor=10,  # type: int
                            test_size=0.3,     # type: float
                            random_state=None,
                            ):
    # type: (...) -> Dict[str, float]
    """
    Fit ``estimator`` to reproduce ``y_proba`` on ``samples`` and report
    how well it does on a held-out part of the data.

    ``samples``, ``similarity`` and ``y_proba`` are split with
    ``train_test_split`` (``test_size`` controls the held-out share);
    ``estimator`` is fitted on the train part via ``fit_proba``, using
    ``similarity`` as sample weights.

    Returns a dict with two similarity-weighted metrics computed on the
    held-out part: ``'mean_KL_divergence'`` between predicted and target
    probabilities, and ``'score'`` of the estimator against the most
    probable target class.

    Raises ValueError if the predicted probability matrix has a different
    shape than the target one and ``estimator`` has no ``classes_``
    attribute to tell which columns it produced.
    """
    rng = check_random_state(random_state)

    (X_train, X_test,
     similarity_train, similarity_test,
     y_proba_train, y_proba_test) = train_test_split(samples,
                                                     similarity,
                                                     y_proba,
                                                     test_size=test_size,
                                                     random_state=rng)

    # XXX: in the original lime code instead of a probabilistic classifier
    # they build several regression models which try to output probabilities.
    #
    # XXX: Probability information is helpful because it could be hard
    # to get enough examples of all classes automatically, so we're fitting
    # classifier to produce the same probabilities, not only the same
    # best answer.

    # TODO: feature selection
    # Ideally, it should be supported as a Pipeline (i.e. user should
    # be able to configure it).
    fit_proba(estimator, X_train, y_proba_train,
              expand_factor=expand_factor,
              sample_weight=similarity_train,
              random_state=rng)

    y_proba_test_pred = estimator.predict_proba(X_test)
    if y_proba_test_pred.shape != y_proba_test.shape:
        # Sometimes generated training labels may contain only a subset of
        # target classes; it means it could happen that dimensions
        # of predicted probability matrices don't match.
        #
        # XXX: the fix is not complete; to explain predictions
        # of the fitted estimator one still has to take care of target_names.
        if not hasattr(estimator, 'classes_'):
            # Adjacent string literals are joined verbatim, so each piece
            # must carry its own trailing space.
            raise ValueError("Result dimensions don't match and estimator "
                             "doesn't provide 'classes_' attribute; can't "
                             "figure out how are columns related.")
        seen_classes = estimator.classes_
        # Target classes are taken to be the column indices of y_proba.
        complete_classes = np.arange(y_proba.shape[1])
        y_proba_test_pred = fix_multiclass_predict_proba(
            y_proba=y_proba_test_pred,
            seen_classes=seen_classes,
            complete_classes=complete_classes
        )

    return {
        'mean_KL_divergence': mean_kl_divergence(
            y_proba_test_pred,
            y_proba_test,
            sample_weight=similarity_test
        ),
        'score': score_with_sample_weight(estimator,
                                          X_test,
                                          y_proba_test.argmax(axis=1),
                                          sample_weight=similarity_test)
    }