Example #1
def test_transition_features():
    expl = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation('class1',
                              feature_weights=FeatureWeights(
                                  pos=[FeatureWeight('pos', 13, value=1)],
                                  neg=[],
                              )),
            TargetExplanation('class2',
                              feature_weights=FeatureWeights(
                                  pos=[FeatureWeight('pos', 13, value=1)],
                                  neg=[],
                              )),
        ],
        transition_features=TransitionFeatureWeights(
            class_names=['class2', 'class1'],  # reverse on purpose
            coef=np.array([[1.5, 2.5], [3.5, 4.5]]),
        ))
    df_dict = format_as_dataframes(expl)
    assert isinstance(df_dict, dict)
    assert set(df_dict) == {'targets', 'transition_features'}
    assert df_dict['targets'].equals(format_as_dataframe(expl.targets))
    df = df_dict['transition_features']
    print(df)
    print(format_as_text(expl))
    assert str(df) == ('to      class2  class1\n'
                       'from                  \n'
                       'class2     1.5     2.5\n'
                       'class1     3.5     4.5')

    with pytest.warns(UserWarning):
        single_df = format_as_dataframe(expl)
    assert single_df.equals(df)
Example #2
# Presumably decorated upstream with @pytest.mark.parametrize over
# with_std and with_value; the decorator is not part of this excerpt.
def test_feature_importances(with_std, with_value):
    expl = Explanation(estimator='some estimator',
                       feature_importances=FeatureImportances(
                           importances=[
                               FeatureWeight('a',
                                             1,
                                             std=0.1 if with_std else None,
                                             value=1 if with_value else None),
                               FeatureWeight('b',
                                             2,
                                             std=0.2 if with_std else None,
                                             value=3 if with_value else None),
                           ],
                           remaining=10,
                       ))
    df_dict = format_as_dataframes(expl)
    assert isinstance(df_dict, dict)
    assert list(df_dict) == ['feature_importances']
    df = df_dict['feature_importances']
    expected_df = pd.DataFrame({'weight': [1, 2]}, index=['a', 'b'])
    if with_std:
        expected_df['std'] = [0.1, 0.2]
    if with_value:
        expected_df['value'] = [1, 3]
    print(df, expected_df, sep='\n')
    assert expected_df.equals(df)

    single_df = format_as_dataframe(expl)
    assert expected_df.equals(single_df)
Example #3
def test_targets_with_value():
    expl = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation('y',
                              feature_weights=FeatureWeights(
                                  pos=[
                                      FeatureWeight('a', 13, value=1),
                                      FeatureWeight('b', 5, value=2)
                                  ],
                                  neg=[
                                      FeatureWeight('neg1', -10, value=3),
                                      FeatureWeight('neg2', -1, value=4)
                                  ],
                              )),
            TargetExplanation('y2',
                              feature_weights=FeatureWeights(
                                  pos=[FeatureWeight('f', 1, value=5)],
                                  neg=[],
                              )),
        ],
    )
    df = format_as_dataframe(expl)
    expected_df = pd.DataFrame(
        {
            'weight': [13, 5, -1, -10, 1],
            'value': [1, 2, 4, 3, 5]
        },
        columns=['weight', 'value'],
        index=pd.MultiIndex.from_tuples([('y', 'a'), ('y', 'b'), ('y', 'neg2'),
                                         ('y', 'neg1'), ('y2', 'f')],
                                        names=['target', 'feature']))
    print(df, expected_df, sep='\n')
    assert expected_df.equals(df)
Example #4
def _features(indices, feature_names, coef, x):
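    # Select the features picked out by ``indices``; when the document
    # vector ``x`` is available, attach each feature's value to its weight.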
    names = mask(feature_names, indices)
    weights = mask(coef, indices)
    if x is not None:
        values = mask(x, indices)
        return [FeatureWeight(name, weight, value=value)
                for name, weight, value in zip(names, weights, values)]
    else:
        return [FeatureWeight(name, weight)
                for name, weight in zip(names, weights)]
Example #5
# Presumably decorated upstream with @pytest.mark.parametrize over
# with_std and with_value; the decorator is not part of this excerpt.
def test_targets(with_std, with_value):
    expl = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation(
                'y',
                feature_weights=FeatureWeights(
                    pos=[
                        FeatureWeight('a',
                                      13,
                                      std=0.13 if with_std else None,
                                      value=2 if with_value else None),
                        FeatureWeight('b',
                                      5,
                                      std=0.5 if with_std else None,
                                      value=1 if with_value else None)
                    ],
                    neg=[
                        FeatureWeight('neg1',
                                      -10,
                                      std=0.2 if with_std else None,
                                      value=5 if with_value else None),
                        FeatureWeight('neg2',
                                      -1,
                                      std=0.3 if with_std else None,
                                      value=4 if with_value else None)
                    ],
                )),
            TargetExplanation('y2',
                              feature_weights=FeatureWeights(
                                  pos=[FeatureWeight('f', 1)],
                                  neg=[],
                              )),
        ],
    )
    df_dict = format_as_dataframes(expl)
    assert isinstance(df_dict, dict)
    assert list(df_dict) == ['targets']
    df = df_dict['targets']
    expected_df = pd.DataFrame(
        {
            'target': ['y', 'y', 'y', 'y', 'y2'],
            'feature': ['a', 'b', 'neg2', 'neg1', 'f'],
            'weight': [13, 5, -1, -10, 1]
        },
        columns=['target', 'feature', 'weight'])
    if with_std:
        expected_df['std'] = [0.13, 0.5, 0.3, 0.2, None]
    if with_value:
        expected_df['value'] = [2, 1, 4, 5, None]
    print(df, expected_df, sep='\n')
    assert expected_df.equals(df)

    single_df = format_as_dataframe(expl)
    assert expected_df.equals(single_df)
Example #6
def __init__(self, *args, **kwargs):
    # Pop the extra attributes so only FeatureWeight's own arguments
    # are forwarded to its __init__.
    self.dictionary = kwargs.pop('dictionary', None)
    self.formatted_value = kwargs.pop('formatted_value', None)
    self.score = kwargs.pop('score', None)
    FeatureWeight.__init__(self, *args, **kwargs)
Example #7
def test_transition_features():
    expl = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation('class1',
                              feature_weights=FeatureWeights(
                                  pos=[FeatureWeight('pos', 13, value=1)],
                                  neg=[],
                              )),
            TargetExplanation('class2',
                              feature_weights=FeatureWeights(
                                  pos=[FeatureWeight('pos', 13, value=1)],
                                  neg=[],
                              )),
        ],
        transition_features=TransitionFeatureWeights(
            class_names=['class2', 'class1'],  # reverse on purpose
            coef=np.array([[1.5, 2.5], [3.5, 4.5]]),
        ))
    df_dict = format_as_dataframes(expl)
    assert isinstance(df_dict, dict)
    assert set(df_dict) == {'targets', 'transition_features'}
    assert df_dict['targets'].equals(format_as_dataframe(expl.targets))
    df = df_dict['transition_features']
    print(df)
    print(format_as_text(expl))
    expected = pd.DataFrame(
        [
            {'from': 'class2', 'to': 'class2', 'coef': 1.5},
            {'from': 'class2', 'to': 'class1', 'coef': 2.5},
            {'from': 'class1', 'to': 'class2', 'coef': 3.5},
            {'from': 'class1', 'to': 'class1', 'coef': 4.5},
        ],
        columns=['from', 'to', 'coef'])
    assert df.equals(expected)
    with pytest.warns(UserWarning):
        single_df = format_as_dataframe(expl)
    assert single_df.equals(df)
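
For reference, the wide table asserted in Example #1 is just this long-format frame pivoted. A minimal sketch that could be appended inside the test above (plain pandas pivot plus a reindex to the intentionally reversed class order; not part of the original test):

    wide = expected.pivot(index='from', columns='to', values='coef')
    wide = wide.reindex(index=['class2', 'class1'],
                        columns=['class2', 'class1'])
    print(wide)  # should render the same table Example #1 asserts as a string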
Example #8
def _get_other(feature_weights, named_found_features):
    # type: (FeatureWeights, List[Tuple[str, FoundFeatures]]) -> FeatureWeights
    # Search for items that were not accounted for at all.
    other_items = []  # type: List[FeatureWeight]
    accounted_keys = set()  # type: Set[Tuple[str, int]]
    all_found_features = set()  # type: Set[Tuple[str, int]]
    for _, found_features in named_found_features:
        all_found_features.update(found_features)

    for group in ['pos', 'neg']:
        for idx, fw in enumerate(getattr(feature_weights, group)):
            key = (group, idx)
            if key not in all_found_features and key not in accounted_keys:
                other_items.append(fw)
                accounted_keys.add(key)

    for vec_name, found_features in named_found_features:
        if found_features:
            other_items.append(FeatureWeight(
                feature=FormattedFeatureName(
                    '{}Highlighted in text (sum)'.format(
                        '{}: '.format(vec_name) if vec_name else '')),
                weight=sum(found_features.values()),
            ))

    other_items.sort(key=lambda x: abs(x.weight), reverse=True)
    return FeatureWeights(
        pos=[fw for fw in other_items if fw.weight >= 0],
        neg=[fw for fw in other_items if fw.weight < 0],
        pos_remaining=feature_weights.pos_remaining,
        neg_remaining=feature_weights.neg_remaining,
    )
Example #9
def explain_decision_tree(
        estimator,
        vec=None,
        top=_TOP,
        target_names=None,
        targets=None,  # ignored
        feature_names=None,
        feature_re=None,
        feature_filter=None,
        **export_graphviz_kwargs):
    """
    Return an explanation of a decision tree.

    See :func:`eli5.explain_weights` for description of
    ``top``, ``target_names``, ``feature_names``,
    ``feature_re`` and ``feature_filter`` parameters.

    The ``targets`` parameter is ignored.

    ``vec`` is a vectorizer instance used to transform
    raw features to the input of the estimator (e.g. a fitted
    CountVectorizer instance); you can pass it instead of ``feature_names``.

    All other keyword arguments are passed to
    `sklearn.tree.export_graphviz`_ function.

    .. _sklearn.tree.export_graphviz: http://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html
    """
    feature_names = get_feature_names(estimator,
                                      vec,
                                      feature_names=feature_names)
    coef = estimator.feature_importances_
    tree_feature_names = feature_names
    feature_names, flt_indices = feature_names.handle_filter(
        feature_filter, feature_re)
    if flt_indices is not None:
        coef = coef[flt_indices]
    indices = argsort_k_largest_positive(coef, top)
    names, values = feature_names[indices], coef[indices]
    export_graphviz_kwargs.setdefault("proportion", True)
    tree_info = get_tree_info(estimator,
                              feature_names=tree_feature_names,
                              class_names=target_names,
                              **export_graphviz_kwargs)

    return Explanation(
        feature_importances=FeatureImportances(
            [FeatureWeight(*x) for x in zip(names, values)],
            remaining=np.count_nonzero(coef) - len(indices),
        ),
        decision_tree=tree_info,
        description=DESCRIPTION_DECISION_TREE,
        estimator=repr(estimator),
        method='decision tree',
    )
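
A minimal usage sketch for the function above. The iris data, class names, and max_depth are illustrative assumptions, and explain_decision_tree is assumed to be in scope together with its eli5 helpers:

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
clf = DecisionTreeClassifier(max_depth=3).fit(X, y)
# target_names become class names in the rendered tree; any extra keyword
# arguments would be forwarded to sklearn.tree.export_graphviz.
expl = explain_decision_tree(
    clf, top=5, target_names=['setosa', 'versicolor', 'virginica'])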
Example #10
def test_format_as_dict():
    assert format_as_dict(
        Explanation(
            estimator='some estimator',
            targets=[
                TargetExplanation(
                    'y',
                    feature_weights=FeatureWeights(
                        pos=[FeatureWeight('a', np.float32(13.0))],
                        neg=[],
                    )),
            ],
        )) == {
            'estimator': 'some estimator',
            'targets': [
                {
                    'target': 'y',
                    'feature_weights': {
                        'pos': [{
                            'feature': 'a',
                            'weight': 13.0,
                            'std': None,
                            'value': None,
                        }],
                        'pos_remaining': 0,
                        'neg': [],
                        'neg_remaining': 0,
                    },
                    'score': None,
                    'proba': None,
                    'weighted_spans': None,
                    'heatmap': None,
                },
            ],
            'decision_tree': None,
            'description': None,
            'error': None,
            'feature_importances': None,
            'highlight_spaces': None,
            'is_regression': False,
            'method': None,
            'transition_features': None,
            'image': None,
        }
Example #11
def explain_weights_xgboost(xgb,
                            vec=None,
                            top=20,
                            target_names=None,  # ignored
                            targets=None,  # ignored
                            feature_names=None,
                            feature_re=None,
                            feature_filter=None,
                            importance_type='gain',
                            ):
    """
    Return an explanation of an XGBoost estimator (via scikit-learn wrapper
    XGBClassifier or XGBRegressor) as feature importances.

    See :func:`eli5.explain_weights` for description of
    ``top``, ``feature_names``,
    ``feature_re`` and ``feature_filter`` parameters.

    The ``target_names`` and ``targets`` parameters are ignored.

    Parameters
    ----------
    importance_type : str, optional
        A way to get feature importance. Possible values are:

        - 'gain' - the average gain of the feature when it is used in trees
          (default)
        - 'weight' - the number of times a feature is used to split the data
          across all trees
        - 'cover' - the average coverage of the feature when it is used in trees
    """
    coef = _xgb_feature_importances(xgb, importance_type=importance_type)
    num_features = coef.shape[-1]
    feature_names = get_feature_names(
        xgb, vec, feature_names=feature_names, num_features=num_features)

    feature_names, flt_indices = feature_names.handle_filter(
        feature_filter, feature_re)
    if flt_indices is not None:
        coef = coef[flt_indices]

    indices = argsort_k_largest_positive(coef, top)
    names, values = feature_names[indices], coef[indices]
    return Explanation(
        feature_importances=FeatureImportances(
            [FeatureWeight(*x) for x in zip(names, values)],
            remaining=np.count_nonzero(coef) - len(indices),
        ),
        description=DESCRIPTION_XGBOOST,
        estimator=repr(xgb),
        method='feature importances',
        is_regression=isinstance(xgb, XGBRegressor),
    )
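
A minimal usage sketch under stated assumptions: xgboost is installed, the function above is in scope, and the synthetic regression data is purely illustrative:

import numpy as np
from xgboost import XGBRegressor

rng = np.random.RandomState(0)
X = rng.rand(200, 4)
y = 2 * X[:, 0] + X[:, 1]
model = XGBRegressor(n_estimators=20).fit(X, y)
# 'weight' counts how often each feature is used to split, rather than
# averaging its gain (the default).
expl = explain_weights_xgboost(model, top=3, importance_type='weight')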
Example #12
def explain_decision_tree(
        clf,
        vec=None,
        top=_TOP,
        target_names=None,
        targets=None,  # ignored
        feature_names=None,
        feature_re=None,
        **export_graphviz_kwargs):
    """
    Return an explanation of a decision tree classifier in the
    following format (compatible with random forest explanations)::

        Explanation(
            estimator="<classifier repr>",
            method="<interpretation method>",
            description="<human readable description>",
            decision_tree={...tree information},
            feature_importances=[
                FeatureWeight(feature_name, importance, std_deviation),
                ...
            ]
        )

    """
    feature_names = get_feature_names(clf, vec, feature_names=feature_names)
    coef = clf.feature_importances_
    tree_feature_names = feature_names
    if feature_re is not None:
        feature_names, flt_indices = feature_names.filtered_by_re(feature_re)
        coef = coef[flt_indices]
    indices = argsort_k_largest(coef, top)
    names, values = feature_names[indices], coef[indices]
    std = np.zeros_like(values)
    export_graphviz_kwargs.setdefault("proportion", True)
    tree_info = get_tree_info(clf,
                              feature_names=tree_feature_names,
                              class_names=target_names,
                              **export_graphviz_kwargs)

    return Explanation(
        feature_importances=[
            FeatureWeight(*x) for x in zip(names, values, std)
        ],
        decision_tree=tree_info,
        description=DESCRIPTION_DECISION_TREE,
        estimator=repr(clf),
        method='decision tree',
    )
Example #13
def explain_rf_feature_importance(
    estimator,
    vec=None,
    top=_TOP,
    target_names=None,  # ignored
    targets=None,  # ignored
    feature_names=None,
    feature_re=None,
    feature_filter=None,
):
    """
    Return an explanation of a tree-based ensemble estimator.

    See :func:`eli5.explain_weights` for description of
    ``top``, ``feature_names``, ``feature_re`` and ``feature_filter``
    parameters.

    The ``target_names`` and ``targets`` parameters are ignored.

    ``vec`` is a vectorizer instance used to transform
    raw features to the input of the estimator (e.g. a fitted
    CountVectorizer instance); you can pass it instead of ``feature_names``.
    """
    feature_names = get_feature_names(estimator,
                                      vec,
                                      feature_names=feature_names)
    coef = estimator.feature_importances_
    trees = np.array(estimator.estimators_).ravel()
    coef_std = np.std([tree.feature_importances_ for tree in trees], axis=0)

    feature_names, flt_indices = feature_names.handle_filter(
        feature_filter, feature_re)
    if flt_indices is not None:
        coef = coef[flt_indices]
        coef_std = coef_std[flt_indices]

    indices = argsort_k_largest_positive(coef, top)
    names, values, std = (
        feature_names[indices], coef[indices], coef_std[indices])
    return Explanation(
        feature_importances=FeatureImportances(
            [FeatureWeight(*x) for x in zip(names, values, std)],
            remaining=np.count_nonzero(coef) - len(indices),
        ),
        description=DESCRIPTION_RANDOM_FOREST,
        estimator=repr(estimator),
        method='feature importances',
    )
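
A minimal usage sketch, again with illustrative data and assuming the function above is in scope:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)
forest = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)
expl = explain_rf_feature_importance(forest, top=4)
# Each FeatureWeight carries (feature, weight, std), where std is the
# spread of that feature's importance across the individual trees.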
Example #14
def explain_rf_feature_importance(
        clf,
        vec=None,
        top=_TOP,
        target_names=None,  # ignored
        targets=None,  # ignored
        feature_names=None,
        feature_re=None):
    """
    Return an explanation of a tree-based ensemble classifier in the
    following format::

        Explanation(
            estimator="<classifier repr>",
            method="<interpretation method>",
            description="<human readable description>",
            feature_importances=[
                FeatureWeight(feature_name, importance, std_deviation),
                ...
            ]
        )
    """
    feature_names = get_feature_names(clf, vec, feature_names=feature_names)
    coef = clf.feature_importances_
    trees = np.array(clf.estimators_).ravel()
    coef_std = np.std([tree.feature_importances_ for tree in trees], axis=0)

    if feature_re is not None:
        feature_names, flt_indices = feature_names.filtered_by_re(feature_re)
        coef = coef[flt_indices]
        coef_std = coef_std[flt_indices]

    indices = argsort_k_largest(coef, top)
    names, values, std = (
        feature_names[indices], coef[indices], coef_std[indices])
    return Explanation(
        feature_importances=[
            FeatureWeight(*x) for x in zip(names, values, std)
        ],
        description=DESCRIPTION_RANDOM_FOREST,
        estimator=repr(clf),
        method='feature importances',
    )
Example #15
def _get_other(feature_weights, feature_weights_dict, found_features):
    # Search for items that were not accounted for at all.
    other_items = []
    accounted_keys = set()  # type: Set[Tuple[str, int]]
    for feature, (_, key) in feature_weights_dict.items():
        if key not in found_features and key not in accounted_keys:
            group, idx = key
            other_items.append(getattr(feature_weights, group)[idx])
            accounted_keys.add(key)
    if found_features:
        other_items.append(
            FeatureWeight(FormattedFeatureName('Highlighted in text (sum)'),
                          sum(found_features.values())))
    other_items.sort(key=lambda x: abs(x.weight), reverse=True)
    return FeatureWeights(
        pos=[fw for fw in other_items if fw.weight >= 0],
        neg=[fw for fw in other_items if fw.weight < 0],
        pos_remaining=feature_weights.pos_remaining,
        neg_remaining=feature_weights.neg_remaining,
    )
Example #16
def _features(indices, feature_names, coef):
    names = mask(feature_names, indices)
    values = mask(coef, indices)
    return [FeatureWeight(name, weight) for name, weight in zip(names, values)]