def test_explain_prediction_libsvm_linear(clf, doc, *args, **kwargs):
    """
    Explain a prediction of a libsvm-based classifier.

    Only classifiers with ``kernel='linear'`` and at most two classes are
    handled; for anything else an :class:`Explanation` carrying an ``error``
    message is returned. Supported classifiers are delegated to
    :func:`explain_prediction_linear_classifier`, with ``*args`` and
    ``**kwargs`` forwarded unchanged.
    """
    failure = None
    if clf.kernel != 'linear':
        failure = ("only kernel='linear' is currently supported for "
                   "libsvm-based classifiers")
    elif len(getattr(clf, 'classes_', [])) > 2:
        # a missing ``classes_`` attribute is treated as "no classes"
        failure = "only binary libsvm-based classifiers are supported"

    if failure is None:
        return explain_prediction_linear_classifier(clf, doc, *args, **kwargs)
    return Explanation(estimator=repr(clf), error=failure)
def test_transition_features():
    """
    An explanation with both targets and transition features is exported as
    two dataframes; the transition dataframe is checked via its text
    rendering, and ``format_as_dataframe`` is expected to warn and return
    the transition dataframe.
    """
    def _single_pos_target(name):
        # every target in this fixture has the same single positive feature
        return TargetExplanation(name, feature_weights=FeatureWeights(
            pos=[FeatureWeight('pos', 13, value=1)],
            neg=[],
        ))

    expl = Explanation(
        estimator='some estimator',
        targets=[_single_pos_target('class1'), _single_pos_target('class2')],
        transition_features=TransitionFeatureWeights(
            class_names=['class2', 'class1'],  # reverse on purpose
            coef=np.array([[1.5, 2.5], [3.5, 4.5]]),
        ))

    frames = format_as_dataframes(expl)
    assert isinstance(frames, dict)
    assert set(frames) == {'targets', 'transition_features'}
    assert frames['targets'].equals(format_as_dataframe(expl.targets))

    transition_df = frames['transition_features']
    print(transition_df)
    print(format_as_text(expl))
    # NOTE(review): this compares against the dataframe's text rendering,
    # so the check is sensitive to whitespace in the expected literal.
    expected_text = ('to class2 class1\n'
                     'from \n'
                     'class2 1.5 2.5\n'
                     'class1 3.5 4.5')
    assert str(transition_df) == expected_text

    with pytest.warns(UserWarning):
        single_df = format_as_dataframe(expl)
    assert single_df.equals(transition_df)
def test_feature_importances(with_std, with_value):
    """
    Feature importances are exported as a single ``feature_importances``
    dataframe indexed by feature name; ``std`` and ``value`` columns are
    expected only when the corresponding flag is set.
    """
    rows = [
        # (feature, weight, std, value)
        ('a', 1, 0.1, 1),
        ('b', 2, 0.2, 3),
    ]
    importances = [
        FeatureWeight(
            feature, weight,
            std=std if with_std else None,
            value=value if with_value else None)
        for feature, weight, std, value in rows
    ]
    expl = Explanation(
        estimator='some estimator',
        feature_importances=FeatureImportances(
            importances=importances,
            remaining=10,
        ))

    frames = format_as_dataframes(expl)
    assert isinstance(frames, dict)
    assert list(frames) == ['feature_importances']
    actual = frames['feature_importances']

    expected = pd.DataFrame({'weight': [1, 2]}, index=['a', 'b'])
    if with_std:
        expected['std'] = [0.1, 0.2]
    if with_value:
        expected['value'] = [1, 3]
    print(actual, expected, sep='\n')
    assert expected.equals(actual)

    # the single-dataframe entry point must give the same result
    assert expected.equals(format_as_dataframe(expl))
def test_targets_with_value():
    """
    Targets whose feature weights carry values are exported as one dataframe
    with a (target, feature) MultiIndex and ``weight``/``value`` columns.
    """
    target_y = TargetExplanation('y', feature_weights=FeatureWeights(
        pos=[
            FeatureWeight('a', 13, value=1),
            FeatureWeight('b', 5, value=2),
        ],
        neg=[
            FeatureWeight('neg1', -10, value=3),
            FeatureWeight('neg2', -1, value=4),
        ],
    ))
    target_y2 = TargetExplanation('y2', feature_weights=FeatureWeights(
        pos=[FeatureWeight('f', 1, value=5)],
        neg=[],
    ))
    expl = Explanation(
        estimator='some estimator',
        targets=[target_y, target_y2],
    )
    actual = format_as_dataframe(expl)

    # expected row order: positives first, then 'neg2' before 'neg1'
    expected_rows = [
        ('y', 'a'), ('y', 'b'), ('y', 'neg2'), ('y', 'neg1'), ('y2', 'f'),
    ]
    expected = pd.DataFrame(
        {
            'weight': [13, 5, -1, -10, 1],
            'value': [1, 2, 4, 3, 5],
        },
        columns=['weight', 'value'],
        index=pd.MultiIndex.from_tuples(
            expected_rows, names=['target', 'feature']))
    print(actual, expected, sep='\n')
    assert expected.equals(actual)
def get_feature_importance_explanation(estimator, vec, coef, feature_names,
                                       feature_filter, feature_re, top,
                                       description, is_regression,
                                       estimator_feature_names=None,
                                       num_features=None,
                                       coef_std=None):
    # type: (...) -> Explanation
    """
    Build a 'feature importances' :class:`Explanation` for ``estimator``.

    Feature names are resolved and filtered with
    ``get_feature_names_filtered``; the resulting names and indices are then
    passed, together with ``coef``, ``top`` and ``coef_std``, to
    ``get_feature_importances_filtered``. ``description`` and
    ``is_regression`` are stored on the explanation unchanged.
    """
    name_options = dict(
        feature_names=feature_names,
        estimator_feature_names=estimator_feature_names,
        feature_filter=feature_filter,
        feature_re=feature_re,
        num_features=num_features,
    )
    kept_names, kept_indices = get_feature_names_filtered(
        estimator, vec, **name_options)
    importances = get_feature_importances_filtered(
        coef, kept_names, kept_indices, top, coef_std)
    return Explanation(
        feature_importances=importances,
        description=description,
        estimator=repr(estimator),
        method='feature importances',
        is_regression=is_regression,
    )
def get_decision_path_explanation(estimator, doc, vec, vectorized, x,
                                  feature_names, feature_filter, feature_re,
                                  top, original_display_names, target_names,
                                  targets, top_targets, is_regression,
                                  is_multiclass, proba, get_score_weights):
    # type: (...) -> Explanation
    """
    Build a 'decision paths' :class:`Explanation` for a single document.

    ``get_score_weights(label_id)`` must return a ``(score, weights)`` pair.
    In the multiclass case one target is produced per display name; otherwise
    a single target is produced from ``get_score_weights(0)``. Weights are
    filtered and cut to ``top`` features via ``get_top_features_filtered``.

    The description is looked up by ``(is_regression, is_multiclass)``; the
    ``(True, True)`` combination has no entry and raises ``KeyError``.
    """
    display_names = get_target_display_names(
        original_display_names, target_names, targets, top_targets, proba)
    kept_names, kept_indices = feature_names.handle_filter(
        feature_filter, feature_re, x)

    descriptions = {
        (False, False): DESCRIPTION_CLF_BINARY,
        (False, True): DESCRIPTION_CLF_MULTICLASS,
        (True, False): DESCRIPTION_REGRESSION,
    }
    explanation = Explanation(
        estimator=repr(estimator),
        method='decision paths',
        description=descriptions[is_regression, is_multiclass],
        is_regression=is_regression,
        targets=[],
    )
    assert explanation.targets is not None

    def _append_target(target, score, weights, scale, label_id):
        # Build one target explanation, attach weighted spans, and record it.
        target_expl = TargetExplanation(
            target=target,
            feature_weights=get_top_features_filtered(
                x, kept_names, kept_indices, weights, top, scale),
            score=score,
            proba=None if proba is None else proba[label_id],
        )
        add_weighted_spans(doc, vec, vectorized, target_expl)
        explanation.targets.append(target_expl)

    if not is_multiclass:
        score, weights = get_score_weights(0)
        if is_regression:
            target, scale, label_id = display_names[-1][1], 1.0, 1
        else:
            target, scale, label_id = get_binary_target_scale_label_id(
                score, display_names, proba)
        _append_target(target, score, weights, scale, label_id)
        return explanation

    for label_id, label in display_names:
        score, weights = get_score_weights(label_id)
        _append_target(label, score, weights, 1.0, label_id)
    return explanation
def explain_prediction_linear_classifier(clf, doc, vec=None, top=None,
                                         target_names=None, targets=None,
                                         feature_names=None,
                                         vectorized=False):
    """
    Explain prediction of a linear classifier.

    Each feature's contribution is the product of the document's feature
    vector (with an intercept column appended when ``clf`` has one) and the
    coefficients for the label. Probabilities are included only when ``clf``
    is probabilistic and ``predict_proba`` does not raise
    ``NotImplementedError``.
    """
    vec, feature_names = _handle_vec(clf, doc, vec, vectorized, feature_names)
    X = _get_X(doc, vec=vec, vectorized=vectorized)

    proba = None
    if is_probabilistic_classifier(clf):
        try:
            proba, = clf.predict_proba(X)
        except NotImplementedError:
            # probabilities are unavailable; explain scores only
            pass

    score, = clf.decision_function(X)

    if has_intercept(clf):
        X = _add_intercept(X)
    x, = X

    res = Explanation(
        estimator=repr(clf),
        method='linear model',
        targets=[],
    )

    def _contributions(label_id):
        per_feature = _multiply(x, get_coef(clf, label_id))
        return get_top_features(feature_names, per_feature, top)

    def _proba_for(label_id):
        return None if proba is None else proba[label_id]

    display_names = get_display_names(clf.classes_, target_names, targets)

    if is_multiclass_classifier(clf):
        for label_id, label in display_names:
            label_expl = TargetExplanation(
                target=label,
                feature_weights=_contributions(label_id),
                score=score[label_id],
                proba=_proba_for(label_id),
            )
            _add_weighted_spans(doc, vec, label_expl)
            res.targets.append(label_expl)
        return res

    # Binary case: the second display name is the target, coefficients for
    # label 0 are used, and the probability at index 1 is reported.
    binary_expl = TargetExplanation(
        target=display_names[1][1],
        feature_weights=_contributions(0),
        score=score,
        proba=_proba_for(1),
    )
    _add_weighted_spans(doc, vec, binary_expl)
    res.targets.append(binary_expl)
    return res
def explain_weights_lightning_not_supported(
        estimator, vec=None, top=20, target_names=None,
        targets=None, feature_names=None,
        coef_scale=None):
    """
    Return an :class:`Explanation` whose ``error`` reports that ``estimator``
    is not supported.

    All parameters other than ``estimator`` are accepted but not used.
    """
    return Explanation(
        estimator=repr(estimator),
        # Wrap the operand in a 1-tuple: with a bare operand, ``%`` treats a
        # tuple ``estimator`` as the argument list, which raises TypeError
        # (or formats only its first element for a 1-tuple).
        error="Error: estimator %r is not supported" % (estimator,),
    )
def explain_prediction_lightning_not_supported(
        estimator, doc, vec=None, top=None, target_names=None,
        targets=None, feature_names=None, vectorized=False,
        coef_scale=None):
    """
    Return an :class:`Explanation` whose ``error`` reports that ``estimator``
    is not supported.

    All parameters other than ``estimator`` are accepted but not used.
    """
    return Explanation(
        estimator=repr(estimator),
        # Wrap the operand in a 1-tuple: with a bare operand, ``%`` treats a
        # tuple ``estimator`` as the argument list, which raises TypeError
        # (or formats only its first element for a 1-tuple).
        error="Error: estimator %r is not supported" % (estimator,),
    )
def explain_weights_sklearn_not_supported(
        estimator, vec=None, top=_TOP,
        target_names=None,
        targets=None,
        feature_names=None, coef_scale=None,
        feature_re=None, feature_filter=None):
    """
    Return an :class:`Explanation` whose ``error`` reports that ``estimator``
    is not supported.

    All parameters other than ``estimator`` are accepted but not used.
    """
    return Explanation(
        estimator=repr(estimator),
        # Wrap the operand in a 1-tuple: with a bare operand, ``%`` treats a
        # tuple ``estimator`` as the argument list, which raises TypeError
        # (or formats only its first element for a 1-tuple).
        error="estimator %r is not supported" % (estimator,),
    )
def explain_prediction_keras_not_supported(model, doc):
    """
    Report that no explanation can be produced for the passed arguments.

    Returns an :class:`Explanation` built from ``model.name`` with an
    ``error`` message suggesting the "image" argument. ``doc`` is not used.
    """
    message_template = (
        'model "{}" is not supported, '
        'try passing the "image" argument if explaining an image model.'
    )
    return Explanation(
        model.name,
        error=message_template.format(model.name),
    )
def test_targets(with_std, with_value):
    """
    Targets are exported as one ``targets`` dataframe with ``target``,
    ``feature`` and ``weight`` columns; ``std`` and ``value`` columns are
    expected only when the corresponding flag is set, with ``None`` for the
    feature that has neither.
    """
    def _weight(feature, weight, std, value):
        # std/value are attached only when the test flags ask for them
        return FeatureWeight(
            feature, weight,
            std=std if with_std else None,
            value=value if with_value else None)

    expl = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation(
                'y', feature_weights=FeatureWeights(
                    pos=[
                        _weight('a', 13, 0.13, 2),
                        _weight('b', 5, 0.5, 1),
                    ],
                    neg=[
                        _weight('neg1', -10, 0.2, 5),
                        _weight('neg2', -1, 0.3, 4),
                    ],
                )),
            TargetExplanation(
                'y2', feature_weights=FeatureWeights(
                    pos=[FeatureWeight('f', 1)],
                    neg=[],
                )),
        ],
    )

    frames = format_as_dataframes(expl)
    assert isinstance(frames, dict)
    assert list(frames) == ['targets']
    actual = frames['targets']

    expected = pd.DataFrame(
        {
            'target': ['y', 'y', 'y', 'y', 'y2'],
            'feature': ['a', 'b', 'neg2', 'neg1', 'f'],
            'weight': [13, 5, -1, -10, 1],
        },
        columns=['target', 'feature', 'weight'])
    if with_std:
        expected['std'] = [0.13, 0.5, 0.3, 0.2, None]
    if with_value:
        expected['value'] = [2, 1, 4, 5, None]
    print(actual, expected, sep='\n')
    assert expected.equals(actual)

    # the single-dataframe entry point must give the same result
    assert expected.equals(format_as_dataframe(expl))
def explain_decision_tree(
        estimator, vec=None, top=_TOP, target_names=None,
        targets=None,  # ignored
        feature_names=None, feature_re=None, feature_filter=None,
        **export_graphviz_kwargs):
    """
    Explain a decision tree: its feature importances plus tree information.

    ``top``, ``target_names``, ``feature_names``, ``feature_re`` and
    ``feature_filter`` are described in :func:`eli5.explain_weights`.
    ``targets`` is ignored.

    ``vec`` is a vectorizer instance used to transform raw features to the
    input of the estimator (e.g. a fitted CountVectorizer instance); it can
    be passed instead of ``feature_names``.

    Remaining keyword arguments are forwarded to
    `sklearn.tree.export_graphviz`_ (``proportion`` defaults to True here).

    .. _sklearn.tree.export_graphviz: http://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html
    """
    all_names = get_feature_names(estimator, vec, feature_names=feature_names)
    importances = estimator.feature_importances_

    # The tree description always uses the unfiltered names; filtering only
    # applies to the importance table.
    shown_names, kept = all_names.handle_filter(feature_filter, feature_re)
    if kept is not None:
        importances = importances[kept]

    top_idx = argsort_k_largest_positive(importances, top)
    top_names = shown_names[top_idx]
    top_values = importances[top_idx]

    export_graphviz_kwargs.setdefault("proportion", True)
    tree_info = get_tree_info(
        estimator,
        feature_names=all_names,
        class_names=target_names,
        **export_graphviz_kwargs)

    return Explanation(
        feature_importances=FeatureImportances(
            [FeatureWeight(name, value)
             for name, value in zip(top_names, top_values)],
            remaining=np.count_nonzero(importances) - len(top_idx),
        ),
        decision_tree=tree_info,
        description=DESCRIPTION_DECISION_TREE,
        estimator=repr(estimator),
        method='decision tree',
    )
def test_format_as_dict():
    """
    ``format_as_dict`` turns an explanation into plain nested dicts, with
    every unset field present as ``None`` (``is_regression`` as ``False``)
    and the float32 weight comparing equal to ``13.0``.
    """
    expl = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation(
                'y',
                feature_weights=FeatureWeights(
                    pos=[FeatureWeight('a', np.float32(13.0))],
                    neg=[],
                )),
        ],
    )

    expected_target = {
        'target': 'y',
        'feature_weights': {
            'pos': [{
                'feature': 'a',
                'weight': 13.0,
                'std': None,
                'value': None,
            }],
            'pos_remaining': 0,
            'neg': [],
            'neg_remaining': 0,
        },
        'score': None,
        'proba': None,
        'weighted_spans': None,
        'heatmap': None,
    }
    expected = {
        'estimator': 'some estimator',
        'targets': [expected_target],
        'decision_tree': None,
        'description': None,
        'error': None,
        'feature_importances': None,
        'highlight_spaces': None,
        'is_regression': False,
        'method': None,
        'transition_features': None,
        'image': None,
    }
    assert format_as_dict(expl) == expected
def test_transition_features():
    """
    An explanation with both targets and transition features is exported as
    two dataframes; transition features are expected as ``from``/``to``/
    ``coef`` rows, and ``format_as_dataframe`` is expected to warn and
    return the transition dataframe.
    """
    def _single_pos_target(name):
        # every target in this fixture has the same single positive feature
        return TargetExplanation(name, feature_weights=FeatureWeights(
            pos=[FeatureWeight('pos', 13, value=1)],
            neg=[],
        ))

    expl = Explanation(
        estimator='some estimator',
        targets=[_single_pos_target('class1'), _single_pos_target('class2')],
        transition_features=TransitionFeatureWeights(
            class_names=['class2', 'class1'],  # reverse on purpose
            coef=np.array([[1.5, 2.5], [3.5, 4.5]]),
        ))

    frames = format_as_dataframes(expl)
    assert isinstance(frames, dict)
    assert set(frames) == {'targets', 'transition_features'}
    assert frames['targets'].equals(format_as_dataframe(expl.targets))

    transition_df = frames['transition_features']
    print(transition_df)
    print(format_as_text(expl))

    expected_rows = [
        # (from, to, coef)
        ('class2', 'class2', 1.5),
        ('class2', 'class1', 2.5),
        ('class1', 'class2', 3.5),
        ('class1', 'class1', 4.5),
    ]
    expected = pd.DataFrame(
        [{'from': src, 'to': dst, 'coef': coef}
         for src, dst, coef in expected_rows],
        columns=['from', 'to', 'coef'])
    assert transition_df.equals(expected)

    with pytest.warns(UserWarning):
        single_df = format_as_dataframe(expl)
    assert single_df.equals(transition_df)
def explain_weights_xgboost(xgb, vec=None, top=20,
                            target_names=None,  # ignored
                            targets=None,  # ignored
                            feature_names=None,
                            feature_re=None,
                            feature_filter=None,
                            importance_type='gain',
                            ):
    """
    Explain an XGBoost estimator (used through the scikit-learn wrappers
    XGBClassifier or XGBRegressor) as feature importances.

    ``top``, ``feature_names``, ``feature_re`` and ``feature_filter`` are
    described in :func:`eli5.explain_weights`. ``target_names`` and
    ``targets`` are ignored.

    Parameters
    ----------
    importance_type : str, optional
        How feature importance is measured. Possible values are:

        - 'gain' - the average gain of the feature when it is used in trees
          (default)
        - 'weight' - the number of times a feature is used to split the data
          across all trees
        - 'cover' - the average coverage of the feature when it is used in
          trees
    """
    importances = _xgb_feature_importances(
        xgb, importance_type=importance_type)
    all_names = get_feature_names(
        xgb, vec,
        feature_names=feature_names,
        num_features=importances.shape[-1])

    shown_names, kept = all_names.handle_filter(feature_filter, feature_re)
    if kept is not None:
        importances = importances[kept]

    top_idx = argsort_k_largest_positive(importances, top)
    top_names = shown_names[top_idx]
    top_values = importances[top_idx]

    return Explanation(
        feature_importances=FeatureImportances(
            [FeatureWeight(name, value)
             for name, value in zip(top_names, top_values)],
            remaining=np.count_nonzero(importances) - len(top_idx),
        ),
        description=DESCRIPTION_XGBOOST,
        estimator=repr(xgb),
        method='feature importances',
        is_regression=isinstance(xgb, XGBRegressor),
    )
def explain_weights_lightning(estimator, vec=None, top=20, target_names=None,
                              targets=None, feature_names=None,
                              coef_scale=None):
    """
    Return an :class:`Explanation` reporting that ``estimator`` is not
    supported.

    The message is stored in the explanation's ``description``. All
    parameters other than ``estimator`` are accepted but not used.
    """
    return Explanation(
        estimator=repr(estimator),
        # Wrap the operand in a 1-tuple: with a bare operand, ``%`` treats a
        # tuple ``estimator`` as the argument list, which raises TypeError
        # (or formats only its first element for a 1-tuple).
        description="Error: estimator %r is not supported" % (estimator,),
    )
def explain_prediction_linear_regressor(reg, doc, vec=None, top=None,
                                        target_names=None, targets=None,
                                        feature_names=None,
                                        vectorized=False):
    """
    Explain prediction of a linear regressor.

    Each feature's contribution is the product of the document's feature
    vector (with an intercept column appended when ``reg`` has one) and the
    coefficients for the target. Multi-target regressors get one target
    explanation per display name; otherwise a single one is produced.
    """
    vec, feature_names = _handle_vec(reg, doc, vec, vectorized, feature_names)
    X = _get_X(doc, vec=vec, vectorized=vectorized)

    score, = reg.predict(X)

    if has_intercept(reg):
        X = _add_intercept(X)
    x, = X

    res = Explanation(
        estimator=repr(reg),
        method='linear model',
        targets=[],
        is_regression=True,
    )

    def _contributions(label_id):
        per_feature = _multiply(x, get_coef(reg, label_id))
        return get_top_features(feature_names, per_feature, top)

    default_names = get_default_target_names(reg)
    display_names = get_display_names(default_names, target_names, targets)

    if is_multitarget_regressor(reg):
        for label_id, label in display_names:
            label_expl = TargetExplanation(
                target=label,
                feature_weights=_contributions(label_id),
                score=score[label_id],
            )
            _add_weighted_spans(doc, vec, label_expl)
            res.targets.append(label_expl)
        return res

    # Single-target case: first display name, coefficients for label 0.
    single_expl = TargetExplanation(
        target=display_names[0][1],
        feature_weights=_contributions(0),
        score=score,
    )
    _add_weighted_spans(doc, vec, single_expl)
    res.targets.append(single_expl)
    return res
def explain_weights_sklearn(estimator, vec=None, top=_TOP, target_names=None,
                            targets=None, feature_names=None, coef_scale=None,
                            feature_re=None):
    """
    Return an :class:`Explanation` whose ``error`` reports that ``estimator``
    is not supported.

    All parameters other than ``estimator`` are accepted but not used.
    """
    return Explanation(
        estimator=repr(estimator),
        # Wrap the operand in a 1-tuple: with a bare operand, ``%`` treats a
        # tuple ``estimator`` as the argument list, which raises TypeError
        # (or formats only its first element for a 1-tuple).
        error="estimator %r is not supported" % (estimator,),
    )
def explain_prediction_sklearn(estimator, doc, vec=None, top=None,
                               target_names=None, targets=None,
                               feature_names=None, vectorized=False):
    """
    Return an :class:`Explanation` whose ``error`` reports that ``estimator``
    is not supported.

    All parameters other than ``estimator`` are accepted but not used.
    """
    return Explanation(
        estimator=repr(estimator),
        # Wrap the operand in a 1-tuple: with a bare operand, ``%`` treats a
        # tuple ``estimator`` as the argument list, which raises TypeError
        # (or formats only its first element for a 1-tuple).
        error="estimator %r is not supported" % (estimator,),
    )
def explain_decision_tree(
        estimator, vec=None, top=_TOP, target_names=None,
        targets=None,  # ignored
        feature_names=None, feature_re=None, feature_filter=None,
        **export_graphviz_kwargs):
    """
    Explain a decision tree: its feature importances plus tree information.

    ``top``, ``target_names``, ``feature_names``, ``feature_re`` and
    ``feature_filter`` are described in :func:`eli5.explain_weights`.
    ``targets`` is ignored.

    ``vec`` is a vectorizer instance used to transform raw features to the
    input of the estimator (e.g. a fitted CountVectorizer instance); it can
    be passed instead of ``feature_names``.

    Remaining keyword arguments are forwarded to
    `sklearn.tree.export_graphviz`_ (``proportion`` defaults to True here).

    .. _sklearn.tree.export_graphviz: http://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html
    """
    all_names = get_feature_names(estimator, vec, feature_names=feature_names)

    # The tree description always uses the unfiltered names; filtering only
    # applies to the importance table.
    shown_names, kept = all_names.handle_filter(feature_filter, feature_re)
    importances = get_feature_importances_filtered(
        estimator.feature_importances_, shown_names, kept, top)

    export_graphviz_kwargs.setdefault("proportion", True)
    tree_info = get_tree_info(
        estimator,
        feature_names=all_names,
        class_names=target_names,
        **export_graphviz_kwargs)

    return Explanation(
        feature_importances=importances,
        decision_tree=tree_info,
        description=DESCRIPTION_DECISION_TREE,
        estimator=repr(estimator),
        method='decision tree',
    )
def explain_weights_sklearn_crfsuite(crf,
                                     top=20,
                                     target_names=None,
                                     targets=None,
                                     feature_re=None,
                                     feature_filter=None):
    """
    Explain sklearn_crfsuite.CRF weights.

    See :func:`eli5.explain_weights` for description of ``top``,
    ``target_names``, ``targets``, ``feature_re`` and ``feature_filter``
    parameters.

    The explanation contains one target per display name (state feature
    weights) plus the transition coefficients restricted to those targets.
    When ``targets`` is None, classes are ordered with ``sorted_for_ner``.
    """
    feature_names = np.array(crf.attributes_)
    # ``toarray()`` yields a plain ndarray directly, instead of going through
    # an intermediate ``np.matrix`` (``todense().A``).
    # NOTE(review): assumes crf_state_coef returns a scipy sparse container,
    # as the original ``todense()`` call implies - confirm.
    state_coef = crf_state_coef(crf).toarray()
    transition_coef = crf_transition_coef(crf)

    if feature_filter is not None or feature_re is not None:
        state_feature_names, flt_indices = (
            FeatureNames(feature_names).handle_filter(
                feature_filter, feature_re))
        state_feature_names = np.array(state_feature_names.feature_names)
        # keep only the columns of the features that passed the filter
        state_coef = state_coef[:, flt_indices]
    else:
        state_feature_names = feature_names

    def _features(label_id):
        return get_top_features(state_feature_names, state_coef[label_id], top)

    if targets is None:
        targets = sorted_for_ner(crf.classes_)

    display_names = get_target_display_names(crf.classes_, target_names,
                                             targets)
    indices, names = zip(*display_names)
    transition_coef = filter_transition_coefs(transition_coef, indices)

    return Explanation(
        targets=[
            TargetExplanation(
                target=label,
                feature_weights=_features(label_id)
            )
            for label_id, label in zip(indices, names)
        ],
        transition_features=TransitionFeatureWeights(
            class_names=names,
            coef=transition_coef,
        ),
        estimator=repr(crf),
        method='CRF',
    )
def explain_prediction_lightning(estimator, doc, vec=None, top=None,
                                 target_names=None, targets=None,
                                 feature_names=None, vectorized=False,
                                 coef_scale=None):
    """
    Return an :class:`Explanation` reporting that ``estimator`` is not
    supported.

    The message is stored in the explanation's ``description``. All
    parameters other than ``estimator`` are accepted but not used.
    """
    return Explanation(
        estimator=repr(estimator),
        # Wrap the operand in a 1-tuple: with a bare operand, ``%`` treats a
        # tuple ``estimator`` as the argument list, which raises TypeError
        # (or formats only its first element for a 1-tuple).
        description="Error: estimator %r is not supported" % (estimator,),
    )
def explain_decision_tree(
        clf, vec=None, top=_TOP, target_names=None,
        targets=None,  # ignored
        feature_names=None, feature_re=None,
        **export_graphviz_kwargs):
    """
    Explain a decision tree classifier. The result has the following shape
    (compatible with random forest explanations)::

        Explanation(
            estimator="<classifier repr>",
            method="<interpretation method>",
            description="<human readable description>",
            decision_tree={...tree information},
            feature_importances=[
                FeatureWeight(feature_name, importance, std_deviation),
                ...
            ]
        )

    The standard deviation of every importance is reported as zero. Extra
    keyword arguments are forwarded to ``get_tree_info`` (``proportion``
    defaults to True).
    """
    all_names = get_feature_names(clf, vec, feature_names=feature_names)
    importances = clf.feature_importances_

    # The tree description always uses the unfiltered names; the regex
    # filter only applies to the importance table.
    shown_names = all_names
    if feature_re is not None:
        shown_names, kept = all_names.filtered_by_re(feature_re)
        importances = importances[kept]

    top_idx = argsort_k_largest(importances, top)
    top_names = shown_names[top_idx]
    top_values = importances[top_idx]
    zero_std = np.zeros_like(top_values)

    export_graphviz_kwargs.setdefault("proportion", True)
    tree_info = get_tree_info(
        clf,
        feature_names=all_names,
        class_names=target_names,
        **export_graphviz_kwargs)

    return Explanation(
        feature_importances=[
            FeatureWeight(name, value, std)
            for name, value, std in zip(top_names, top_values, zero_std)
        ],
        decision_tree=tree_info,
        description=DESCRIPTION_DECISION_TREE,
        estimator=repr(clf),
        method='decision tree',
    )
def explain_prediction_sklearn_not_supported(
        estimator, doc,
        vec=None,
        top=None,
        top_targets=None,
        target_names=None,
        targets=None,
        feature_names=None,
        feature_re=None,
        feature_filter=None,
        vectorized=False):
    """
    Return an :class:`Explanation` whose ``error`` reports that ``estimator``
    is not supported.

    All parameters other than ``estimator`` are accepted but not used.
    """
    return Explanation(
        estimator=repr(estimator),
        # Wrap the operand in a 1-tuple: with a bare operand, ``%`` treats a
        # tuple ``estimator`` as the argument list, which raises TypeError
        # (or formats only its first element for a 1-tuple).
        error="estimator %r is not supported" % (estimator,),
    )
def explain_rf_feature_importance(
        estimator, vec=None, top=_TOP,
        target_names=None,  # ignored
        targets=None,  # ignored
        feature_names=None, feature_re=None, feature_filter=None,
        ):
    """
    Explain a tree-based ensemble estimator through its feature importances.

    ``top``, ``feature_names``, ``feature_re`` and ``feature_filter`` are
    described in :func:`eli5.explain_weights`. ``target_names`` and
    ``targets`` are ignored.

    ``vec`` is a vectorizer instance used to transform raw features to the
    input of the estimator (e.g. a fitted CountVectorizer instance); it can
    be passed instead of ``feature_names``.

    Each importance is reported with the standard deviation of that
    feature's importance across the ensemble's individual trees.
    """
    all_names = get_feature_names(estimator, vec, feature_names=feature_names)
    importances = estimator.feature_importances_

    # ``estimators_`` is flattened so every individual tree is visited
    members = np.array(estimator.estimators_).ravel()
    importances_std = np.std(
        [member.feature_importances_ for member in members], axis=0)

    shown_names, kept = all_names.handle_filter(feature_filter, feature_re)
    if kept is not None:
        importances = importances[kept]
        importances_std = importances_std[kept]

    top_idx = argsort_k_largest_positive(importances, top)
    top_names = shown_names[top_idx]
    top_values = importances[top_idx]
    top_std = importances_std[top_idx]

    return Explanation(
        feature_importances=FeatureImportances(
            [FeatureWeight(name, value, std)
             for name, value, std in zip(top_names, top_values, top_std)],
            remaining=np.count_nonzero(importances) - len(top_idx),
        ),
        description=DESCRIPTION_RANDOM_FOREST,
        estimator=repr(estimator),
        method='feature importances',
    )
def get_decision_path_explanation(estimator, doc, vec, vectorized,
                                  original_display_names, target_names,
                                  targets, top_targets, is_regression,
                                  is_multiclass, proba,
                                  get_score_feature_weights):
    """
    Build a 'decision paths' :class:`Explanation` for a single document.

    ``get_score_feature_weights(label_id)`` must return a
    ``(score, feature_weights)`` pair. In the multiclass case one target is
    produced per display name; otherwise a single target is produced from
    ``get_score_feature_weights(0)``, named after the last display name and
    carrying ``proba[1]`` when probabilities are available.

    The description is looked up by ``(is_regression, is_multiclass)``; the
    ``(True, True)`` combination has no entry and raises ``KeyError``.
    """
    display_names = get_target_display_names(
        original_display_names, target_names, targets, top_targets, proba)

    descriptions = {
        (False, False): DESCRIPTION_CLF_BINARY,
        (False, True): DESCRIPTION_CLF_MULTICLASS,
        (True, False): DESCRIPTION_REGRESSION,
    }
    explanation = Explanation(
        estimator=repr(estimator),
        method='decision paths',
        description=descriptions[is_regression, is_multiclass],
        is_regression=is_regression,
        targets=[],
    )

    def _append_target(target, score, feature_weights, proba_id):
        # Build one target explanation, attach weighted spans, and record it.
        target_expl = TargetExplanation(
            target=target,
            feature_weights=feature_weights,
            score=score,
            proba=None if proba is None else proba[proba_id],
        )
        add_weighted_spans(doc, vec, vectorized, target_expl)
        explanation.targets.append(target_expl)

    if not is_multiclass:
        score, feature_weights = get_score_feature_weights(0)
        _append_target(display_names[-1][1], score, feature_weights, 1)
        return explanation

    for label_id, label in display_names:
        score, feature_weights = get_score_feature_weights(label_id)
        _append_target(label, score, feature_weights, label_id)
    return explanation
def explain_rf_feature_importance(
        clf, vec=None, top=_TOP,
        target_names=None,  # ignored
        targets=None,  # ignored
        feature_names=None, feature_re=None):
    """
    Explain a tree-based ensemble classifier. The result has the following
    shape::

        Explanation(
            estimator="<classifier repr>",
            method="<interpretation method>",
            description="<human readable description>",
            feature_importances=[
                FeatureWeight(feature_name, importance, std_deviation),
                ...
            ]
        )

    ``std_deviation`` is the standard deviation of the feature's importance
    across the ensemble's individual trees.
    """
    shown_names = get_feature_names(clf, vec, feature_names=feature_names)
    importances = clf.feature_importances_

    # ``estimators_`` is flattened so every individual tree is visited
    members = np.array(clf.estimators_).ravel()
    importances_std = np.std(
        [member.feature_importances_ for member in members], axis=0)

    if feature_re is not None:
        shown_names, kept = shown_names.filtered_by_re(feature_re)
        importances = importances[kept]
        importances_std = importances_std[kept]

    top_idx = argsort_k_largest(importances, top)
    top_names = shown_names[top_idx]
    top_values = importances[top_idx]
    top_std = importances_std[top_idx]

    return Explanation(
        feature_importances=[
            FeatureWeight(name, value, std)
            for name, value, std in zip(top_names, top_values, top_std)
        ],
        description=DESCRIPTION_RANDOM_FOREST,
        estimator=repr(clf),
        method='feature importances',
    )
def explain_predictions(self, docs, top=30):
    """
    Explain predictions for ``docs`` as feature weights averaged over
    documents.

    For every document, weights obtained from ``_target_feature_weights``
    are kept only for features that are non-zero in that document, plus the
    bias; the per-document weight vectors are then averaged and the ``top``
    features are reported under a single target ``'y'``.

    Raises ``NotImplementedError`` unless ``self.clf`` is an XGBClassifier.
    """
    if not isinstance(self.clf, XGBClassifier):
        raise NotImplementedError

    booster = self.clf.booster()
    # booster feature name -> column position
    xgb_feature_names = {
        name: position
        for position, name in enumerate(booster.feature_names)
    }
    feature_names = get_feature_names(
        self.clf, self.vec, num_features=len(xgb_feature_names))
    feature_names.bias_name = '<BIAS>'

    X = self.vec.transform(docs).tocsc()
    leaf_ids = booster.predict(
        DMatrix(X, missing=self.clf.missing), pred_leaf=True)
    tree_dumps = booster.get_dump(with_stats=True)

    def _document_weights(row, doc_leaf_ids):
        # Weights restricted to the document's non-zero features and bias.
        all_weights = _target_feature_weights(
            doc_leaf_ids, tree_dumps,
            feature_names=feature_names,
            xgb_feature_names=xgb_feature_names)[1]
        kept = np.zeros_like(all_weights)
        present = X[row].nonzero()[1]
        bias_idx = feature_names.bias_idx
        kept[present] = all_weights[present]
        kept[bias_idx] = all_weights[bias_idx]
        return kept

    docs_weights = []
    for row, doc_leaf_ids in enumerate(leaf_ids):
        docs_weights.append(_document_weights(row, doc_leaf_ids))
    mean_weights = np.mean(docs_weights, axis=0)

    pretty_names = np.array([_prettify_feature(f) for f in feature_names])
    feature_weights = get_top_features(
        feature_names=pretty_names, coef=mean_weights, top=top)
    return Explanation(
        estimator=type(self.clf).__name__,
        targets=[TargetExplanation('y', feature_weights=feature_weights)],
    )
def explain_prediction_tree_regressor(
        reg, doc,
        vec=None,
        top=None,
        top_targets=None,
        target_names=None,
        targets=None,
        feature_names=None,
        feature_re=None,
        feature_filter=None,
        vectorized=False):
    """
    Explain prediction of a tree regressor.

    ``top``, ``top_targets``, ``target_names``, ``targets``,
    ``feature_names``, ``feature_re`` and ``feature_filter`` are described
    in :func:`eli5.explain_prediction`.

    ``vec`` is a vectorizer instance used to transform raw features to the
    input of the regressor ``reg`` (e.g. a fitted CountVectorizer instance);
    it can be passed instead of ``feature_names``.

    ``vectorized`` tells eli5 whether ``doc`` should be passed through
    ``vec``. It is False by default, meaning that if ``vec`` is not None,
    ``vec.transform([doc])`` is passed to the regressor. Set it to True when
    ``vec`` is passed but ``doc`` is already vectorized.

    The method for determining feature importances follows an idea from
    http://blog.datadive.net/interpreting-random-forests/.
    Feature weights are calculated by following decision paths in trees
    of an ensemble (or a single tree for DecisionTreeRegressor).
    Each node of the tree has an output score, and the contribution of a
    feature on the decision path is how much the score changes from parent
    to child. Weights of all features sum to the output score of the
    estimator.
    """
    vec, feature_names = handle_vec(reg, doc, vec, vectorized, feature_names)
    X = get_X(doc, vec=vec, vectorized=vectorized)
    if feature_names.bias_name is None:
        # Tree estimators do not have an intercept, but here we interpret
        # them as having an intercept
        feature_names.bias_name = '<BIAS>'

    score, = reg.predict(X)
    num_targets = getattr(reg, 'n_outputs_', 1)
    is_multitarget = num_targets > 1
    all_weights = _trees_feature_weights(reg, X, feature_names, num_targets)
    x = get_X0(add_intercept(X))
    kept_names, kept_indices = feature_names.handle_filter(
        feature_filter, feature_re, x)

    def _top_weights(label_id):
        # one column of weights per target
        return get_top_features_filtered(
            x, kept_names, kept_indices, all_weights[:, label_id], top, 1.0)

    if is_multitarget:
        description = DESCRIPTION_TREE_REG_MULTITARGET
    else:
        description = DESCRIPTION_TREE_REG
    res = Explanation(
        estimator=repr(reg),
        method='decision path',
        description=description,
        targets=[],
        is_regression=True,
    )
    assert res.targets is not None

    default_names = get_default_target_names(reg, num_targets=num_targets)
    display_names = get_target_display_names(
        default_names, target_names, targets, top_targets, score)

    if not is_multitarget:
        # Single-target case: first display name, weights for label 0.
        single_expl = TargetExplanation(
            target=display_names[0][1],
            feature_weights=_top_weights(0),
            score=score,
        )
        add_weighted_spans(doc, vec, vectorized, single_expl)
        res.targets.append(single_expl)
        return res

    for label_id, label in display_names:
        label_expl = TargetExplanation(
            target=label,
            feature_weights=_top_weights(label_id),
            score=score[label_id],
        )
        add_weighted_spans(doc, vec, vectorized, label_expl)
        res.targets.append(label_expl)
    return res