def odd_feature_names(transformer, in_names=None):
    if in_names is None:
        from eli5.sklearn.utils import get_feature_names
        # generate default feature names
        in_names = get_feature_names(
            transformer, num_features=transformer.n_features_)
    # return a list of strings derived from in_names
    return in_names[1::2]
def _col_tfm_names(transformer, in_names=None):
    if in_names is None:
        from eli5.sklearn.utils import get_feature_names
        # generate default feature names
        in_names = get_feature_names(
            transformer, num_features=transformer._n_features)
    # return a list of strings derived from in_names
    feature_names = []
    for name, trans, column, _ in transformer._iter(fitted=True):
        if hasattr(transformer, '_df_columns'):
            if ((not isinstance(column, slice))
                    and all(isinstance(col, str) for col in column)):
                names = column
            else:
                names = transformer._df_columns[column]
        else:
            indices = np.arange(transformer._n_features)
            names = ['x%d' % i for i in indices[column]]
        # TODO: allow overriding the generated names with in_names

        if trans == 'drop' or (
                hasattr(column, '__len__') and not len(column)):
            continue
        if trans == 'passthrough':
            feature_names.extend(names)
            continue
        feature_names.extend([
            name + "__" + f
            for f in transform_feature_names(trans, in_names=names)
        ])
    return feature_names
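
# Hedged usage sketch for _col_tfm_names, assuming this handler is registered
# for ColumnTransformer via transform_feature_names.register: output names
# follow the "<transformer_name>__<input_name>" convention, passthrough
# columns keep their input names, and 'drop'/empty columns are skipped.
# The DataFrame and transformers below are illustrative only.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from eli5 import transform_feature_names

df = pd.DataFrame({'a': [0., 1., 2., 3.],
                   'b': [1., 0., 1., 0.],
                   'c': [5., 6., 7., 8.]})
y = [0, 1, 0, 1]
ct = ColumnTransformer([
    ('sel', SelectKBest(f_classif, k=1), ['a', 'b']),
    ('keep', 'passthrough', ['c']),
]).fit(df, y)
print(transform_feature_names(ct))  # e.g. ['sel__a', 'c']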
def _handle_vec(clf, doc, vec, vectorized, feature_names):
    if isinstance(vec, HashingVectorizer) and not vectorized:
        vec = InvertableHashingVectorizer(vec)
        vec.fit([doc])
    if is_invhashing(vec) and feature_names is None:
        # Explaining predictions does not need coef_scale,
        # because it is handled by the vectorizer.
        feature_names = vec.get_feature_names(always_signed=False)
    feature_names = get_feature_names(clf, vec, feature_names=feature_names)
    return vec, feature_names
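
# A minimal sketch of the HashingVectorizer branch above: wrapping the
# vectorizer in InvertableHashingVectorizer and fitting it on the document
# makes hashed columns invertible back to readable terms. The corpus here
# is illustrative.
from sklearn.feature_extraction.text import HashingVectorizer
from eli5.sklearn import InvertableHashingVectorizer

vec = HashingVectorizer(n_features=2 ** 10)
ivec = InvertableHashingVectorizer(vec)
ivec.fit(['hello world hello'])
# FeatureNames recovered from the hashed terms, unsigned
feature_names = ivec.get_feature_names(always_signed=False)
print(len(feature_names))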
def explain_decision_tree(
        estimator,
        vec=None,
        top=_TOP,
        target_names=None,
        targets=None,  # ignored
        feature_names=None,
        feature_re=None,
        feature_filter=None,
        **export_graphviz_kwargs):
    """
    Return an explanation of a decision tree.

    See :func:`eli5.explain_weights` for description of
    ``top``, ``target_names``, ``feature_names``,
    ``feature_re`` and ``feature_filter`` parameters.

    ``targets`` parameter is ignored.

    ``vec`` is a vectorizer instance used to transform
    raw features to the input of the estimator
    (e.g. a fitted CountVectorizer instance); you can pass it
    instead of ``feature_names``.

    All other keyword arguments are passed to
    `sklearn.tree.export_graphviz`_ function.

    .. _sklearn.tree.export_graphviz: http://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html
    """
    feature_names = get_feature_names(estimator, vec,
                                      feature_names=feature_names)
    coef = estimator.feature_importances_
    tree_feature_names = feature_names
    feature_names, flt_indices = feature_names.handle_filter(
        feature_filter, feature_re)
    if flt_indices is not None:
        coef = coef[flt_indices]
    indices = argsort_k_largest_positive(coef, top)
    names, values = feature_names[indices], coef[indices]
    export_graphviz_kwargs.setdefault("proportion", True)
    tree_info = get_tree_info(
        estimator,
        feature_names=tree_feature_names,
        class_names=target_names,
        **export_graphviz_kwargs)
    return Explanation(
        feature_importances=FeatureImportances(
            [FeatureWeight(*x) for x in zip(names, values)],
            remaining=np.count_nonzero(coef) - len(indices),
        ),
        decision_tree=tree_info,
        description=DESCRIPTION_DECISION_TREE,
        estimator=repr(estimator),
        method='decision tree',
    )
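
# Hedged usage sketch: eli5.explain_weights dispatches to
# explain_decision_tree for decision tree estimators. The dataset and
# feature names below are illustrative, not part of the function above.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import eli5

X, y = load_iris(return_X_y=True)
tree = DecisionTreeClassifier(max_depth=3).fit(X, y)
expl = eli5.explain_weights(
    tree, feature_names=['sep_len', 'sep_wid', 'pet_len', 'pet_wid'])
print(eli5.format_as_text(expl))  # importances plus a text dump of the tree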
def explain_weights_pipeline(estimator, feature_names=None, **kwargs):
    last_estimator = estimator.steps[-1][1]
    transform_pipeline = Pipeline(estimator.steps[:-1])
    if 'vec' in kwargs:
        feature_names = get_feature_names(feature_names, vec=kwargs.pop('vec'))
    feature_names = transform_feature_names(transform_pipeline, feature_names)
    out = explain_weights(last_estimator,
                          feature_names=feature_names,
                          **kwargs)
    out.estimator = repr(estimator)
    return out
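
# Hedged sketch: explaining a fitted Pipeline end to end. Feature names are
# mapped through the transform steps via transform_feature_names before the
# final estimator is explained. The toy corpus is illustrative.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import eli5

docs = ['good movie', 'bad movie', 'good plot']
pipe = make_pipeline(CountVectorizer(), LogisticRegression())
pipe.fit(docs, [1, 0, 1])
print(eli5.format_as_text(eli5.explain_weights(pipe)))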
def explain_weights_xgboost(xgb,
                            vec=None,
                            top=20,
                            target_names=None,  # ignored
                            targets=None,  # ignored
                            feature_names=None,
                            feature_re=None,
                            feature_filter=None,
                            importance_type='gain',
                            ):
    """
    Return an explanation of an XGBoost estimator (via scikit-learn wrapper
    XGBClassifier or XGBRegressor) as feature importances.

    See :func:`eli5.explain_weights` for description of
    ``top``, ``feature_names``, ``feature_re`` and ``feature_filter``
    parameters.

    ``target_names`` and ``targets`` parameters are ignored.

    Parameters
    ----------
    importance_type : str, optional
        A way to get feature importance. Possible values are:

        - 'gain' - the average gain of the feature when it is used in trees
          (default)
        - 'weight' - the number of times a feature is used to split the data
          across all trees
        - 'cover' - the average coverage of the feature when it is used in
          trees
    """
    coef = _xgb_feature_importances(xgb, importance_type=importance_type)
    num_features = coef.shape[-1]
    feature_names = get_feature_names(
        xgb, vec, feature_names=feature_names, num_features=num_features)
    feature_names, flt_indices = feature_names.handle_filter(
        feature_filter, feature_re)
    if flt_indices is not None:
        coef = coef[flt_indices]
    indices = argsort_k_largest_positive(coef, top)
    names, values = feature_names[indices], coef[indices]
    return Explanation(
        feature_importances=FeatureImportances(
            [FeatureWeight(*x) for x in zip(names, values)],
            remaining=np.count_nonzero(coef) - len(indices),
        ),
        description=DESCRIPTION_XGBOOST,
        estimator=repr(xgb),
        method='feature importances',
        is_regression=isinstance(xgb, XGBRegressor),
    )
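
# Hedged sketch comparing the three importance_type values on the same
# fitted model (assumes the xgboost package is installed; data is synthetic).
from sklearn.datasets import make_classification
from xgboost import XGBClassifier
import eli5

X, y = make_classification(n_features=10, random_state=0)
xgb = XGBClassifier(n_estimators=50).fit(X, y)
for imp_type in ('gain', 'weight', 'cover'):
    expl = eli5.explain_weights(xgb, importance_type=imp_type, top=5)
    print(imp_type)
    print(eli5.format_as_text(expl))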
def explain_decision_tree(
        estimator,
        vec=None,
        top=_TOP,
        target_names=None,
        targets=None,  # ignored
        feature_names=None,
        feature_re=None,
        feature_filter=None,
        **export_graphviz_kwargs):
    """
    Return an explanation of a decision tree.

    See :func:`eli5.explain_weights` for description of
    ``top``, ``target_names``, ``feature_names``,
    ``feature_re`` and ``feature_filter`` parameters.

    ``targets`` parameter is ignored.

    ``vec`` is a vectorizer instance used to transform
    raw features to the input of the estimator
    (e.g. a fitted CountVectorizer instance); you can pass it
    instead of ``feature_names``.

    All other keyword arguments are passed to
    `sklearn.tree.export_graphviz`_ function.

    .. _sklearn.tree.export_graphviz: http://scikit-learn.org/stable/modules/generated/sklearn.tree.export_graphviz.html
    """
    feature_names = get_feature_names(estimator, vec,
                                      feature_names=feature_names)
    tree_feature_names = feature_names
    feature_names, flt_indices = feature_names.handle_filter(
        feature_filter, feature_re)
    feature_importances = get_feature_importances_filtered(
        estimator.feature_importances_, feature_names, flt_indices, top)
    export_graphviz_kwargs.setdefault("proportion", True)
    tree_info = get_tree_info(
        estimator,
        feature_names=tree_feature_names,
        class_names=target_names,
        **export_graphviz_kwargs)
    return Explanation(
        feature_importances=feature_importances,
        decision_tree=tree_info,
        description=DESCRIPTION_DECISION_TREE,
        estimator=repr(estimator),
        method='decision tree',
    )
def explain_decision_tree(clf,
                          vec=None,
                          top=_TOP,
                          target_names=None,
                          targets=None,  # ignored
                          feature_names=None,
                          feature_re=None,
                          **export_graphviz_kwargs):
    """
    Return an explanation of a decision tree classifier in the
    following format (compatible with random forest explanations)::

        Explanation(
            estimator="<classifier repr>",
            method="<interpretation method>",
            description="<human readable description>",
            decision_tree={...tree information},
            feature_importances=[
                FeatureWeight(feature_name, importance, std_deviation),
                ...
            ]
        )
    """
    feature_names = get_feature_names(clf, vec, feature_names=feature_names)
    coef = clf.feature_importances_
    tree_feature_names = feature_names
    if feature_re is not None:
        feature_names, flt_indices = feature_names.filtered_by_re(feature_re)
        coef = coef[flt_indices]
    indices = argsort_k_largest(coef, top)
    names, values = feature_names[indices], coef[indices]
    std = np.zeros_like(values)
    export_graphviz_kwargs.setdefault("proportion", True)
    tree_info = get_tree_info(
        clf,
        feature_names=tree_feature_names,
        class_names=target_names,
        **export_graphviz_kwargs)
    return Explanation(
        feature_importances=[
            FeatureWeight(*x) for x in zip(names, values, std)
        ],
        decision_tree=tree_info,
        description=DESCRIPTION_DECISION_TREE,
        estimator=repr(clf),
        method='decision tree',
    )
def explain_rf_feature_importance(
        estimator,
        vec=None,
        top=_TOP,
        target_names=None,  # ignored
        targets=None,  # ignored
        feature_names=None,
        feature_re=None,
        feature_filter=None,
        ):
    """
    Return an explanation of a tree-based ensemble estimator.

    See :func:`eli5.explain_weights` for description of
    ``top``, ``feature_names``, ``feature_re`` and ``feature_filter``
    parameters.

    ``target_names`` and ``targets`` parameters are ignored.

    ``vec`` is a vectorizer instance used to transform
    raw features to the input of the estimator
    (e.g. a fitted CountVectorizer instance); you can pass it
    instead of ``feature_names``.
    """
    feature_names = get_feature_names(estimator, vec,
                                      feature_names=feature_names)
    coef = estimator.feature_importances_
    trees = np.array(estimator.estimators_).ravel()
    coef_std = np.std([tree.feature_importances_ for tree in trees], axis=0)
    feature_names, flt_indices = feature_names.handle_filter(
        feature_filter, feature_re)
    if flt_indices is not None:
        coef = coef[flt_indices]
        coef_std = coef_std[flt_indices]
    indices = argsort_k_largest_positive(coef, top)
    names, values, std = (feature_names[indices], coef[indices],
                          coef_std[indices])
    return Explanation(
        feature_importances=FeatureImportances(
            [FeatureWeight(*x) for x in zip(names, values, std)],
            remaining=np.count_nonzero(coef) - len(indices),
        ),
        description=DESCRIPTION_RANDOM_FOREST,
        estimator=repr(estimator),
        method='feature importances',
    )
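
# A minimal sketch of where the std column comes from: it is the per-tree
# spread of feature_importances_ across the ensemble, recomputed here
# directly on a toy forest for comparison.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_features=5, random_state=0)
rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
print(rf.feature_importances_)  # mean importance per feature
print(std)                      # per-tree standard deviation, as above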
def test_get_feature_names():
    docs = ['hello world', 'hello', 'world']

    def _names(*args, **kwargs):
        return set(get_feature_names(*args, **kwargs))

    for y in [[0, 1, 2], [0, 1, 0]]:  # multiclass, binary
        vec = CountVectorizer()
        X = vec.fit_transform(docs)
        clf = LogisticRegression()
        clf.fit(X, y)

        fnames = get_feature_names(clf, vec)
        assert isinstance(fnames, FeatureNames)
        assert repr(fnames) == '<FeatureNames: 2 features with bias>'
        assert _names(clf, vec) == {'hello', 'world', '<BIAS>'}
        assert _names(clf, vec, 'B') == {'hello', 'world', 'B'}
        assert _names(clf) == {'x0', 'x1', '<BIAS>'}
        assert _names(clf, feature_names=['a', 'b']) == {'a', 'b', '<BIAS>'}
        assert _names(clf, feature_names=['a', 'b'],
                      bias_name='bias') == {'a', 'b', 'bias'}
        assert _names(clf, feature_names=np.array(['a', 'b'])) == \
            {'a', 'b', '<BIAS>'}
        assert _names(clf, feature_names=FeatureNames(['a', 'b'])) == \
            {'a', 'b', '<BIAS>'}
        assert _names(clf, feature_names=FeatureNames(
            n_features=2, unkn_template='F%d')) == {'F0', 'F1', '<BIAS>'}

        with pytest.raises(ValueError):
            get_feature_names(clf, feature_names=['a'])
        with pytest.raises(ValueError):
            get_feature_names(clf, feature_names=['a', 'b', 'c'])
        with pytest.raises(ValueError):
            get_feature_names(clf, feature_names=FeatureNames(['a', 'b', 'c']))

        clf2 = LogisticRegression(fit_intercept=False)
        clf2.fit(X, y)
        assert _names(clf2, vec) == {'hello', 'world'}
        assert _names(clf2, feature_names=['hello', 'world']) == \
            {'hello', 'world'}
def explain_rf_feature_importance(clf,
                                  vec=None,
                                  top=_TOP,
                                  target_names=None,  # ignored
                                  targets=None,  # ignored
                                  feature_names=None,
                                  feature_re=None):
    """
    Return an explanation of a tree-based ensemble classifier in the
    following format::

        Explanation(
            estimator="<classifier repr>",
            method="<interpretation method>",
            description="<human readable description>",
            feature_importances=[
                FeatureWeight(feature_name, importance, std_deviation),
                ...
            ]
        )
    """
    feature_names = get_feature_names(clf, vec, feature_names=feature_names)
    coef = clf.feature_importances_
    trees = np.array(clf.estimators_).ravel()
    coef_std = np.std([tree.feature_importances_ for tree in trees], axis=0)
    if feature_re is not None:
        feature_names, flt_indices = feature_names.filtered_by_re(feature_re)
        coef = coef[flt_indices]
        coef_std = coef_std[flt_indices]
    indices = argsort_k_largest(coef, top)
    names, values, std = (feature_names[indices], coef[indices],
                          coef_std[indices])
    return Explanation(
        feature_importances=[
            FeatureWeight(*x) for x in zip(names, values, std)
        ],
        description=DESCRIPTION_RANDOM_FOREST,
        estimator=repr(clf),
        method='feature importances',
    )
def explain_predictions(self, docs, top=30):
    if not isinstance(self.clf, XGBClassifier):
        raise NotImplementedError
    booster = self.clf.booster()
    xgb_feature_names = {f: i for i, f in enumerate(booster.feature_names)}
    feature_names = get_feature_names(self.clf, self.vec,
                                      num_features=len(xgb_feature_names))
    feature_names.bias_name = '<BIAS>'
    X = self.vec.transform(docs)
    X = X.tocsc()
    dmatrix = DMatrix(X, missing=self.clf.missing)
    leaf_ids = booster.predict(dmatrix, pred_leaf=True)
    tree_dumps = booster.get_dump(with_stats=True)
    docs_weights = []
    for i, _leaf_ids in enumerate(leaf_ids):
        all_weights = _target_feature_weights(
            _leaf_ids, tree_dumps,
            feature_names=feature_names,
            xgb_feature_names=xgb_feature_names)[1]
        weights = np.zeros_like(all_weights)
        idx = X[i].nonzero()[1]
        bias_idx = feature_names.bias_idx
        weights[idx] = all_weights[idx]
        weights[bias_idx] = all_weights[bias_idx]
        docs_weights.append(weights)
    weights = np.mean(docs_weights, axis=0)
    feature_weights = get_top_features(
        feature_names=np.array(
            [_prettify_feature(f) for f in feature_names]),
        coef=weights,
        top=top)
    return Explanation(
        estimator=type(self.clf).__name__,
        targets=[TargetExplanation('y', feature_weights=feature_weights)],
    )
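
# Hedged sketch of the pred_leaf call used above: for each row and each tree,
# booster.predict(..., pred_leaf=True) returns the index of the leaf the row
# lands in, which is what _target_feature_weights decomposes into per-feature
# contributions. Synthetic data; note that clf.booster() above is the older
# xgboost API, replaced by clf.get_booster() in newer releases.
import numpy as np
from xgboost import XGBClassifier, DMatrix

rng = np.random.RandomState(0)
X = rng.rand(10, 3)
y = [0, 1] * 5
clf = XGBClassifier(n_estimators=3).fit(X, y)
booster = clf.get_booster()
leaf_ids = booster.predict(DMatrix(X), pred_leaf=True)
print(leaf_ids.shape)  # (n_rows, n_trees)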
def _names(*args, **kwargs):
    return set(get_feature_names(*args, **kwargs))
def test_get_feature_names_1dim_coef():
    clf = SGDRegressor(fit_intercept=False, **SGD_KWARGS)
    X, y = make_regression(n_targets=1, n_features=3)
    clf.fit(X, y)
    assert set(get_feature_names(clf)) == {'x0', 'x1', 'x2'}
def explain_linear_regressor_weights(
        reg,
        vec=None,
        top=_TOP,
        target_names=None,
        targets=None,
        feature_names=None,
        coef_scale=None,
        feature_re=None,
        feature_filter=None,
        ):
    """
    Return an explanation of a linear regressor weights.

    See :func:`eli5.explain_weights` for description of
    ``top``, ``target_names``, ``targets``, ``feature_names``,
    ``feature_re`` and ``feature_filter`` parameters.

    ``vec`` is a vectorizer instance used to transform
    raw features to the input of the regressor ``reg``; you can
    pass it instead of ``feature_names``.

    ``coef_scale`` is a 1D np.ndarray with a scaling coefficient
    for each feature; coef[i] = coef[i] * coef_scale[i] if
    coef_scale[i] is not nan. Use it if you want to scale coefficients
    before displaying them, to take input feature sign or scale in account.
    """
    feature_names, coef_scale = handle_hashing_vec(vec, feature_names,
                                                   coef_scale)
    feature_names = get_feature_names(reg, vec, feature_names=feature_names)
    feature_names, flt_indices = feature_names.handle_filter(
        feature_filter, feature_re)
    _extra_caveats = "\n" + HASHING_CAVEATS if is_invhashing(vec) else ''

    def _features(target_id):
        coef = get_coef(reg, target_id, scale=coef_scale)
        if flt_indices is not None:
            coef = coef[flt_indices]
        return get_top_features(feature_names, coef, top)

    display_names = get_target_display_names(get_default_target_names(reg),
                                             target_names, targets)
    if is_multitarget_regressor(reg):
        return Explanation(
            targets=[
                TargetExplanation(target=target_name,
                                  feature_weights=_features(target_id))
                for target_id, target_name in display_names
            ],
            description=DESCRIPTION_REGRESSION_MULTITARGET + _extra_caveats,
            estimator=repr(reg),
            method='linear model',
            is_regression=True,
        )
    else:
        return Explanation(
            targets=[TargetExplanation(
                target=display_names[0][1],
                feature_weights=_features(0),
            )],
            description=DESCRIPTION_REGRESSION + _extra_caveats,
            estimator=repr(reg),
            method='linear model',
            is_regression=True,
        )
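
# Hedged sketch of the multitarget branch: a multi-target regressor yields
# one TargetExplanation per output ('y0', 'y1', ...), while a single-target
# one yields just 'y'. Synthetic data for illustration.
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
import eli5

X, y = make_regression(n_targets=2, n_features=3, random_state=0)
reg = LinearRegression().fit(X, y)
expl = eli5.explain_weights(reg)
print([t.target for t in expl.targets])  # expected something like ['y0', 'y1']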
def explain_linear_classifier_weights(
        clf,
        vec=None,
        top=_TOP,
        target_names=None,
        targets=None,
        feature_names=None,
        coef_scale=None,
        feature_re=None,
        feature_filter=None,
        ):
    """
    Return an explanation of a linear classifier weights.

    See :func:`eli5.explain_weights` for description of
    ``top``, ``target_names``, ``targets``, ``feature_names``,
    ``feature_re`` and ``feature_filter`` parameters.

    ``vec`` is a vectorizer instance used to transform
    raw features to the input of the classifier ``clf``
    (e.g. a fitted CountVectorizer instance); you can pass it
    instead of ``feature_names``.

    ``coef_scale`` is a 1D np.ndarray with a scaling coefficient
    for each feature; coef[i] = coef[i] * coef_scale[i] if
    coef_scale[i] is not nan. Use it if you want to scale coefficients
    before displaying them, to take input feature sign or scale in account.
    """
    feature_names, coef_scale = handle_hashing_vec(vec, feature_names,
                                                   coef_scale)
    feature_names = get_feature_names(clf, vec, feature_names=feature_names)
    feature_names, flt_indices = feature_names.handle_filter(
        feature_filter, feature_re)
    _extra_caveats = "\n" + HASHING_CAVEATS if is_invhashing(vec) else ''

    def _features(label_id):
        coef = get_coef(clf, label_id, scale=coef_scale)
        if flt_indices is not None:
            coef = coef[flt_indices]
        return get_top_features(feature_names, coef, top)

    display_names = get_target_display_names(clf.classes_, target_names,
                                             targets)
    if is_multiclass_classifier(clf):
        return Explanation(
            targets=[
                TargetExplanation(target=label,
                                  feature_weights=_features(label_id))
                for label_id, label in display_names
            ],
            description=DESCRIPTION_CLF_MULTICLASS + _extra_caveats,
            estimator=repr(clf),
            method='linear model',
        )
    else:
        # for binary classifiers scikit-learn stores a single coefficient
        # vector, which corresponds to clf.classes_[1].
        return Explanation(
            targets=[TargetExplanation(
                target=display_names[1][1],
                feature_weights=_features(0),
            )],
            description=DESCRIPTION_CLF_BINARY + _extra_caveats,
            estimator=repr(clf),
            method='linear model',
        )
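
# Hedged sketch of the binary branch: scikit-learn stores a single
# coefficient row for binary problems, so the explanation has one target,
# attributed to clf.classes_[1]. Toy corpus for illustration.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import eli5

docs = ['hello world', 'hello', 'world']
vec = CountVectorizer()
X = vec.fit_transform(docs)
clf = LogisticRegression().fit(X, [1, 0, 1])
expl = eli5.explain_weights(clf, vec=vec)
print([t.target for t in expl.targets])  # a single target: clf.classes_[1]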
def explain_linear_regressor_weights(reg,
                                     vec=None,
                                     top=_TOP,
                                     target_names=None,
                                     targets=None,
                                     feature_names=None,
                                     coef_scale=None,
                                     feature_re=None):
    """
    Return an explanation of a linear regressor weights in the following
    format::

        Explanation(
            estimator="<regressor repr>",
            method="<interpretation method>",
            description="<human readable description>",
            targets=[
                TargetExplanation(
                    target="<target name>",
                    feature_weights=FeatureWeights(
                        # positive weights
                        pos=[(feature_name, coefficient), ...],
                        # negative weights
                        neg=[(feature_name, coefficient), ...],
                        # A number of features not shown
                        pos_remaining = <int>,
                        neg_remaining = <int>,
                        # Sum of feature weights not shown
                        # pos_remaining_sum = <float>,
                        # neg_remaining_sum = <float>,
                    ),
                ),
                ...
            ]
        )

    To print it use utilities from eli5.formatters.
    """
    feature_names, coef_scale = handle_hashing_vec(vec, feature_names,
                                                   coef_scale)
    feature_names = get_feature_names(reg, vec, feature_names=feature_names)
    if feature_re is not None:
        feature_names, flt_indices = feature_names.filtered_by_re(feature_re)
    _extra_caveats = "\n" + HASHING_CAVEATS if is_invhashing(vec) else ''

    def _features(target_id):
        coef = get_coef(reg, target_id, scale=coef_scale)
        if feature_re is not None:
            coef = coef[flt_indices]
        return get_top_features(feature_names, coef, top)

    display_names = get_display_names(get_default_target_names(reg),
                                      target_names, targets)
    if is_multitarget_regressor(reg):
        return Explanation(
            targets=[
                TargetExplanation(target=target_name,
                                  feature_weights=_features(target_id))
                for target_id, target_name in display_names
            ],
            description=DESCRIPTION_REGRESSION_MULTITARGET + _extra_caveats,
            estimator=repr(reg),
            method='linear model',
            is_regression=True,
        )
    else:
        return Explanation(
            targets=[TargetExplanation(
                target=display_names[0][1],
                feature_weights=_features(0),
            )],
            description=DESCRIPTION_REGRESSION + _extra_caveats,
            estimator=repr(reg),
            method='linear model',
            is_regression=True,
        )
def explain_linear_classifier_weights(clf,
                                      vec=None,
                                      top=_TOP,
                                      target_names=None,
                                      targets=None,
                                      feature_names=None,
                                      coef_scale=None,
                                      feature_re=None):
    """
    Return an explanation of a linear classifier weights in the following
    format::

        Explanation(
            estimator="<classifier repr>",
            method="<interpretation method>",
            description="<human readable description>",
            targets=[
                TargetExplanation(
                    target="<class name>",
                    feature_weights=FeatureWeights(
                        # positive weights
                        pos=[(feature_name, coefficient), ...],
                        # negative weights
                        neg=[(feature_name, coefficient), ...],
                        # A number of features not shown
                        pos_remaining = <int>,
                        neg_remaining = <int>,
                        # Sum of feature weights not shown
                        # pos_remaining_sum = <float>,
                        # neg_remaining_sum = <float>,
                    ),
                ),
                ...
            ]
        )

    To print it use utilities from eli5.formatters.
    """
    feature_names, coef_scale = handle_hashing_vec(vec, feature_names,
                                                   coef_scale)
    feature_names = get_feature_names(clf, vec, feature_names=feature_names)
    if feature_re is not None:
        feature_names, flt_indices = feature_names.filtered_by_re(feature_re)
    _extra_caveats = "\n" + HASHING_CAVEATS if is_invhashing(vec) else ''

    def _features(label_id):
        coef = get_coef(clf, label_id, scale=coef_scale)
        if feature_re is not None:
            coef = coef[flt_indices]
        return get_top_features(feature_names, coef, top)

    display_names = get_display_names(clf.classes_, target_names, targets)
    if is_multiclass_classifier(clf):
        return Explanation(
            targets=[
                TargetExplanation(target=label,
                                  feature_weights=_features(label_id))
                for label_id, label in display_names
            ],
            description=DESCRIPTION_CLF_MULTICLASS + _extra_caveats,
            estimator=repr(clf),
            method='linear model',
        )
    else:
        # for binary classifiers scikit-learn stores a single coefficient
        # vector, which corresponds to clf.classes_[1].
        return Explanation(
            targets=[TargetExplanation(
                target=display_names[1][1],
                feature_weights=_features(0),
            )],
            description=DESCRIPTION_CLF_BINARY + _extra_caveats,
            estimator=repr(clf),
            method='linear model',
        )