def aux_feat_imp_SFI(feat_names, clf, X, cont, scoring, cv_gen):
    """Single Feature Importance (SFI): out-of-sample score of each feature alone.

    Substitution effects can lead us to discard important features that happen
    to be redundant. That is generally not a problem for prediction, but can
    lead to wrong conclusions when trying to understand, improve, or simplify
    a model. SFI is a cross-section predictive-importance (out-of-sample, OOS)
    method that computes the OOS performance score of each feature in isolation:
    - can be applied to any classifier, not only tree-based classifiers
    - not limited to accuracy as the sole performance score
    - unlike MDI and MDA, no substitution effects take place, since only one
      feature is taken into consideration at a time

    Parameters
    ----------
    feat_names : iterable of str
        Column names of X to evaluate one at a time.
    clf : classifier
        Any scikit-learn-style classifier passed through to ``cv_score``.
    X : pd.DataFrame
        Feature matrix; only one column is used per iteration.
    cont : pd.DataFrame
        Must contain 'bin' (labels) and 'w' (sample weights).
    scoring : str
        Scoring method forwarded to ``cv_score``.
    cv_gen : cross-validation generator
        e.g. a PurgedKFold instance.

    Returns
    -------
    pd.DataFrame indexed by feature name with columns 'mean' (mean CV score)
    and 'std' (standard error of the mean CV score).
    """
    imp = pd.DataFrame(columns=['mean', 'std'])
    for feat_name in feat_names:
        # Cross-validation score using only this single feature as input.
        print("Testing feature: {}".format(feat_name))
        scores = cv_score(clf, X=X[[feat_name]], y=cont['bin'],
                          sample_weight=cont['w'], scoring=scoring,
                          cv_gen=cv_gen)
        imp.loc[feat_name, 'mean'] = scores.mean()
        # Standard error of the mean CV score: std / sqrt(n_folds).
        # BUGFIX: the original multiplied by sqrt(n_folds), which overstates
        # the dispersion by a factor of n_folds.
        imp.loc[feat_name, 'std'] = scores.std() * scores.shape[0] ** -0.5
        print("Finished feature: {}".format(feat_name))
    print("completed aux imp SFI")
    return imp
def feat_importance(X, cont, clf=None, n_estimators=1000, n_splits=10,
                    max_samples=1., num_threads=24, pct_embargo=0.,
                    scoring='accuracy', method='SFI', min_w_leaf=0., **kwargs):
    """Compute feature importance via MDI, MDA, or SFI with purged CV.

    Takes in (or builds) a classifier and a type of importance score, then
    runs the chosen importance method along with a cross-validated
    out-of-sample score.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix.
    cont : pd.DataFrame
        Must contain 'bin' (labels), 'w' (sample weights) and 't1'
        (label end-times, used for purging/embargo).
    clf : classifier, optional
        If None, a bagged entropy decision tree ensemble is built.
    n_estimators : int
        Number of trees for the default bagging ensemble.
    n_splits : int
        Number of cross-validation folds.
    max_samples : float
        Bagging subsample fraction.
    num_threads : int
        Degree of parallelism; >1 maps to n_jobs=-1 for the ensemble and is
        forwarded to ``mp_pandas_obj`` for the SFI loop.
    pct_embargo : float
        Embargo fraction for purged cross-validation.
    scoring : str
        Scoring method ('accuracy', 'neg_log_loss', ...).
    method : str
        One of 'MDI', 'MDA', 'SFI'.
    min_w_leaf : float
        min_weight_fraction_leaf for the default base tree.

    Returns
    -------
    (imp, oob, oos) : importance DataFrame, out-of-bag score (or None),
    and mean out-of-sample CV score.

    Raises
    ------
    ValueError
        If ``method`` is not one of 'MDI', 'MDA', 'SFI'.
    """
    n_jobs = (-1 if num_threads > 1 else 1)
    # Build the default classifier: bagged single-feature entropy trees.
    # max_features=1 (int) in the tree: one feature considered per split;
    # max_features=1. (float) in the bagger: all features given to each tree.
    if clf is None:
        base_clf = DecisionTreeClassifier(criterion='entropy', max_features=1,
                                          class_weight='balanced',
                                          min_weight_fraction_leaf=min_w_leaf)
        clf = BaggingClassifier(base_estimator=base_clf,
                                n_estimators=n_estimators, max_features=1.,
                                max_samples=max_samples, oob_score=True,
                                n_jobs=n_jobs)
    fit_clf = clf.fit(X, cont['bin'], sample_weight=cont['w'].values)
    # Out-of-bag score only exists for bagging-style ensembles.
    oob = fit_clf.oob_score_ if hasattr(fit_clf, 'oob_score_') else None
    # cv_score uses purged/embargoed folds, i.e. true out-of-sample sets.
    if method == 'MDI':
        imp = feat_imp_MDI(fit_clf, feat_names=X.columns)
        oos = cv_score(clf, X=X, y=cont['bin'], n_splits=n_splits,
                       sample_weight=cont['w'], t1=cont['t1'],
                       pct_embargo=pct_embargo, scoring=scoring).mean()
    elif method == 'MDA':
        imp, oos = feat_imp_MDA(clf, X=X, y=cont['bin'], n_splits=n_splits,
                                sample_weight=cont['w'], t1=cont['t1'],
                                pct_embargo=pct_embargo, scoring=scoring)
    elif method == 'SFI':
        cv_gen = PurgedKFold(n_splits=n_splits, t1=cont['t1'],
                             pct_embargo=pct_embargo)
        # BUGFIX: take the mean so oos is a scalar, consistent with the
        # MDI/MDA branches (previously the raw per-fold array was returned).
        oos = cv_score(clf, X=X, y=cont['bin'], sample_weight=cont['w'],
                       scoring=scoring, cv_gen=cv_gen).mean()
        # BUGFIX: was hard-coded to 24, ignoring num_threads; parallelism in
        # the SFI pass comes from mp_pandas_obj, so the ensemble fits serially
        # when num_threads <= 1 and uses all cores otherwise.
        clf.n_jobs = n_jobs
        imp = mp_pandas_obj(aux_feat_imp_SFI, ('feat_names', X.columns),
                            num_threads, clf=clf, X=X, cont=cont,
                            scoring=scoring, cv_gen=cv_gen)
    else:
        # Previously an unknown method fell through to a NameError.
        raise ValueError("unknown method: {}".format(method))
    return imp, oob, oos