def feat_importance(X, cont, clf=None, n_estimators=1000, n_splits=10,
                    max_samples=1., num_threads=24, pct_embargo=0.,
                    scoring='accuracy', method='SFI', min_w_leaf=0.,
                    **kwargs):
    """Fit a classifier and compute feature importance with one method.

    :param X: feature matrix; must expose ``.columns`` (used as the feature
        names) and be accepted by ``clf.fit``.
    :param cont: container indexed by ``'bin'`` (labels), ``'w'`` (sample
        weights, read through ``.values`` for the fit) and ``'t1'``
        (forwarded to the purged cross-validation helpers).
    :param clf: classifier to use. When ``None``, a ``BaggingClassifier`` of
        entropy-criterion decision trees is built from ``n_estimators``,
        ``max_samples`` and ``min_w_leaf``.
    :param n_estimators: number of estimators of the default bagging
        classifier (ignored when ``clf`` is given).
    :param n_splits: number of cross-validation splits.
    :param max_samples: ``max_samples`` of the default bagging classifier
        (ignored when ``clf`` is given).
    :param num_threads: values above 1 make the default classifier use
        ``n_jobs=-1``; also forwarded to ``mp_pandas_obj`` for ``'SFI'``.
    :param pct_embargo: embargo fraction forwarded to the cross-validation
        helpers.
    :param scoring: scoring name forwarded to the scoring helpers.
    :param method: one of ``'MDI'``, ``'MDA'`` or ``'SFI'`` (case-sensitive).
    :param min_w_leaf: ``min_weight_fraction_leaf`` of the default trees
        (ignored when ``clf`` is given).
    :param kwargs: accepted for call compatibility; not used.
    :return: tuple ``(imp, oob, oos)`` - the importance table, the fitted
        classifier's ``oob_score_`` (``None`` when it has no such
        attribute), and the out-of-sample score.
    :raises ValueError: if ``method`` is not a supported name.
    """
    # Fail fast: an unsupported method used to surface only after the full
    # fit, as an UnboundLocalError on the return statement.
    supported_methods = ('MDI', 'MDA', 'SFI')
    if method not in supported_methods:
        raise ValueError(
            "Unknown method {!r}; expected one of {}".format(
                method, supported_methods))

    n_jobs = -1 if num_threads > 1 else 1

    # Build the default classifier only when the caller did not supply one.
    if clf is None:
        base_clf = DecisionTreeClassifier(
            criterion='entropy',
            max_features=1,
            class_weight='balanced',
            min_weight_fraction_leaf=min_w_leaf,
        )
        # NOTE(review): scikit-learn renamed `base_estimator` to `estimator`
        # (1.2) and later removed the old name - confirm the pinned version.
        clf = BaggingClassifier(
            base_estimator=base_clf,
            n_estimators=n_estimators,
            max_features=1.,
            max_samples=max_samples,
            oob_score=True,
            n_jobs=n_jobs,
        )

    fit_clf = clf.fit(X, cont['bin'], sample_weight=cont['w'].values)
    # A user-supplied classifier may not expose an out-of-bag score.
    oob = getattr(fit_clf, 'oob_score_', None)

    if method == 'MDI':
        imp = feat_imp_MDI(fit_clf, feat_names=X.columns)
        oos = cv_score(clf, X=X, y=cont['bin'], n_splits=n_splits,
                       sample_weight=cont['w'], t1=cont['t1'],
                       pct_embargo=pct_embargo, scoring=scoring).mean()
    elif method == 'MDA':
        imp, oos = feat_imp_MDA(clf, X=X, y=cont['bin'], n_splits=n_splits,
                                sample_weight=cont['w'], t1=cont['t1'],
                                pct_embargo=pct_embargo, scoring=scoring)
    else:  # method == 'SFI'
        cv_gen = PurgedKFold(n_splits=n_splits, t1=cont['t1'],
                             pct_embargo=pct_embargo)
        # NOTE(review): unlike the MDI branch, this score is not averaged
        # with .mean(); left as-is so the returned value keeps its shape.
        oos = cv_score(clf, X=X, y=cont['bin'], sample_weight=cont['w'],
                       scoring=scoring, cv_gen=cv_gen)
        # Mutates the classifier (including a caller-supplied one);
        # presumably so the parallelism comes from mp_pandas_obj instead of
        # the estimator itself - TODO confirm.
        clf.n_jobs = 1
        imp = mp_pandas_obj(aux_feat_imp_SFI, ('feat_names', X.columns),
                            num_threads, clf=clf, X=X, cont=cont,
                            scoring=scoring, cv_gen=cv_gen)
    return imp, oob, oos
def get_sample_w(t1, num_co_events, close, num_threads=1):
    """
    Snippet 4.10 (page 69) Determination Of Sample Weight By Absolute
    Return Attribution

    Dispatches ``mp_sample_w`` through ``mp_pandas_obj``, splitting the work
    over ``t1.index``.

    :param t1: object whose ``.index`` is split into molecules; also
        forwarded to ``mp_sample_w``.
    :param num_co_events: forwarded unchanged to ``mp_sample_w``.
    :param close: forwarded unchanged to ``mp_sample_w``.
    :param num_threads: forwarded to ``mp_pandas_obj``.
    :return: whatever ``mp_pandas_obj`` returns for this job.
    """
    molecules = ('molecule', t1.index)
    job_kwargs = {
        't1': t1,
        'num_co_events': num_co_events,
        'close': close,
    }
    return mp_pandas_obj(mp_sample_w, molecules, num_threads=num_threads,
                         **job_kwargs)
def get_sample_tw(t1, num_co_events, num_threads=1):
    """
    Calculate sampling weights by dispatching ``mp_sample_tw`` through
    ``mp_pandas_obj``, splitting the work over ``t1.index``.

    :param t1: object whose ``.index`` is split into molecules; also
        forwarded to ``mp_sample_tw``.
    :param num_co_events: forwarded unchanged to ``mp_sample_tw``.
    :param num_threads: forwarded to ``mp_pandas_obj``.
    :return: whatever ``mp_pandas_obj`` returns for this job.
    """
    molecules = ('molecule', t1.index)
    job_kwargs = {
        't1': t1,
        'num_co_events': num_co_events,
    }
    return mp_pandas_obj(mp_sample_tw, molecules, num_threads=num_threads,
                         **job_kwargs)
def get_train_times(train_times, test_times, num_threads=1):
    """Sample train points that do not overlap with the test period.

    The work is delegated to ``mp_train_times`` via ``mp_pandas_obj`` and
    split over ``train_times.index``.

    Params
    ------
    train_times: pd.Series
        Training points, indexed by start time with end times as values.
    test_times: pd.Series
        Testing points, indexed by start time with end times as values.
    num_threads: int, default 1
        The number of threads used for multiprocessing.

    Returns
    -------
    pd.Series
    """
    molecules = ('molecule', train_times.index)
    job_kwargs = {
        'train_times': train_times,
        'test_times': test_times,
    }
    return mp_pandas_obj(mp_train_times, molecules, num_threads, **job_kwargs)
def avg_active_signals(signals, num_threads=1, timestamps=None):
    """Average active signals.

    The computation is delegated to ``mp_avg_active_signals`` via
    ``mp_pandas_obj`` and split over ``timestamps``.

    Parameters
    ----------
    signals: pandas object
        Must support ``signals['t1']`` and ``signals.index``.
        NOTE(review): the earlier docs said pd.Series, but the ``'t1'``
        lookup suggests a DataFrame with a ``t1`` column - confirm.
    num_threads: int, default 1
        Forwarded to ``mp_pandas_obj``.
    timestamps: list, optional
        Timestamps used for the output. When omitted, the sorted union of
        the non-null ``signals['t1']`` values and the ``signals.index``
        values is used. Per the earlier documentation, points without any
        active signal are expected to be reported as zero.

    Returns
    -------
    pd.Series
    """
    if timestamps is None:
        # Every point where a signal may start (index) or end (t1).
        end_points = set(signals['t1'].dropna().values)
        start_points = set(signals.index.values)
        timestamps = sorted(end_points.union(start_points))
    return mp_pandas_obj(mp_avg_active_signals, ('molecule', timestamps),
                         num_threads, signals=signals)