예제 #1
0
def feat_importance(X, cont, clf=None, n_estimators=1000, n_splits=10, max_samples=1.,
                    num_threads=24, pct_embargo=0., scoring='accuracy',
                    method='SFI', min_w_leaf=0., **kwargs):
    """Fit a classifier and compute feature importance with MDI, MDA or SFI.

    Params
    ------
    X:
        Feature matrix. ``X.columns`` is read for the feature names, so a
        DataFrame-like object is expected.
    cont:
        Container indexed by name; the entries ``'bin'`` (labels), ``'w'``
        (sample weights, ``.values`` is read) and ``'t1'`` are used.
    clf: optional
        Classifier to evaluate. When None, a bagged ensemble of
        entropy-based decision trees is built from ``n_estimators``,
        ``max_samples`` and ``min_w_leaf``.
    n_estimators: int
        Number of bagged estimators for the default classifier.
    n_splits: int
        Number of cross-validation splits.
    max_samples: float
        ``max_samples`` forwarded to the default bagging classifier.
    num_threads: int
        Worker count handed to ``mp_pandas_obj`` (SFI); the default
        classifier uses all cores when this is greater than 1.
    pct_embargo: float
        Embargo fraction forwarded to the cross-validation helpers.
    scoring: str
        Scoring name forwarded to the cross-validation helpers.
    method: str
        One of ``'MDI'``, ``'MDA'`` or ``'SFI'``.
    min_w_leaf: float
        ``min_weight_fraction_leaf`` of the default base tree.
    **kwargs:
        Accepted for call-site compatibility; not used.

    Returns
    -------
    tuple
        ``(imp, oob, oos)``: the importance result of the selected method,
        the fitted classifier's ``oob_score_`` (None when it has none) and
        the out-of-sample cross-validation score.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Fail fast: previously an unknown method only surfaced as an
    # UnboundLocalError at the return statement, after the costly fit.
    if method not in ('MDI', 'MDA', 'SFI'):
        raise ValueError(
            "method must be one of 'MDI', 'MDA' or 'SFI', got {!r}".format(method))

    n_jobs = -1 if num_threads > 1 else 1
    # Build classifiers
    if clf is None:
        # NOTE(review): recent scikit-learn releases renamed the
        # `base_estimator` argument to `estimator` -- confirm the pinned version.
        base_clf = DecisionTreeClassifier(criterion='entropy', max_features=1,
                                          class_weight='balanced',
                                          min_weight_fraction_leaf=min_w_leaf)
        clf = BaggingClassifier(base_estimator=base_clf, n_estimators=n_estimators,
                                max_features=1., max_samples=max_samples,
                                oob_score=True, n_jobs=n_jobs)
    fit_clf = clf.fit(X, cont['bin'], sample_weight=cont['w'].values)
    # A user-supplied classifier may not expose an out-of-bag score.
    oob = getattr(fit_clf, 'oob_score_', None)

    if method == 'MDI':
        imp = feat_imp_MDI(fit_clf, feat_names=X.columns)
        oos = cv_score(clf, X=X, y=cont['bin'], n_splits=n_splits,
                       sample_weight=cont['w'], t1=cont['t1'],
                       pct_embargo=pct_embargo, scoring=scoring).mean()
    elif method == 'MDA':
        imp, oos = feat_imp_MDA(clf, X=X, y=cont['bin'], n_splits=n_splits,
                                sample_weight=cont['w'], t1=cont['t1'],
                                pct_embargo=pct_embargo, scoring=scoring)
    else:  # method == 'SFI'
        cv_gen = PurgedKFold(n_splits=n_splits, t1=cont['t1'], pct_embargo=pct_embargo)
        # Average the per-fold scores so `oos` has the same form as in the
        # MDI branch instead of being the raw cv_score result.
        oos = cv_score(clf, X=X, y=cont['bin'], sample_weight=cont['w'],
                       scoring=scoring, cv_gen=cv_gen).mean()
        # Presumably avoids nested parallelism: the per-feature jobs below are
        # already spread over `num_threads` workers by mp_pandas_obj.
        clf.n_jobs = 1
        imp = mp_pandas_obj(aux_feat_imp_SFI, ('feat_names', X.columns),
                            num_threads, clf=clf, X=X, cont=cont,
                            scoring=scoring, cv_gen=cv_gen)
    return imp, oob, oos
예제 #2
0
def get_sample_w(t1, num_co_events, close, num_threads=1):
    """
    Snippet 4.10 (page 69) Determination Of Sample Weight By Absolute Return Attribution

    Dispatches ``mp_sample_w`` through ``mp_pandas_obj``, splitting
    ``t1.index`` into 'molecule' chunks.

    :param t1: object whose ``.index`` is partitioned across workers; also
        forwarded to ``mp_sample_w``
    :param num_co_events: forwarded unchanged to ``mp_sample_w``
    :param close: forwarded unchanged to ``mp_sample_w``
    :param num_threads: worker count handed to ``mp_pandas_obj``
    :return: the combined result returned by ``mp_pandas_obj``
    """
    worker_kwargs = dict(t1=t1, num_co_events=num_co_events, close=close)
    return mp_pandas_obj(mp_sample_w, ('molecule', t1.index),
                         num_threads=num_threads, **worker_kwargs)
예제 #3
0
def get_sample_tw(t1, num_co_events, num_threads=1):
    """
    Compute sample weights by running ``mp_sample_tw`` through ``mp_pandas_obj``.

    ``t1.index`` is split into 'molecule' chunks that are processed by the
    workers.

    :param t1: object whose ``.index`` is partitioned across workers; also
        forwarded to ``mp_sample_tw``
    :param num_co_events: forwarded unchanged to ``mp_sample_tw``
    :param num_threads: worker count handed to ``mp_pandas_obj``
    :return: the combined result returned by ``mp_pandas_obj``
    """
    worker_kwargs = dict(t1=t1, num_co_events=num_co_events)
    return mp_pandas_obj(mp_sample_tw, ('molecule', t1.index),
                         num_threads=num_threads, **worker_kwargs)
예제 #4
0
def get_train_times(train_times, test_times, num_threads=1):
    """Sample train points that do not overlap with the test period.

    The work is delegated to ``mp_train_times`` via ``mp_pandas_obj``, with
    ``train_times.index`` split into 'molecule' chunks.

    Params
    ------
    train_times: pd.Series
        Training points: index holds the start time, values the end time
    test_times: pd.Series
        Testing points: index holds the start time, values the end time
    num_threads: int, default 1
        The number of threads used for multiprocessing

    Returns
    -------
    pd.Series
    """
    molecules = ('molecule', train_times.index)
    sampled = mp_pandas_obj(mp_train_times, molecules, num_threads,
                            train_times=train_times, test_times=test_times)
    return sampled
예제 #5
0
def avg_active_signals(signals, num_threads=1, timestamps=None):
    """Average the active signals at each timestamp.

    Parameters
    ----------
    signals:
        Signal table; ``signals['t1']`` and ``signals.index`` are read when
        ``timestamps`` is not given, and the object is forwarded to
        ``mp_avg_active_signals``.
    num_threads: int, default 1
        Worker count handed to ``mp_pandas_obj``.
    timestamps: list, optional
        Timestamps used for the output. According to the original
        documentation, points without an active signal get the value zero.
        If not specified, the sorted union of the non-null ``signals['t1']``
        values and the ``signals.index`` values is used.

    Return
    ------
    pd.Series
    """
    if timestamps is None:
        # Every point where a signal may start (index) or stop (t1).
        end_points = set(signals['t1'].dropna().values)
        start_points = set(signals.index.values)
        timestamps = sorted(end_points | start_points)
    return mp_pandas_obj(mp_avg_active_signals, ('molecule', timestamps),
                         num_threads, signals=signals)