Beispiel #1
0
def fs_continuous(X, y, method):
    """
    Select features for a continuous target and return their column indices.

    Each supported method name is dispatched to its own selector; no method
    is silently replaced by another one inside this function.

    Parameters
    ----------
    X : 2-D array-like exposing ``.shape`` (rows x columns)
        Feature matrix handed to the selector's ``fit``.
    y : array-like
        Target handed to the selector's ``fit``.
    method : {'Boruta', 'JMI', 'L1', 'FDR'}
        'Boruta' - BorutaPy around a RandomForestRegressor.
        'JMI'    - mifs MutualInformationFeatureSelector (method='JMI',
                   categorical=False).
        'L1'     - SelectFromModel around a LassoCV.
        'FDR'    - SelectFdr with f_regression at alpha = .05.

    Returns
    -------
    1-D array with the indices of the selected columns of ``X``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the four supported names.
    """
    _, p = X.shape
    # One row holding the column indices 0..p-1. Pushing it through a fitted
    # selector's transform() keeps exactly the indices of the chosen columns.
    column_ids = np.arange(p).reshape(1, -1)

    if method == 'Boruta':
        rf = RandomForestRegressor(n_jobs=-1)
        Boruta = boruta.BorutaPy(rf, n_estimators='auto')
        Boruta.fit(X, y)
        selected = np.where(Boruta.support_)[0]
    elif method == 'JMI':
        MIFS = mifs.MutualInformationFeatureSelector(method='JMI',
                                                     categorical=False)
        MIFS.fit(X, y)
        selected = np.where(MIFS.support_)[0]
    elif method == 'L1':
        # `normalize=False` is intentionally not passed: False was the
        # default, and the parameter no longer exists in current
        # scikit-learn releases.
        lasso = LassoCV(n_jobs=-1)
        sfm = SelectFromModel(lasso)
        sfm.fit(X, y)
        selected = sfm.transform(column_ids)[0]
    elif method == 'FDR':
        # alpha is keyword-only in current scikit-learn releases.
        FDR = fs.SelectFdr(fs.f_regression, alpha=.05)
        FDR.fit(X, y)
        selected = FDR.transform(column_ids)[0]
    else:
        # Previously this fell through to `return selected` and failed with
        # an UnboundLocalError that did not mention the offending value.
        raise ValueError(
            "Unknown feature selection method {!r}; expected one of "
            "'Boruta', 'JMI', 'L1', 'FDR'.".format(method))
    return selected
def run_boruta(X, y, p):
    """
    Fit BorutaPy around a random forest and list the selected columns.

    The forest is a RandomForestClassifier with balanced class weights and a
    maximum depth of 5. ``p`` is forwarded unchanged as BorutaPy's ``perc``
    argument, and the selector is seeded with ``random_state=1``.

    Returns a list with the positions of the truthy entries of the fitted
    selector's ``support_``.
    """
    forest = RandomForestClassifier(max_depth=5,
                                    class_weight='balanced',
                                    n_jobs=-1)
    selector = boruta.BorutaPy(forest,
                               perc=p,
                               random_state=1,
                               n_estimators='auto')
    selector.fit(X, y)
    return [idx for idx, keep in enumerate(selector.support_) if keep]
Beispiel #3
0
def run_boruta(X, y, p):
    """
    Fit BorutaPy around a random forest, report the runtime, list the picks.

    The forest is a RandomForestClassifier with balanced class weights and a
    maximum depth of 5. ``p`` is forwarded unchanged as BorutaPy's ``perc``
    argument, and the selector is seeded with ``random_state=1``.

    The elapsed time, in minutes, is printed to stdout once fitting and
    index collection are done.

    Returns a list with the positions of the truthy entries of the fitted
    selector's ``support_``.
    """
    import time

    # perf_counter() is monotonic, so the reported duration cannot be
    # distorted by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    rf = RandomForestClassifier(n_jobs=-1,
                                class_weight='balanced',
                                max_depth=5)
    feat_selector = boruta.BorutaPy(rf,
                                    n_estimators='auto',
                                    random_state=1,
                                    perc=p)
    feat_selector.fit(X, y)
    chosen = [i for i, value in enumerate(feat_selector.support_) if value]
    elapsed_min = (time.perf_counter() - start) / 60
    print('RUNNING BORUTA TIME: {} min'.format(elapsed_min))
    return chosen
Beispiel #4
0
def do_fs(X, y, method):
    """
    Run one feature-selection method and return the selected column indices.

    Parameters
    ----------
    X : 2-D array-like exposing ``.shape`` (rows x columns)
        Feature matrix.
    y : array-like
        Target handed to the selector.
    method : {'fdr', 'l1svc', 'boruta', 'jmi'}
        'fdr'    - SelectFdr with f_classif at alpha = .05.
        'l1svc'  - ``lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)``.
        'boruta' - BorutaPy around a RandomForestClassifier.
        'jmi'    - mifs MutualInformationFeatureSelector (method='JMI').

    Returns
    -------
    The indices of the selected columns. For 'l1svc' this is whatever
    ``recursive_lsvc_cv`` returns.

    Raises
    ------
    ValueError
        If ``method`` is not one of the four supported names.
    """
    _, f = X.shape
    # One row holding the column indices 0..f-1. Pushing it through a fitted
    # selector's transform() keeps exactly the indices of the chosen columns.
    column_ids = np.arange(f).reshape(1, -1)

    if method == "fdr":
        # alpha is keyword-only in current scikit-learn releases.
        fdr = fs.SelectFdr(fs.f_classif, alpha=.05)
        sel = fdr.fit(X, y).transform(column_ids)[0]
    elif method == "l1svc":
        # NOTE(review): the -3, 3, 7 arguments presumably describe the grid
        # of C values searched by the helper - confirm against lsvc_cv.
        sel = lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)
    elif method == "boruta":
        rf = RandomForestClassifier(n_jobs=-1)
        b = boruta.BorutaPy(rf, n_estimators='auto')
        b.fit(X, y)
        sel = np.where(b.support_)[0]
    elif method == "jmi":
        MIFS = mifs.MutualInformationFeatureSelector(method='JMI')
        MIFS.fit(X, y)
        sel = np.where(MIFS.support_)[0]
    else:
        # Previously this fell through to `return sel` and failed with an
        # UnboundLocalError that did not mention the offending value.
        raise ValueError(
            "Unknown feature selection method {!r}; expected one of "
            "'fdr', 'l1svc', 'boruta', 'jmi'.".format(method))
    return sel
Beispiel #5
0
def fs_categorical(X, y, method):
    """
    Select features for a categorical target and return their column indices.

    Parameters
    ----------
    X : 2-D array-like exposing ``.shape`` (rows x columns)
        Feature matrix.
    y : array-like
        Class labels handed to the selector.
    method : {'Boruta', 'JMI', 'L1', 'FDR'}
        'Boruta' - BorutaPy around a RandomForestClassifier.
        'JMI'    - mifs MutualInformationFeatureSelector (method='JMI').
        'L1'     - ``lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)``.
        'FDR'    - SelectFdr with f_classif at alpha = .05.

    Returns
    -------
    The indices of the selected columns. An unrecognised ``method`` is not
    an error: it yields an empty list.
    """
    _, p = X.shape
    # Default result, returned unchanged when `method` matches no branch.
    selected = []
    if method == 'Boruta':
        rf = RandomForestClassifier(n_jobs=-1)
        Boruta = boruta.BorutaPy(rf, n_estimators='auto')
        Boruta.fit(X, y)
        selected = np.where(Boruta.support_)[0]
    elif method == 'JMI':
        MIFS = mifs.MutualInformationFeatureSelector(method='JMI')
        MIFS.fit(X, y)
        selected = np.where(MIFS.support_)[0]
    elif method == 'L1':
        # NOTE(review): the -3, 3, 7 arguments presumably describe the grid
        # of C values searched by the helper - confirm against lsvc_cv.
        selected = lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)
    elif method == 'FDR':
        # alpha is keyword-only in current scikit-learn releases.
        FDR = fs.SelectFdr(fs.f_classif, alpha=.05)
        FDR.fit(X, y)
        # Transforming a row of column indices 0..p-1 keeps exactly the
        # indices of the selected columns.
        selected = FDR.transform(np.arange(p).reshape(1, -1))[0]
    return selected
Beispiel #6
0
def borutabench(best_params_rf, X, y, random_state, prefix=""):
    """
    Fit BorutaPy on (X, y) and save a plot of its support mask as a PDF.

    The wrapped estimator comes from ``model(params=best_params_rf,
    random_state=random_state)``. After fitting, the selector's ``support_``
    is passed (wrapped in a one-element list) to ``plot_class_list``, the
    figure is titled, and it is written to ``PATH / (prefix +
    "freqimpsAR.pdf")``. Nothing is returned.
    """
    estimator = model(params=best_params_rf, random_state=random_state)
    selector = boruta.BorutaPy(estimator)
    selector.fit(X, y)

    plot_class_list([selector.support_])
    plt.title("Frequency of inclusion in AllRel Set (Boruta)")

    out_file = PATH / (prefix + "freqimpsAR.pdf")
    plt.savefig(out_file)
Beispiel #7
0
def do_fs(X, y):
    """
    Run seven feature-selection methods on (X, y) and return all their picks.

    Parameters
    ----------
    X : 2-D array-like exposing ``.shape`` (rows x columns)
        Feature matrix.
    y : array-like
        Class labels handed to every selector.

    Returns
    -------
    tuple
        ``(sel_uni_perc, sel_uni_fdr, sel_rfecv, sel_lsvc, sel_rlr,
        sel_b_rf, sel_jmi)`` - the selected column indices from, in order:
        top-10% univariate selection, FDR-controlled univariate selection,
        cross-validated recursive feature elimination with an L1 LinearSVC,
        ``lsvc_cv.recursive_lsvc_cv``, randomized logistic regression,
        Boruta, and JMI.
    """
    s, f = X.shape
    # One row holding the column indices 0..f-1. Pushing it through a fitted
    # selector's transform() keeps exactly the indices of the chosen columns.
    column_ids = np.arange(f).reshape(1, -1)

    # --------------------------------------------------------------
    # UNIVARIATE FEATURE SELECTION
    # percentile - take the top 10% of features
    # (percentile / alpha are keyword-only in current scikit-learn releases)
    sel_uni_perc = fs.SelectPercentile(
        fs.f_classif, percentile=10).fit(X, y).transform(column_ids)[0]

    # fdr - minimize false discovery rate at alpha = .05
    sel_uni_fdr = fs.SelectFdr(
        fs.f_classif, alpha=.05).fit(X, y).transform(column_ids)[0]

    # --------------------------------------------------------------
    # RFECV
    # do a cross-validated grid search for the optimal C
    cv_num = 3
    gridC = {'C': np.logspace(-6, 3, 10)}
    svc = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=1e-4)
    # The fold count is passed explicitly so the grid search really uses the
    # cv_num folds that the C adjustment below assumes, instead of whatever
    # the installed scikit-learn version defaults to.
    grid_cv = GridSearchCV(svc, gridC, scoring='accuracy', n_jobs=-1,
                           cv=cv_num)
    grid_cv.fit(X, y)

    # set the optimal C
    # adjust for the smaller training sample size, due to cross validation
    # http://scikit-learn.org/stable/auto_examples/svm/plot_svm_scale_c.html
    # NOTE(review): this multiplies C by the number of training samples per
    # fold (s * train_size) - confirm this is the intended scaling.
    train_size = 1 - 1 / float(cv_num)
    adjust_c = float(s * train_size)
    svc.set_params(C=grid_cv.best_params_['C'] * adjust_c)

    # do a stratified 3 fold cross-validated recursive feature elimination,
    # with 1% of the worst features removed each round
    rfecv = fs.RFECV(estimator=svc, step=.01, cv=cv_num, scoring='accuracy')
    rfecv.fit(X, y)
    sel_rfecv = rfecv.transform(column_ids)[0]

    # --------------------------------------------------------------
    # L1 SVC
    # NOTE(review): the -3, 3, 7 arguments presumably describe the grid of C
    # values searched by the helper - confirm against lsvc_cv.
    sel_lsvc = lsvc_cv.recursive_lsvc_cv(X, y, -3, 3, 7)

    # --------------------------------------------------------------
    # STABILITY SELECTION
    # NOTE(review): RandomizedLogisticRegression has been removed from
    # recent scikit-learn releases; this step needs an older scikit-learn
    # or a replacement implementation.
    rlr = RandomizedLogisticRegression(n_resampling=1000,
                                       C=np.logspace(-2, 2, 5),
                                       selection_threshold=0.7,
                                       sample_fraction=0.5)
    sel_rlr = rlr.fit(X, y).transform(column_ids)[0]

    # --------------------------------------------------------------
    # BORUTA
    rf = RandomForestClassifier(n_jobs=-1)
    b = boruta.BorutaPy(rf, n_estimators='auto')
    b.fit(X, y)
    sel_b_rf = np.where(b.support_)[0]

    # --------------------------------------------------------------
    # JMI
    MIFS = mifs.MutualInformationFeatureSelector(method='JMI')
    MIFS.fit(X, y)
    sel_jmi = np.where(MIFS.support_)[0]

    return (sel_uni_perc, sel_uni_fdr, sel_rfecv, sel_lsvc, sel_rlr, sel_b_rf,
            sel_jmi)