Code Example #1
import numpy as np
import pandas as pd
from minepy import MINE


def find_best_n_features_mic(n=8, out_path=''):
    # Compute the MIC of every feature column against the label;
    # x (feature matrix) and y (label vector) are module-level arrays.
    mine = MINE(alpha=0.6, c=15, est="mic_approx")
    mic_all = []
    for i in range(x.shape[1]):
        xi = x[:, i]
        mine.compute_score(xi, y)
        mic_all.append(mine.mic())

    # Find the n features with the largest MIC.
    best_n = []
    best_n_mic = []
    for i in range(n):
        best_position = np.nanargmax(mic_all)
        best_n.append(best_position)
        best_n_mic.append(mic_all[best_position])  # floats are immutable; no copy needed
        mic_all[best_position] = np.nan  # blank the winner so nanargmax finds the next one

    print('Found', n, 'features with the largest MIC, at column positions:')
    print(best_n)
    print()
    print('The MIC values of these features are:')
    print(best_n_mic)
    print()

    best_features = x[:, best_n]
    print('Shape of features selected:', best_features.shape)
    best_features_with_label = pd.DataFrame(
        np.concatenate([best_features, y.reshape(len(y), 1)], axis=1))

    out_path = out_path + 'mic_best_' + str(n) + '.csv'
    best_features_with_label.to_csv(out_path, header=False, index=False)
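
These examples operate on module-level arrays x (the feature matrix) and y (the label vector) rather than taking them as parameters. A minimal setup sketch, assuming a headerless CSV whose last column is the label; the file name data.csv and the output/ directory are placeholders, not names from the original project:

import numpy as np
import pandas as pd

# Hypothetical setup: headerless CSV, last column is the label.
data = pd.read_csv('data.csv', header=None).to_numpy()
x, y = data[:, :-1], data[:, -1]

find_best_n_features_mic(n=8, out_path='output/')  # writes output/mic_best_8.csv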
Code Example #2
import numpy as np
import pandas as pd


def find_best_n_features_relief(n=8, out_path=''):
    # calculate_relief is a project helper that returns one Relief
    # statistic per feature column (a sketch follows this example).
    relief_all = calculate_relief(x, y)

    # Find the n features with the largest Relief statistic.
    best_n = []
    best_n_ref = []
    for i in range(n):
        best_position = np.nanargmax(relief_all)
        best_n.append(best_position)
        best_n_ref.append(relief_all[best_position])  # floats are immutable; no copy needed
        relief_all[best_position] = np.nan  # blank the winner so nanargmax finds the next one

    print('Found', n,
          'features with the largest Relief statistics, at column positions:')
    print(best_n)
    print()
    print('The Relief statistics of these features are:')
    print(best_n_ref)
    print()

    best_features = x[:, best_n]
    print('Shape of features selected:', best_features.shape)
    best_features_with_label = pd.DataFrame(
        np.concatenate([best_features, y.reshape(len(y), 1)], axis=1))

    out_path = out_path + 'relief_best_' + str(n) + '.csv'
    best_features_with_label.to_csv(out_path, header=False, index=False)
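
calculate_relief is not defined in this listing. A minimal sketch of what such a helper could look like, assuming the classic binary Relief statistic of Kira and Rendell; the sampling size m, the L1 distance, and the fixed seed are illustrative choices here, not the project's confirmed implementation:

import numpy as np

def calculate_relief(x, y, m=None):
    # Each sampled instance pulls feature weights up where it differs
    # from its nearest miss (other class) and down where it differs
    # from its nearest hit (same class).
    n_samples, n_features = x.shape
    m = n_samples if m is None else m
    span = x.max(axis=0) - x.min(axis=0)  # scale per-feature diffs to [0, 1]
    span[span == 0] = 1.0                 # guard against constant features
    w = np.zeros(n_features)
    rng = np.random.default_rng(0)        # fixed seed for repeatability
    for idx in rng.choice(n_samples, size=m, replace=False):
        ri, yi = x[idx], y[idx]
        dist = np.abs(x - ri).sum(axis=1)  # L1 distance to every row
        dist[idx] = np.inf                 # exclude the instance itself
        hit = np.argmin(np.where(y == yi, dist, np.inf))
        miss = np.argmin(np.where(y != yi, dist, np.inf))
        w += (np.abs(ri - x[miss]) - np.abs(ri - x[hit])) / (span * m)
    return w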
Code Example #3
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


def find_best_n_features_ridge(n=8, out_path=''):
    # "Ridge" here means L2-regularized logistic regression.
    ridge = LogisticRegression(penalty='l2', class_weight='balanced')

    params_tuned = {'C': [1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 100, 180, 190,
                          200, 220, 240, 250, 260, 300, 500, 1000, 2000]}

    clf = GridSearchCV(ridge, params_tuned, scoring='accuracy', cv=5)
    clf.fit(x, y)
    print('For ridge, the inverse regularization strength C is set to',
          clf.best_params_['C'])
    # GridSearchCV already refits best_estimator_ on the full data
    # (refit=True by default), so no extra fit call is needed.
    # Rank features by absolute coefficient magnitude, working on a copy
    # so the fitted model is left intact.
    coef = np.abs(clf.best_estimator_.coef_)

    best_n = []
    for i in range(n):
        # coef has shape (1, n_features) for binary classification, so
        # the flattened nanargmax index is the column index.
        best_position = np.nanargmax(coef)
        best_n.append(best_position)
        coef[:, best_position] = np.nan  # blank the winner

    print('Selected Features:', best_n)
    best_features = x[:, best_n]
    print('Shape of features selected:', best_features.shape)
    best_features_with_label = pd.DataFrame(np.concatenate([best_features, y.reshape(len(y), 1)], axis=1))

    out_path = out_path + 'ridge_best_' + str(n) + '.csv'
    best_features_with_label.to_csv(out_path, header=False, index=False)
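
The take-the-max-then-blank loop is one way to rank coefficients; an equivalent, more compact idiom (a sketch, with top_n_by_abs_coef as a hypothetical helper name) sorts the absolute values once:

import numpy as np

def top_n_by_abs_coef(coef, n=8):
    # Same result as repeatedly taking nanargmax and blanking the
    # winner: indices of the n largest |coefficients|.
    order = np.argsort(np.abs(np.ravel(coef)))[::-1]
    return list(order[:n])

# e.g. best_n = top_n_by_abs_coef(clf.best_estimator_.coef_, n=8)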
Code Example #4
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression


def find_best_n_features_lasso(n=8, out_path='', max_iter=100):
    def find_proper_c(n, low, up, max_iter=100):
        c_now = (low + up) / 2
        # liblinear supports the L1 penalty in LogisticRegression.
        lasso = LogisticRegression(penalty='l1',
                                   class_weight='balanced',
                                   solver='liblinear',
                                   C=c_now)
        lasso.fit(x, y)
        # Bisect C until roughly n features keep a nonzero coefficient.
        count = 0
        fea_num = np.sum(lasso.coef_ != 0.0)
        # Stop once between n and 1.05 * n features survive; the small
        # slack keeps the bisection from oscillating, extras are trimmed later.
        while count < max_iter and (fea_num > n * 1.05 or fea_num < n):
            if fea_num > n:
                up = (low + up) / 2
            elif fea_num < n:
                low = (low + up) / 2
            c_now = (low + up) / 2
            lasso = LogisticRegression(penalty='l1',
                                       class_weight='balanced',
                                       solver='liblinear',
                                       C=c_now)
            lasso.fit(x, y)
            fea_num = np.sum(lasso.coef_ != 0.0)
            count += 1
        return c_now, count

    low = 0
    up = 100000
    c_now, count = find_proper_c(n, low, up, max_iter)
    retry = 1
    while count == max_iter and retry <= 10:
        print("Didn't find a proper C between", low, 'and', up,
              ', retrying', retry)
        # Search the next decade of C values.
        low = up
        up = up * 10
        c_now, count = find_proper_c(n, low, up, max_iter)
        retry += 1

    if count == max_iter:
        print('Could not find a proper C; please try a lower dimension.')
        return

    lasso = LogisticRegression(penalty='l1', class_weight='balanced',
                               solver='liblinear', C=c_now)
    lasso.fit(x, y)
    best_features = x[:, (lasso.coef_ != 0.0)[0]]
    if best_features.shape[1] > n:
        # The 5% slack may let a few extra features through; keep the n
        # with the largest absolute coefficients.
        coef = np.abs(lasso.coef_)
        best_n = []
        for i in range(n):
            best_position = np.nanargmax(coef)
            best_n.append(best_position)
            coef[:, best_position] = np.nan
        best_features = x[:, best_n]

    print('Shape of features selected:', best_features.shape)
    best_features_with_label = pd.DataFrame(
        np.concatenate([best_features, y.reshape(len(y), 1)], axis=1))

    out_path = out_path + 'lasso_best_' + str(n) + '.csv'
    best_features_with_label.to_csv(out_path, header=False, index=False)
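
A hypothetical driver tying the four selectors together, assuming x and y are loaded as in the setup sketch after Code Example #1 and that the output/ directory already exists:

for selector in (find_best_n_features_mic,
                 find_best_n_features_relief,
                 find_best_n_features_ridge,
                 find_best_n_features_lasso):
    selector(n=8, out_path='output/')  # e.g. output/mic_best_8.csv, etc.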