import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif as MIC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score


def correlation_filter():
    '''
    1.2 Correlation filtering
    We want to keep features that are meaningfully related to the label. Three common
    criteria are used to judge the relevance between a feature and the label:
    chi-square, the F-test, and mutual information.
    :return:
    '''
    ## Load the handwritten digit recognition dataset
    data = pd.read_csv("./digit recognizor.csv")
    X = data.iloc[:, 1:]
    y = data.iloc[:, 0]
    print(X.shape)
    ## Chi-square filter: compute the chi-square statistic between each non-negative feature and the label, and rank features from highest to lowest statistic.
    # Drop features whose variance is below the median variance across all features; about half of the features remain after filtering.
    X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)
    print(X_fsvar.shape)
    # Suppose we want to keep 300 features here
    X_fschi = SelectKBest(chi2, k=300).fit_transform(X_fsvar, y)
    print(X_fschi.shape)
    print(
        cross_val_score(RFC(n_estimators=10, random_state=0), X_fschi, y,
                        cv=5).mean())
    # Learning curve for the hyperparameter k
    # score = []
    # for i in range(390, 200, -10):
    #     X_fschi = SelectKBest(chi2, k=i).fit_transform(X_fsvar, y)
    #     once = cross_val_score(RFC(n_estimators=10, random_state=0), X_fschi, y, cv=5).mean()
    #     score.append(once)
    # plt.plot(range(390, 200, -10), score)
    # plt.show()  # the curve keeps rising as k grows
    # Determine k from the chi-square statistics and their p-values
    chivalue, pvalues_chi = chi2(X_fsvar, y)
    print(chivalue, pvalues_chi)
    # How large should k be? We want to drop every feature whose p-value exceeds a chosen threshold, such as 0.05 or 0.01:
    k = chivalue.shape[0] - (pvalues_chi > 0.05).sum()
    print(k)  # 392
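    # A minimal sketch (not in the original code): apply the p-value-derived k with
    # SelectKBest(chi2) and sanity-check it with the same cross-validation as above.
    # The variable name X_fschi_k is an assumption for illustration.
    X_fschi_k = SelectKBest(chi2, k=k).fit_transform(X_fsvar, y)
    print(
        cross_val_score(RFC(n_estimators=10, random_state=0), X_fschi_k, y,
                        cv=5).mean())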
    ## F-test, also known as ANOVA (analysis of variance): a filter method that captures the linear relationship between each feature and the label.
    F, pvalues_f = f_classif(X_fsvar, y)
    print(F, pvalues_f)
    k = F.shape[0] - (pvalues_f > 0.05).sum()
    print(k)  # 392
    ## Mutual information: a filter method that captures any kind of relationship (linear or non-linear) between each feature and the label.
    # It returns an estimate of the mutual information between each feature and the target. The estimate is non-negative: 0 means the two variables are independent, and larger values indicate stronger dependence.
    result = MIC(X_fsvar, y)
    k = result.shape[0] - sum(result <= 0)
    print(k)  # 392
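
# Hypothetical entry point (an assumption; the original never calls correlation_filter):
# if __name__ == '__main__':
#     correlation_filter()
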
def aveMI(X, Y):
    # Average estimated mutual information between the features in X and the label Y.
    MI = MIC(X, Y)
    return np.nanmean(MI)
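
# A hypothetical usage note: aveMI summarises how informative a whole feature matrix is
# about the label with a single scalar, which makes it handy for comparing candidate
# feature subsets, e.g. (assuming X_fsvar and y are in scope as above):
# print(aveMI(X_fsvar, y))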
# F-test:
from sklearn.feature_selection import f_classif

F, pvalues_f = f_classif(X_fsvar, y)
k = F.shape[0] - (pvalues_f > 0.05).sum()

X_fsF = SelectKBest(f_classif, k=392).fit_transform(X_fsvar, y)
cross_val_score(RFC(n_estimators=10, random_state=0), X_fsF, y, cv=5).mean()

# In[]:
# Mutual information:
# '''
# computationally expensive
from sklearn.feature_selection import mutual_info_classif as MIC

result = MIC(X_fsvar, y)
k = result.shape[0] - sum(result <= 0)  # 392
# In[]:
X_fsmic = SelectKBest(MIC, k=392).fit_transform(X_fsvar, y)
cross_val_score(RFC(n_estimators=10, random_state=0), X_fsmic, y, cv=5).mean()
# '''

# In[]:
# Embedded method:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as RFC
import numpy as np
import matplotlib.pyplot as plt

# Instantiate the random forest
RFC_ = RFC(n_estimators=10, random_state=0)
from zipfile import ZipFile

file = ZipFile('./digit recognizor.zip')
f = file.open('digit recognizor.csv')
df = pd.read_csv(f)
f.close()
file.close()

df.info()
x = df.iloc[:, 1:]
y = df.iloc[:, 0]
selector = VarianceThreshold(np.median(x.var().values))  # first drop the lower-variance half of the features
result = selector.fit_transform(x)
print(df.shape, result.shape)

tmp = MIC(result, y)
# A mutual information of 0 means the feature is independent of the label; larger values mean stronger dependence.
# A feature that is independent of the target label carries no signal, so such features are dropped.
k = tmp.shape[0] - sum(tmp <= 0)
result2 = SelectKBest(MIC, k=k).fit_transform(result, y)  # filter by mutual information
print(result2.shape)

# Plot the number of selected features against cross-validated accuracy
score = []
r = range(350, 250, -10)
for i in r:
    result2 = SelectKBest(MIC, k=i).fit_transform(result, y)
    score.append(
        cross_val_score(RFC(n_estimators=10, random_state=0),
                        result2, y, cv=5).mean())
plt.plot(r, score)
plt.show()

# In[]:
y_test = test.iloc[:, 1]
print(X_test.shape)

from sklearn import preprocessing

# Fit the scaler on the training features only, then apply that same fitted scaler to
# both the training set and the test set (refitting a second scaler on X_test would
# evaluate the model under a different scaling than it was trained with).
scaler = preprocessing.StandardScaler().fit(X)
X_data_transformed = scaler.transform(X)
X_data_transformed = pd.DataFrame(X_data_transformed, columns=X.columns)
X_data = X_data_transformed
X_test_transformed = scaler.transform(X_test)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=X_test.columns)
X_test = X_test_transformed
#
# ################ MIC ##########
result = MIC(X_data, y_data, random_state=100)
k = result.shape[0] - sum(result <= 0)
Select = SelectKBest(MIC, k=k)
Select.fit(X_data, y_data)
X_new = Select.transform(X_data)
# X_new = SelectKBest(chi2,k=131).fit_transform(X_data, y_data)
# X_data=X_new
print(X_new.shape)

# Keep only the columns selected by SelectKBest (via the boolean support mask) in both sets
X_data = X_data.loc[:, Select.get_support()]
print(X_data.shape)
X_test = X_test.loc[:, Select.get_support()]
print(X_test.shape)
# p-value < 0.05 (or 0.01): reject the null hypothesis (feature X is independent of label Y) and accept the alternative (X and Y are related)

# independentColumnsNo_of_chi = (pValue > 0.05).sum()
# k_best_chi = len(chiValue) - independentColumnsNo_of_chi
# print(f'No. of ignored columns = {independentColumnsNo_of_chi}')
# print(f'Best k_best = {k_best_chi}')

# To determine best k in chi2() , we draw the learning curve
# score_list = []
# k_range = range(390, 150, -10)
# for i in k_range:
# 	X_chi2_plot = SelectKBest(chi2, k=i).fit_transform(X_var_2, Y)
# 	once = cross_val_score(RFC(n_estimators=10, random_state=0), X_chi2_plot, Y.flatten(), cv=5).mean()
# 	score_list.append(once)
# plt.plot(k_range, score_list)
# plt.show()

# F-test
# F , p_value = f_classif(X_var_2,Y.flatten())
# independentColumnsNo_of_F_classif = (p_value>0.05).sum()
# k_best_F = len(F) - independentColumnsNo_of_F_classif
# print(f'independentColumnsNo_of_F_classif = {independentColumnsNo_of_F_classif}')
# print(f'k_best_F = {k_best_F}')

# Mutual information
result = MIC(X_var_2, Y.flatten())
colno = sum(result <= 0)
k_best_mutual = len(result) - colno
print(f'colno={colno}')
print(f'k_best_mutual={k_best_mutual}')
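
# A minimal follow-up sketch (an assumption, not part of the original snippet): apply the
# derived k_best_mutual with SelectKBest so the selection is actually carried out on the
# variance-filtered features X_var_2.
X_mutual = SelectKBest(MIC, k=k_best_mutual).fit_transform(X_var_2, Y.flatten())
print(X_mutual.shape)
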
from sklearn.feature_selection import mutual_info_regression as MIR
from sklearn.model_selection import KFold


def mRmR(X, Y, clf, n):
    """
    Feature subset selection via the ensemble method 'Max-Relevance, Min-Redundancy'.
    Works only for continuous features and categorical labels.

    Params:
        X -> A np.array (2D) representing the feature matrix.
             Each column represents a feature, each row a sample.

        Y -> A np.array (1D) representing the pattern class.

        clf -> Classifier instance used as the wrapper.

        n -> Maximum number of features to select.
    """

    candidate_feature_indices = np.arange(X.shape[-1])
    feature_sets = []

    # Phase 1: Create Sequential Feature Sets [S1, S2, S3, ... Sn] #
    for i in range(n):
        print('Computing Feature Set S%s' % (i + 1))
        # Relevance: mutual information between each remaining candidate feature and the class label
        relevance = MIC(X[:, candidate_feature_indices], Y)
        redundancy = np.zeros(len(relevance))

        # Redundancy: average mutual information between each candidate feature and the
        # features already selected (feature-feature MI, hence mutual_info_regression)
        if feature_sets:
            for k in feature_sets[-1]:
                redundancy += MIR(X[:, candidate_feature_indices], X[:, k])
            redundancy /= len(feature_sets[-1])

        score = relevance - redundancy
        best_feature_index = np.argmax(score)
        if feature_sets:
            feature_sets.append(
                feature_sets[-1] +
                [candidate_feature_indices[best_feature_index]])
        else:
            feature_sets.append(
                [candidate_feature_indices[best_feature_index]])

        candidate_feature_indices = np.delete(candidate_feature_indices,
                                              best_feature_index)

    # Phase 2: Validate Feature Set Performance #
    feature_set_scores = []
    for feature_set in feature_sets:
        kf = KFold(n_splits=5)
        avg_accuracy = 0
        for train_index, test_index in kf.split(X, Y):
            clf.fit(X[train_index][:, feature_set], Y[train_index])
            avg_accuracy += clf.score(X[test_index][:, feature_set],
                                      Y[test_index])
        feature_set_scores.append(avg_accuracy / 5)

    # Phase 3: Find Best Possible Subspace, For The Best Calculated Feature Space Sk #
    best_feature_subset = feature_sets[np.argmax(feature_set_scores)]
    best_subset_score = np.max(feature_set_scores)
    found_better_subset = True

    while found_better_subset and len(best_feature_subset) > 1:
        feature_subsets = [
            best_feature_subset[:k] + best_feature_subset[k + 1:]
            for k in range(len(best_feature_subset))
        ]
        feature_subset_scores = []

        for feature_set in feature_subsets:
            kf = KFold(n_splits=5)
            avg_accuracy = 0
            for train_index, test_index in kf.split(X, Y):
                clf.fit(X[train_index][:, feature_set], Y[train_index])
                avg_accuracy += clf.score(X[test_index][:, feature_set],
                                          Y[test_index])
            feature_subset_scores.append(avg_accuracy / 5)

        if max(feature_subset_scores) > best_subset_score:
            best_subset_score = np.max(feature_subset_scores)
            best_feature_subset = feature_subsets[np.argmax(
                feature_subset_scores)]
        else:
            found_better_subset = False

    return best_feature_subset
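

# A minimal usage sketch for mRmR (synthetic data and hypothetical parameter values;
# not part of the original): select up to 5 features from a generated classification
# problem, using a small random forest as the wrapper classifier.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=15, n_informative=5,
                                     random_state=0)
best_features = mRmR(X_demo, y_demo, RFC(n_estimators=10, random_state=0), n=5)
print(best_features)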