Ejemplo n.º 1
0
def select_features(pair_data, labels, features):
    """
    Run a two-stage feature selection and plot the univariate scores.

    Stage 1 fits a ``VarianceThreshold`` and drops the columns it rejects.
    Stage 2 fits a ``SelectPercentile(percentile=95)`` on the remaining
    columns and drops the ones it rejects. The stage-2 scores are drawn
    as a bar chart, saved to ``./feat_importance.png`` and then shown.

    Parameters
    ----------
    pair_data
        2-D data matrix, indexed here as ``pair_data[:, mask]``.
    labels
        Targets passed to ``SelectPercentile.fit``.
    features
        Feature names, one per column of ``pair_data``. Must support
        boolean-mask indexing and ``.shape`` (presumably a NumPy array;
        confirm against the caller).

    Returns
    -------
    tuple
        ``(sel_pair_data2, sel_features2)``: the data matrix and the
        feature names restricted to the columns that survived both stages.
    """
    print('特征选择...')

    # 1. Drop "low variance" feature columns.
    # 0.85 * (1 - 0.85) is the variance of a Bernoulli variable with p = 0.85.
    vt_sel = VarianceThreshold(threshold=(0.85 * (1 - 0.85)))
    vt_sel.fit(pair_data)
    vt_mask = vt_sel.get_support()

    # In the original experiment no feature was filtered out at this stage;
    # the step is kept as an illustration.
    sel_features1 = features[vt_mask]
    sel_pair_data1 = pair_data[:, vt_mask]
    print('“低方差”过滤掉%d个特征' % (features.shape[0] - sel_features1.shape[0]))

    # 2. Select features by univariate statistical analysis,
    # keeping the top 95% of the remaining features.
    sp_sel = SelectPercentile(percentile=95)
    sp_sel.fit(sel_pair_data1, labels)
    sp_mask = sp_sel.get_support()

    sel_features2 = sel_features1[sp_mask]
    sel_pair_data2 = sel_pair_data1[:, sp_mask]
    print('“单变量统计分析”过滤掉%d个特征' % (sel_features1.shape[0] -
                                 sel_features2.shape[0]))

    # Bar chart of the per-feature scores. sp_sel was fitted on the
    # variance-filtered matrix, so its scores line up with sel_features1,
    # not with the unfiltered `features`; using `features` as the index
    # fails with a length mismatch whenever stage 1 removes a column.
    feat_ser = pd.Series(data=sp_sel.scores_, index=sel_features1)
    sorted_feat_ser = feat_ser.sort_values(ascending=False)
    plt.figure(figsize=(18, 12))
    sorted_feat_ser.plot(kind='bar')
    plt.savefig('./feat_importance.png')
    plt.show()

    return sel_pair_data2, sel_features2
Ejemplo n.º 2
0
def selected_features(pair_data, labels, features):
    """
    Run a two-stage feature selection and plot the univariate scores.

    Stage 1 fits a ``VarianceThreshold`` and drops the columns it rejects.
    Stage 2 fits a ``SelectPercentile(percentile=95)`` on the remaining
    columns and drops the ones it rejects. Debug information about the
    masks and shapes is printed along the way. The stage-2 scores are
    drawn as a bar chart, saved to ``../feat_importance.png`` and shown.

    Parameters
    ----------
    pair_data
        2-D data matrix, indexed here as ``pair_data[:, mask]``.
    labels
        Targets passed to ``SelectPercentile.fit``.
    features
        Feature names, one per column of ``pair_data``. Must support
        boolean-mask indexing and ``.shape`` (presumably a NumPy array;
        confirm against the caller).

    Returns
    -------
    tuple
        ``(sel_pair_data2, sel_features2)``: the data matrix and the
        feature names restricted to the columns that survived both stages.
    """
    # 1. Drop low-variance feature columns.
    # 0.85 * (1 - 0.85) is the variance of a Bernoulli variable with p = 0.85.
    vt_sel = VarianceThreshold(threshold=(0.85 * (1 - 0.85)))
    vt_sel.fit(pair_data)
    vt_mask = vt_sel.get_support()

    # In the original experiment no feature was filtered out at this stage;
    # the step is kept as an illustration.
    print('vt_sel.get_support()==== {}'.format(vt_mask))
    sel_features1 = features[vt_mask]

    sel_pair_data1 = pair_data[:, vt_mask]
    print('低方差过滤掉%d个特征' % (features.shape[0] - sel_features1.shape[0]))

    print('features.shape[0]==== {} ====== {}'.format(
        features.shape[0], features.shape))
    print('sel_features1.shape[0]==== {} ========= {}'.format(
        sel_features1.shape[0], sel_features1.shape))

    # 2. Select features by univariate statistical analysis,
    # keeping the top 95% of the remaining features.
    sp_sel = SelectPercentile(percentile=95)
    sp_sel.fit(sel_pair_data1, labels)
    sp_mask = sp_sel.get_support()

    sel_features2 = sel_features1[sp_mask]

    sel_pair_data2 = sel_pair_data1[:, sp_mask]
    print('单变量统计分析过滤掉%d个特征' % (sel_features1.shape[0] - sel_features2.shape[0]))

    # Bar chart of the per-feature scores. sp_sel was fitted on the
    # variance-filtered matrix, so its scores line up with sel_features1,
    # not with the unfiltered `features`; using `features` as the index
    # fails with a length mismatch whenever stage 1 removes a column.
    feat_ser = pd.Series(data=sp_sel.scores_, index=sel_features1)
    sort_feat_ser = feat_ser.sort_values(ascending=False)
    plt.figure(figsize=(18, 12))
    sort_feat_ser.plot(kind='bar')
    plt.savefig('../feat_importance.png')
    plt.show()
    return sel_pair_data2, sel_features2
Ejemplo n.º 3
0
def test_select_percentile_classif():
    """
    Check percentile-based univariate selection on a classification task.

    Verifies that ``SelectPercentile`` and ``GenericUnivariateSelect`` in
    'percentile' mode give the same transformed data, and that the
    support mask marks exactly the first five of the twenty columns.
    """
    dataset_options = dict(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X, Y = make_classification(**dataset_options)

    # Dedicated percentile selector.
    selector = SelectPercentile(f_classif, percentile=25)
    selector.fit(X, Y)
    reduced = selector.transform(X)

    # Same selection expressed through the generic selector.
    generic = GenericUnivariateSelect(f_classif, mode='percentile', param=25)
    generic.fit(X, Y)
    reduced_generic = generic.transform(X)

    assert_array_equal(reduced, reduced_generic)

    # Expected mask: the first 5 columns kept (3 informative + 2 redundant
    # were requested from the generator), the other 15 dropped.
    mask = selector.get_support()
    expected_mask = np.array([1.0] * 5 + [0.0] * 15)
    assert_array_equal(mask, expected_mask)
Ejemplo n.º 4
0
def test_select_percentile_classif():
    """
    Percentile heuristic on a simple classification problem.

    The 25th-percentile ``SelectPercentile`` filter must agree with the
    equivalent ``GenericUnivariateSelect`` configuration, and its support
    mask must be 1 for the first five columns and 0 for the remaining
    fifteen.
    """
    X, Y = make_classification(n_samples=200, n_features=20, n_informative=3,
                               n_redundant=2, n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    pct_filter = SelectPercentile(f_classif, percentile=25)
    fitted_pct_filter = pct_filter.fit(X, Y)
    X_pct = fitted_pct_filter.transform(X)

    generic_filter = GenericUnivariateSelect(f_classif, mode="percentile", param=25)
    fitted_generic_filter = generic_filter.fit(X, Y)
    X_generic = fitted_generic_filter.transform(X)

    # Both selectors must keep the same columns.
    assert_array_equal(X_pct, X_generic)

    # Ground truth: ones for the first 5 features, zeros for the other 15.
    chosen = pct_filter.get_support()
    ground_truth = np.concatenate([np.ones(5), np.zeros(15)])
    assert_array_equal(chosen, ground_truth)
Ejemplo n.º 5
0
def select_features(pair_data, labels, features):
    """
    Filter the feature columns in two stages.

    First the columns rejected by a ``VarianceThreshold`` are dropped,
    then the columns rejected by ``SelectPercentile(percentile=95)``
    fitted on the remaining data.

    Parameters
    ----------
    pair_data
        2-D data matrix, indexed here as ``pair_data[:, mask]``.
    labels
        Targets passed to ``SelectPercentile.fit``.
    features
        Feature names, one per column of ``pair_data``; must support
        boolean-mask indexing (presumably a NumPy array; confirm against
        the caller).

    Returns
    -------
    tuple
        ``(data, names)`` restricted to the columns kept by both stages.
    """
    # Stage 1: remove low-variance ("noise") columns.
    variance_filter = VarianceThreshold(threshold=(0.9*(1-0.9)))
    variance_filter.fit(pair_data)
    variance_mask = variance_filter.get_support()
    names_after_variance = features[variance_mask]
    data_after_variance = pair_data[:, variance_mask]

    # Stage 2: keep the top 95% of the remaining features by score.
    percentile_filter = SelectPercentile(percentile=95)
    percentile_filter.fit(data_after_variance, labels)
    percentile_mask = percentile_filter.get_support()
    final_names = names_after_variance[percentile_mask]
    final_data = data_after_variance[:, percentile_mask]

    return final_data, final_names
Ejemplo n.º 6
0
def test_select_percentile_regression_full():
    """
    Check that asking for the 100th percentile keeps every feature.

    ``SelectPercentile`` and ``GenericUnivariateSelect`` in 'percentile'
    mode must give the same transformed data, and the support mask must
    be all ones across the twenty columns.
    """
    X, Y = make_regression(
        n_samples=200,
        n_features=20,
        n_informative=5,
        shuffle=False,
        random_state=0,
    )

    selector = SelectPercentile(f_regression, percentile=100)
    selector.fit(X, Y)
    reduced = selector.transform(X)

    generic = GenericUnivariateSelect(f_regression, mode="percentile", param=100)
    generic.fit(X, Y)
    reduced_generic = generic.transform(X)

    assert_array_equal(reduced, reduced_generic)

    # Every one of the 20 columns is expected to be selected.
    mask = selector.get_support()
    all_selected = np.ones((20,))
    assert_array_equal(mask, all_selected)
Ejemplo n.º 7
0
def test_select_percentile_regression_full():
    """
    Requesting '100%' must select all features.

    Also checks that the generic selector configured for the same
    percentile returns the same transformed matrix.
    """
    regression_options = dict(n_samples=200, n_features=20, n_informative=5,
                              shuffle=False, random_state=0)
    X, Y = make_regression(**regression_options)

    pct_filter = SelectPercentile(f_regression, percentile=100)
    fitted_pct_filter = pct_filter.fit(X, Y)
    X_pct = fitted_pct_filter.transform(X)

    generic_filter = GenericUnivariateSelect(
        f_regression, mode='percentile', param=100)
    fitted_generic_filter = generic_filter.fit(X, Y)
    X_generic = fitted_generic_filter.transform(X)

    assert_array_equal(X_pct, X_generic)

    # Ground truth: a mask of ones over all 20 features.
    chosen = pct_filter.get_support()
    ground_truth = np.array([1.0] * 20)
    assert_array_equal(chosen, ground_truth)
Ejemplo n.º 8
0
def test_select_percentile_regression():
    """
    Check percentile-based univariate selection on a regression task.

    Verifies that ``SelectPercentile`` and ``GenericUnivariateSelect`` in
    'percentile' mode give the same transformed data, that the support
    mask marks exactly the first five of the twenty columns, and that
    ``inverse_transform`` gives back the input with the unselected
    columns set to zero.
    """
    X, Y = make_regression(
        n_samples=200,
        n_features=20,
        n_informative=5,
        shuffle=False,
        random_state=0,
    )

    selector = SelectPercentile(f_regression, percentile=25)
    selector.fit(X, Y)
    reduced = selector.transform(X)

    generic = GenericUnivariateSelect(f_regression, mode="percentile", param=25)
    generic.fit(X, Y)
    reduced_generic = generic.transform(X)

    assert_array_equal(reduced, reduced_generic)

    # Expected mask: first 5 columns kept, remaining 15 dropped.
    support = selector.get_support()
    expected_mask = np.array([1.0] * 5 + [0.0] * 15)
    assert_array_equal(support, expected_mask)

    # Build the expected inverse transform: a copy of X with every
    # unselected column zeroed.
    dropped_columns = np.logical_not(support)
    X_zeroed = X.copy()
    X_zeroed[:, dropped_columns] = 0
    restored = selector.inverse_transform(reduced)
    assert_array_equal(X_zeroed, restored)
Ejemplo n.º 9
0
def test_select_percentile_regression():
    """
    Percentile heuristic on a simple regression problem.

    The 25th-percentile ``SelectPercentile`` filter must agree with the
    equivalent ``GenericUnivariateSelect`` configuration, must flag the
    first five columns only, and its ``inverse_transform`` must equal the
    input with the unselected columns zeroed.
    """
    regression_options = dict(n_samples=200, n_features=20, n_informative=5,
                              shuffle=False, random_state=0)
    X, Y = make_regression(**regression_options)

    pct_filter = SelectPercentile(f_regression, percentile=25)
    fitted_pct_filter = pct_filter.fit(X, Y)
    X_pct = fitted_pct_filter.transform(X)

    generic_filter = GenericUnivariateSelect(
        f_regression, mode='percentile', param=25)
    fitted_generic_filter = generic_filter.fit(X, Y)
    X_generic = fitted_generic_filter.transform(X)

    assert_array_equal(X_pct, X_generic)

    # Ground truth: ones for the first 5 features, zeros for the other 15.
    chosen = pct_filter.get_support()
    ground_truth = np.concatenate([np.ones(5), np.zeros(15)])
    assert_array_equal(chosen, ground_truth)

    # Zero out the columns that were not selected and compare against the
    # selector's own inverse transform.
    not_chosen = np.logical_not(chosen)
    X_expected = X.copy()
    X_expected[:, not_chosen] = 0
    assert_array_equal(X_expected, pct_filter.inverse_transform(X_pct))