def select_features(pair_data, labels, features): """ 进行特征选择 """ print '特征选择...' # 1. 过滤掉“低方差”的特征列 vt_sel = VarianceThreshold(threshold=(0.85 * (1 - 0.85))) vt_sel.fit(pair_data) # 本次实验中没有需要过滤的特征,在这里只是举例 sel_features1 = features[vt_sel.get_support()] sel_pair_data1 = pair_data[:, vt_sel.get_support()] print '“低方差”过滤掉%d个特征' % (features.shape[0] - sel_features1.shape[0]) # 2. 根据“单变量统计分析”选择特征\ # 保留重要的前90%的特征 sp_sel = SelectPercentile(percentile=95) sp_sel.fit(sel_pair_data1, labels) sel_features2 = sel_features1[sp_sel.get_support()] sel_pair_data2 = sel_pair_data1[:, sp_sel.get_support()] print '“单变量统计分析”过滤掉%d个特征' % (sel_features1.shape[0] - sel_features2.shape[0]) # 根据特征的score绘制柱状图 feat_ser = pd.Series(data=sp_sel.scores_, index=features) sorted_feat_ser = feat_ser.sort_values(ascending=False) plt.figure(figsize=(18, 12)) sorted_feat_ser.plot(kind='bar') plt.savefig('./feat_importance.png') plt.show() return sel_pair_data2, sel_features2
def selected_features(pair_data, labels, features): #过滤掉低方差的特征值 vt_sel = VarianceThreshold(threshold=(0.85 * (1 - 0.85))) vt_sel.fit(pair_data) #本次试验中没有需要过滤的特征,在这里只是举例 print 'vt_sel.get_support()====', vt_sel.get_support() sel_features1 = features[vt_sel.get_support()] sel_pair_data1 = pair_data[:, vt_sel.get_support()] print '低方差过滤掉%d个特征' % (features.shape[0] - sel_features1.shape[0]) print 'features.shape[0]====', features.shape[0], '======', features.shape print 'sel_features1.shape[0]====', sel_features1.shape[ 0], '=========', sel_features1.shape #2 根据 单变量统计分析 选择特证 #保留重要的前90%的特征 sp_sel = SelectPercentile(percentile=95) sp_sel.fit(sel_pair_data1, labels) sel_features2 = sel_features1[sp_sel.get_support()] sel_pair_data2 = sel_pair_data1[:, sp_sel.get_support()] print '单变量统计分析过滤掉%d个特征' % (sel_features1.shape[0] - sel_features2.shape[0]) # 根据特征scroe绘制柱状图 feat_ser = pd.Series(data=sp_sel.scores_, index=features) sort_feat_ser = feat_ser.sort_values(ascending=False) plt.figure(figsize=(18, 12)) sort_feat_ser.plot(kind='bar') plt.savefig('../feat_importance.png') plt.show() return sel_pair_data2, sel_features2
def test_select_percentile_classif():
    """SelectPercentile(f_classif, 25%) must keep exactly the informative
    columns of a synthetic classification problem and agree with
    GenericUnivariateSelect in 'percentile' mode."""
    data, target = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    selector = SelectPercentile(f_classif, percentile=25)
    reduced = selector.fit(data, target).transform(data)

    generic = GenericUnivariateSelect(f_classif, mode='percentile', param=25)
    assert_array_equal(reduced, generic.fit(data, target).transform(data))

    # shuffle=False puts the 3 informative + 2 redundant columns first.
    expected_mask = np.zeros(20)
    expected_mask[:5] = 1
    assert_array_equal(selector.get_support(), expected_mask)
def test_select_percentile_classif():
    """The 25th-percentile f_classif filter must select the first five
    (informative/redundant) features of an unshuffled synthetic
    classification task, matching GenericUnivariateSelect."""
    samples, labels = make_classification(n_samples=200, n_features=20,
                                          n_informative=3, n_redundant=2,
                                          n_repeated=0, n_classes=8,
                                          n_clusters_per_class=1, flip_y=0.0,
                                          class_sep=10, shuffle=False,
                                          random_state=0)

    percentile_filter = SelectPercentile(f_classif, percentile=25)
    selected = percentile_filter.fit(samples, labels).transform(samples)

    equivalent = GenericUnivariateSelect(f_classif, mode="percentile",
                                         param=25)
    selected_via_generic = equivalent.fit(samples, labels).transform(samples)
    assert_array_equal(selected, selected_via_generic)

    # Only the leading 5 columns should survive.
    truth = np.zeros(20)
    truth[:5] = 1
    assert_array_equal(percentile_filter.get_support(), truth)
def select_features(pair_data, labels, features):
    """Two-stage feature selection.

    First drops near-constant ("low variance") columns, then keeps the
    95% highest-scoring remaining columns by univariate analysis.

    Returns:
        (filtered_data, filtered_feature_names).
    """
    # Stage 1: variance filter removes noisy, near-constant columns.
    variance_filter = VarianceThreshold(threshold=(0.9 * (1 - 0.9)))
    variance_filter.fit(pair_data)
    keep = variance_filter.get_support()
    features = features[keep]
    pair_data = pair_data[:, keep]

    # Stage 2: retain the most important 95% of the remaining features.
    percentile_filter = SelectPercentile(percentile=95)
    percentile_filter.fit(pair_data, labels)
    keep = percentile_filter.get_support()
    features = features[keep]
    selected_data = pair_data[:, keep]

    return selected_data, features
def test_select_percentile_regression_full():
    """Requesting the 100th percentile must keep every feature, both via
    SelectPercentile and via GenericUnivariateSelect in percentile mode."""
    data, target = make_regression(n_samples=200, n_features=20,
                                   n_informative=5, shuffle=False,
                                   random_state=0)

    selector = SelectPercentile(f_regression, percentile=100)
    reduced = selector.fit(data, target).transform(data)

    generic = GenericUnivariateSelect(f_regression, mode="percentile",
                                      param=100)
    assert_array_equal(reduced, generic.fit(data, target).transform(data))

    # All 20 columns must be retained.
    assert_array_equal(selector.get_support(), np.ones(20))
def test_select_percentile_regression_full():
    """percentile=100 is a no-op filter: every column of the regression
    problem survives, and both selector APIs agree."""
    samples, targets = make_regression(n_samples=200, n_features=20,
                                       n_informative=5, shuffle=False,
                                       random_state=0)

    percentile_filter = SelectPercentile(f_regression, percentile=100)
    kept = percentile_filter.fit(samples, targets).transform(samples)

    equivalent = GenericUnivariateSelect(f_regression, mode='percentile',
                                         param=100)
    kept_via_generic = equivalent.fit(samples, targets).transform(samples)
    assert_array_equal(kept, kept_via_generic)

    # Support mask is all-ones: nothing was dropped.
    full_mask = np.ones(20)
    assert_array_equal(percentile_filter.get_support(), full_mask)
def test_select_percentile_regression():
    """The 25th-percentile f_regression filter must pick exactly the five
    informative columns, agree with GenericUnivariateSelect, and
    round-trip through inverse_transform."""
    data, target = make_regression(n_samples=200, n_features=20,
                                   n_informative=5, shuffle=False,
                                   random_state=0)

    selector = SelectPercentile(f_regression, percentile=25)
    reduced = selector.fit(data, target).transform(data)

    generic = GenericUnivariateSelect(f_regression, mode="percentile",
                                      param=25)
    assert_array_equal(reduced, generic.fit(data, target).transform(data))

    mask = selector.get_support()
    expected = np.zeros(20)
    expected[:5] = 1
    assert_array_equal(mask, expected)

    # inverse_transform restores kept columns and zero-fills the rest.
    restored = data.copy()
    restored[:, np.logical_not(mask)] = 0
    assert_array_equal(restored, selector.inverse_transform(reduced))
def test_select_percentile_regression():
    """With shuffle=False the five informative columns lead; percentile=25
    must select exactly them, match GenericUnivariateSelect, and
    inverse_transform must zero the dropped columns."""
    samples, targets = make_regression(n_samples=200, n_features=20,
                                       n_informative=5, shuffle=False,
                                       random_state=0)

    percentile_filter = SelectPercentile(f_regression, percentile=25)
    kept = percentile_filter.fit(samples, targets).transform(samples)

    equivalent = GenericUnivariateSelect(f_regression, mode='percentile',
                                         param=25)
    kept_via_generic = equivalent.fit(samples, targets).transform(samples)
    assert_array_equal(kept, kept_via_generic)

    support_mask = percentile_filter.get_support()
    truth = np.zeros(20)
    truth[:5] = 1
    assert_array_equal(support_mask, truth)

    # Zero out the dropped columns by hand and compare with
    # inverse_transform of the reduced matrix.
    zeroed = samples.copy()
    zeroed[:, np.logical_not(support_mask)] = 0
    assert_array_equal(zeroed, percentile_filter.inverse_transform(kept))
from sklearn.externals import joblib #get the data_txt from DB numDimensions = 22 numFolds = 5 X_train = uux_data.getUUXSentences(numDimensions) y_train = uux_data.getUUXSentenceDimension(numDimensions) y_train_binary = MultiLabelBinarizer().fit_transform(y_train) target_names = uux_data.getUUXDimensions(numDimensions) #data_txt preproccessing - tokenization, selecting 90% of the best features vectorizer = TfidfVectorizer(tokenizer=uux_preprocessing.tokenize) X_train_features = vectorizer.fit_transform(X_train) X_train_features_names = vectorizer.fit(X_train).vocabulary_ ch2 = SelectPercentile(chi2, percentile=16) X_train_features = ch2.fit_transform(X_train_features, y_train_binary) selected_features_names = np.asarray(vectorizer.get_feature_names())[ch2.get_support()] print str(len(selected_features_names)) classifier = Pipeline([ ('tfidf', vectorizer), ('chi2', ch2), ('clf', OneVsRestClassifier(LinearSVC()))]) classifier.fit(X_train, y_train_binary) joblib.dump(classifier, 'classifier/uux_classifier.pkl')
# Per-fold precision scores; `numFolds`, `perc`, the fold arrays and
# `results` are defined earlier in the file (outside this view).
p = np.empty([numFolds])
ch2 = SelectPercentile(chi2, percentile=perc)

#perfrom 5folds cross-validation
for i in range(0, numFolds):
    #data_txt preproccessing - tokenization, selecting 90% of the best features
    # NOTE(review): "90%" above does not match percentile=perc — the kept
    # fraction depends on the loop-external `perc`; confirm the comment.
    vectorizer = TfidfVectorizer(tokenizer=uux_preprocessing.tokenize)
    X_train_features = vectorizer.fit_transform(x_train_folds[i])
    # Second fit only to expose the fold's learned vocabulary.
    X_train_features_names = vectorizer.fit(x_train_folds[i]).vocabulary_
    X_train_features = ch2.fit_transform(X_train_features, y_train_folds[i])
    # Feature names that survived the chi2 percentile filter on this fold.
    selected_features_names = np.asarray(
        vectorizer.get_feature_names())[ch2.get_support()]

    # Train the tfidf -> chi2 -> one-vs-rest SVM pipeline on the fold's
    # training split and score precision on its held-out split.
    classifier = Pipeline([('tfidf', vectorizer), ('chi2', ch2),
                           ('clf', OneVsRestClassifier(LinearSVC()))])
    classifier.fit(x_train_folds[i], y_train_folds[i])
    predicted = classifier.predict(x_test_folds[i])
    print metrics.precision_score(y_test_folds[i], predicted)
    p[i] = metrics.precision_score(y_test_folds[i], predicted)

print p
# Append this percentile setting's mean precision to the running results.
results = np.append(results, p.mean())
print "Results"
print results