def select_feature(trainfilename, testfilename):
    """Dump 20 progressively larger feature subsets scored by ``ttest``.

    The train/test pair is loaded together, ``ttest`` is evaluated once on
    the training split, and for ``i = 1..20`` a ``SelectKBest`` selector with
    ``k = i * (n_features // 20)`` is fitted on the training data and applied
    to both splits.  Each reduced split is written next to its input as
    ``<filename>_<k>`` with one-based feature indices.

    Note: when there are fewer than 20 features the step is 0, so every
    iteration requests ``k = 0``.

    Args:
        trainfilename: Path of the training file in svmlight format.
        testfilename: Path of the test file in svmlight format.
    """
    X_train, y_train, X_test, y_test = load_svmlight_files(
        (trainfilename, testfilename), multilabel=True)
    featureNum = X_train.get_shape()[1]

    # Score the features once; every selector below reuses this result.
    tvalue = ttest(X_train, y_train)

    def returnTtest(X, y):
        # SelectKBest calls its score function as f(X, y) during fit; the
        # arguments are ignored on purpose and the cached scores returned.
        return tvalue

    # Floor division keeps ``k`` an integer under both Python 2 and true
    # division (``/`` would hand SelectKBest a float there).
    step = featureNum // 20
    for i in range(1, 21):
        selectNum = step * i
        # Parenthesised single-argument form parses on Python 2 and 3 and
        # prints the same text as the original print statement.
        print("selecting %s features" % selectNum)
        selector = SelectKBest(returnTtest, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new = selector.transform(X_test)
        # NOTE(review): labels are loaded with multilabel=True but dumped
        # without a multilabel flag -- confirm the installed scikit-learn
        # version accepts that.
        dump_svmlight_file(X_train_new, y_train,
                           trainfilename + '_' + str(selectNum),
                           zero_based=False)
        dump_svmlight_file(X_test_new, y_test,
                           testfilename + '_' + str(selectNum),
                           zero_based=False)
def select_feature(trainfilename, testfilename):
    """Dump 20 progressively larger feature subsets scored by ``chi2``.

    The train/test pair is loaded together, ``chi2`` is evaluated once on the
    training split, and for ``i = 1..20`` a ``SelectKBest`` selector with
    ``k = i * (n_features // 20)`` is fitted on the training data and applied
    to both splits.  Each reduced split is written next to its input as
    ``<filename>_<k>`` with one-based feature indices.

    Note: when there are fewer than 20 features the step is 0, so every
    iteration requests ``k = 0``.

    Args:
        trainfilename: Path of the training file in svmlight format.
        testfilename: Path of the test file in svmlight format.
    """
    X_train, y_train, X_test, y_test = load_svmlight_files(
        (trainfilename, testfilename), multilabel=True)
    featureNum = X_train.get_shape()[1]

    # chi2 depends only on the training data, so compute it once instead of
    # once per selector (the original computed this value but never used it).
    chivalue = chi2(X_train, y_train)

    def returnCHI(X, y):
        # SelectKBest calls its score function as f(X, y) during fit; the
        # arguments are ignored on purpose and the cached result returned.
        return chivalue

    # Floor division keeps ``k`` an integer under both Python 2 and true
    # division (``/`` would hand SelectKBest a float there).
    step = featureNum // 20
    for i in range(1, 21):
        selectNum = step * i
        # Parenthesised single-argument form parses on Python 2 and 3 and
        # prints the same text as the original print statement.
        print("selecting %s features" % selectNum)
        selector = SelectKBest(returnCHI, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new = selector.transform(X_test)
        # NOTE(review): labels are loaded with multilabel=True but dumped
        # without a multilabel flag -- confirm the installed scikit-learn
        # version accepts that.
        dump_svmlight_file(X_train_new, y_train,
                           trainfilename + '_' + str(selectNum),
                           zero_based=False)
        dump_svmlight_file(X_test_new, y_test,
                           testfilename + '_' + str(selectNum),
                           zero_based=False)
def select_feature_multilabel(trainfilename, testfilename):
    """Dump 20 progressively larger feature subsets scored by ``randomValues``.

    The train/test pair is loaded with ``multilabel=True``, ``randomValues``
    is evaluated once on the training split, and for ``i = 1..20`` a
    ``SelectKBest`` selector with ``k = i * (n_features // 20)`` is fitted on
    the training data and applied to both splits.  Each reduced split is
    written next to its input as ``<filename>_<k>`` with one-based feature
    indices.

    Note: when there are fewer than 20 features the step is 0, so every
    iteration requests ``k = 0``.

    Args:
        trainfilename: Path of the training file in svmlight format.
        testfilename: Path of the test file in svmlight format.
    """
    X_train, y_train, X_test, y_test = load_svmlight_files(
        (trainfilename, testfilename), multilabel=True)
    featureNum = X_train.shape[1]

    # Score the features once; every selector below reuses this result.
    randval = randomValues(X_train, y_train)
    # Placeholder second element of the (scores, pvalues) pair: a column of
    # integer ones, already shaped (n_features, 1), so no reshape is needed.
    p = np.ones((featureNum, 1), int)

    def returnIG(X, y):
        # SelectKBest calls its score function as f(X, y) during fit; the
        # arguments are ignored on purpose and the cached pair returned.
        return randval, p

    # Floor division keeps ``k`` an integer under both Python 2 and true
    # division (``/`` would hand SelectKBest a float there).
    step = featureNum // 20
    for i in range(1, 21):
        selectNum = step * i
        # Parenthesised single-argument form parses on Python 2 and 3 and
        # prints the same text as the original print statement.
        print("selecting %s features" % selectNum)
        selector = SelectKBest(returnIG, k=selectNum)
        X_train_new = selector.fit_transform(X_train, y_train)
        X_test_new = selector.transform(X_test)
        # NOTE(review): labels are loaded with multilabel=True but dumped
        # without a multilabel flag -- confirm the installed scikit-learn
        # version accepts that.
        dump_svmlight_file(X_train_new, y_train,
                           trainfilename + '_' + str(selectNum),
                           zero_based=False)
        dump_svmlight_file(X_test_new, y_test,
                           testfilename + '_' + str(selectNum),
                           zero_based=False)