def get_user_feature(feature_type, behavior, num_feature=800):
    X_train = get_features(feature_type, behavior)
    index = X_train.index
    # reduce the dimensionality of X
    Y = pd.read_csv('data/train_Y_%d.csv' % behavior, index_col='user_id')['type']
    print 'start selectKbest...'
    # select = SelectKBest(chi2, k=min(num_feature, X_train.shape[1]))
    percent = 0
    if feature_type == 'cat_id':
        percent = 60
    elif feature_type == 'brand_id':
        percent = 15
    elif feature_type == 'seller_id':
        percent = 20
    select = SelectPercentile(f_classif, percentile=percent)
    select.fit(X_train, Y)
    X_train = select.transform(X_train)
    print 'end select...'
    print 'write %s features to train file' % feature_type
    train_feature_file_name = 'data/train_feature_%s_%d.csv' % (feature_type, behavior)
    DataFrame(X_train, index=index).to_csv(train_feature_file_name)
    # apply the same column selection to the corresponding test data
    X_test = get_features(feature_type, behavior, is_train=False)
    index = X_test.index
    X_test = select.transform(X_test)
    # write to file
    print 'write %s features to test file' % feature_type
    test_feature_file_name = 'data/test_feature_%s_%d.csv' % (feature_type, behavior)
    DataFrame(X_test, index=index).to_csv(test_feature_file_name)
    print 'end....'
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
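For reference, the two scores asserted above can be reproduced by hand: sklearn's chi2 treats each column as count data, sums it per class, and compares those observed sums with the sums expected from the class frequencies. A small sketch, not part of the original test suite, that recomputes the values 4.0 and 0.71428571:

import numpy as np

# Hand computation of sklearn's chi2 statistic for the boundary-case data above.
X = np.array([[10, 20], [20, 20], [20, 30]], dtype=float)
y = np.array([1, 0, 0])

class_freq = np.array([np.mean(y == 0), np.mean(y == 1)])            # [2/3, 1/3]
observed = np.array([X[y == 0].sum(axis=0), X[y == 1].sum(axis=0)])  # per-class column sums
expected = np.outer(class_freq, X.sum(axis=0))                       # column totals split by class frequency
scores = ((observed - expected) ** 2 / expected).sum(axis=0)
print(scores)  # approximately [4.0, 0.714]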
def preprocess(words_file="word_data.pkl", authors_file="email_authors.pkl"):
    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "r")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "r")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print "no. of Enrique training emails:", sum(labels_train)
    print "no. of Juan training emails:", len(labels_train) - sum(labels_train)

    return features_train_transformed, features_test_transformed, labels_train, labels_test
def preprocess(article_file, lable_file, k):
    features = pickle.load(open(article_file))
    features = np.array(features)

    # transform non-numerical labels (as long as they are hashable and comparable) to numerical labels
    lables = pickle.load(open(lable_file))
    le = preprocessing.LabelEncoder()
    le.fit(lables)
    lables = le.transform(lables)
    # print le.inverse_transform([0])

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features)

    # selector : SelectPercentile
    selector = SelectPercentile(f_classif, percentile=k)
    selector.fit(features_train_transformed, lables)

    # selector : chi2
    # selector = SelectPercentile(score_func=chi2)
    # selector.fit(features_train_transformed, lables)

    features_train_transformed = selector.transform(features_train_transformed).toarray()

    return features_train_transformed, lables, vectorizer, selector, le, features
def selectFeatures(features, labels, features_list):
    '''
    Select features according to the 20th percentile of the highest scores.
    Return a list of the selected features and a dataframe ranking each
    feature by its p-value.

    features: numpy array with the features to be used to test sklearn models
    labels: numpy array with the real output
    features_list: a list of names of each feature
    '''
    # feature selection
    selector = SelectPercentile(f_classif, percentile=20)
    selector.fit(features, labels)
    features_transformed = selector.transform(features)

    # filter names to be returned
    l_rtn = [x for x, t in zip(features_list, list(selector.get_support())) if t]
    # pd.DataFrame(features_transformed, columns=l_labels2).head()

    # calculate scores
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    df_rtn = pd.DataFrame(pd.Series(dict(zip(features_list, scores))))
    df_rtn.columns = ["pValue_Max"]
    df_rtn = df_rtn.sort_values("pValue_Max", ascending=False)
    # df_rtn["different_from_zero"] = ((df != 0).sum() * 1. / df.shape[0])

    return l_rtn, df_rtn
def select_features(X, y):
    selector = SelectPercentile(f_classif, percentile=10)
    print "fit selector"
    selector.fit(X, y)
    print "transform features"
    X = selector.transform(X)
    return X, selector
def test(X, y):
    ###############################################################################
    # Univariate feature selection with F-test for feature scoring
    # Keep the 20% most significant features
    selector = SelectPercentile(f_regression, percentile=20)
    selector.fit(X, y)
    print [zero_based_index for zero_based_index in list(selector.get_support(indices=True))]
def preprocess(words_file="../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """
    this function takes a pre-made list of email texts (by default word_data.pkl)
    and the corresponding authors (by default email_authors.pkl) and performs
    a number of preprocessing steps:
        -- splits into training/testing sets (10% testing)
        -- vectorizes into tfidf matrix
        -- selects/keeps most helpful features

    after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

    4 objects are returned:
        -- training/testing features
        -- training/testing labels
    """
    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "r")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "r")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    # selector = SelectPercentile(f_classif, percentile=10)
    ## <Temporary hack for Lesson 3>
    selector = SelectPercentile(f_classif, percentile=1)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print "no. of Chris training emails:", sum(labels_train)
    print "no. of Sara training emails:", len(labels_train) - sum(labels_train)

    return features_train_transformed, features_test_transformed, labels_train, labels_test
def selectFeatures(Model, X, y):
    model = Model()
    model.fit(X, y)  # fit before predicting, otherwise sklearn raises NotFittedError
    fsel = SelectPercentile(score_func=f_classif, percentile=5)
    fsel.fit(X, y)
    arr = fsel.get_support()
    print "features: ", np.where(arr == True)
    plt.hist(model.predict(X))
    plt.hist(y)
    plt.show()
def getWeights(self):
    # Univariate feature selection with F-test for feature scoring
    # We use the default selection function: the 10% most significant features
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(self.X, self.y)
    scores = -np.log10(selector.pvalues_)
    scores /= float(scores.max())
    return scores
def selectFeatures(X, y):
    # feature selection with F-test for feature scoring
    # keep the 10% most significant features
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(X, y)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    return selector, scores
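Several snippets here normalize -log10 of the selector's p-values so the strongest feature scores 1.0. A short, hedged usage sketch of the function above; the feature names and the X, y arrays are hypothetical and supplied by the caller:

# Hypothetical usage of selectFeatures(): rank columns by the normalized -log10(p-value) score.
feature_names = ["age", "income", "clicks"]   # illustrative column names only
selector, scores = selectFeatures(X, y)       # X, y provided by the caller
ranked = sorted(zip(feature_names, scores), key=lambda pair: pair[1], reverse=True)
for name, score in ranked:
    print(name, round(score, 3))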
def eval(ds, testNum, p, splitProportion=0.2):
    #testNum=1
    #splitProportion=0.2
    allFeaturesF1 = []
    allFeaturesRecall = []
    allFeaturesPrecision = []
    featureSelctedF1 = []
    featureSelctedRecall = []
    featureSelctedPrecision = []
    for _ in range(testNum):
        tstdata, trndata = ds.splitWithProportion(splitProportion)
        X, Y = labanUtil.fromDStoXY(trndata)
        X_test, Y_test = labanUtil.fromDStoXY(tstdata)
        #localF1s = []
        #localRecalls = []
        #localPercisions = []
        for y, y_test in zip(Y, Y_test):
            if all(v == 0 for v in y):
                continue
            #clf = LinearSVC()#fit_intercept=True, C=p)
            #clf.sparsify()
            #clf = RandomForestClassifier()#criterion='entropy')
            #clf = tree.DecisionTreeClassifier()#max_depth=p)
            clf = AdaBoostClassifier()
            #clf = GradientBoostingClassifier()#, learning_rate=lr)
            #clf = ExtraTreesClassifier(n_estimators=p)
            #svc = LinearSVC()
            #selector = RFE(estimator=svc, n_features_to_select=p*19, step=0.2)
            selector = SelectPercentile(chooser, percentile=p)
            selector.fit(X, y)
            name = str(clf).split()[0].split('(')[0]

            clf.fit(selector.transform(X), y)
            pred = clf.predict(selector.transform(X_test))
            featureSelctedF1.append(metrics.f1_score(y_test, pred))
            featureSelctedRecall.append(metrics.recall_score(y_test, pred))
            featureSelctedPrecision.append(metrics.precision_score(y_test, pred))

            clf.fit(X, y)
            pred = clf.predict(X_test)
            allFeaturesF1.append(metrics.f1_score(y_test, pred))
            allFeaturesRecall.append(metrics.recall_score(y_test, pred))
            allFeaturesPrecision.append(metrics.precision_score(y_test, pred))
    return np.mean(allFeaturesF1), np.mean(featureSelctedF1), \
        np.mean(allFeaturesRecall), np.mean(featureSelctedRecall), \
        np.mean(allFeaturesPrecision), np.mean(featureSelctedPrecision), \
        name
def predict(classifier_type="tree", selection="Univariate", f="1"):
    if f == "1":
        kc_fn = "GS_pickles\kmeans_Genes_87_1x_v3.pkl"
        p = 1
        BIG_C = 0.001
    if f == "2":
        kc_fn = "GS_pickles\kmeans_Genes_433_50x_v2.pkl"
        p = 5
        BIG_C = 0.1
    if f == "3":
        kc_fn = "GS_pickles\kmeans_Genes_2163_20x_v1.pkl"
        p = 25
        BIG_C = 2
    dump_data = False
    kernel_type = "linear"

    (data_matrix, features, samples) = readData()
    x = data_matrix.data
    y = data_matrix.target
    target_names = data_matrix.target_names
    x_indices = np.arange(x.shape[-1])
    (m, n) = x.shape

    test = joblib.load("GS_pickles\imputed_test_data.pkl")
    test_x = np.array(test)
    (i, j) = test_x.shape
    print "Training matrix shape: %s,%s" % (m, n)
    print "Test matrix shape: %s,%s" % (i, j)

    trimmed_x = []
    trimmed_test_x = []
    if selection == "Univariate":
        selector = SelectPercentile(f_classif, percentile=p)
        selector.fit(x, y)
        # Trimming the matrix, now should contain x% of the 8650 features
        trimmed_x = selector.transform(x)
        trimmed_test_x = selector.transform(test_x)
    if selection == "kclusters":
        kcluster_flist = joblib.load(kc_fn)
        trimmed_x = np.take(x, kcluster_flist, axis=1)
        trimmed_test_x = np.take(test_x, kcluster_flist, axis=1)

    n_samples, n_features = trimmed_x.shape
    # Linear SVM classifier
    if classifier_type == "SVM":
        clf = svm.SVC(kernel=kernel_type, degree=3, probability=True)
    # Gaussian Naive Bayes classifier
    if classifier_type == "NB":
        clf = GaussianNB()
    clf.fit(trimmed_x, y)
    result = clf.predict(trimmed_test_x)
    return result
def univariate_feature_selection(dataset, features):
    # load the dataset
    spreadsheet = Spreadsheet('../../Downloads/ip/project data.xlsx')
    data = Data(spreadsheet)
    targets = data.targets

    X = dataset
    y = data.targets

    ###############################################################################
    plt.figure(1)
    plt.clf()
    X_indices = np.arange(X.shape[-1])

    ###############################################################################
    # Univariate feature selection with F-test for feature scoring
    # We use the default selection function: the 10% most significant features
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(X, y)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    plt.bar(X_indices - .45, scores, width=.2,
            label=r'Univariate score ($-Log(p_{value})$)', color='g')

    ###############################################################################
    # Compare to the weights of an SVM
    clf = svm.SVC(kernel='linear')
    clf.fit(X, y)
    svm_weights = (clf.coef_ ** 2).sum(axis=0)
    svm_weights /= svm_weights.max()
    plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight', color='r')

    clf_selected = svm.SVC(kernel='linear')
    clf_selected.fit(selector.transform(X), y)
    svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)
    svm_weights_selected /= svm_weights_selected.max()
    plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected, width=.2,
            label='SVM weights after selection', color='b')

    x = np.arange(0, len(features))
    plt.title("Comparing feature selection")
    plt.xlabel('Feature number')
    plt.xticks(x, features, rotation=45)
    plt.yticks(())
    #plt.axis('tight')
    plt.legend(loc='upper right')
    plt.show()
def preprocess(words_file="../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """
    this function takes a pre-made list of email texts (by default word_data.pkl)
    and the corresponding authors (by default email_authors.pkl) and performs
    a number of preprocessing steps:
        -- splits into training/testing sets (10% testing)
        -- vectorizes into tfidf matrix
        -- selects/keeps most helpful features

    after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

    4 objects are returned:
        -- training/testing features
        -- training/testing labels
    """
    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project

    # read the vector of labels/authors from file (decoded)
    authors_file_handler = open(authors_file, "r")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    # read the vector of documents from file (decoded)
    words_file_handler = open(words_file, "r")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)
    # features_train and features_test are vectors of sentences

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)
    # no fitting on the test set, so the idf is the one calculated on the training set
    # returns a sparse matrix (N*M) where N indexes documents/samples and M gives the tf*idf weight of each feature word in that document

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    # select the top k% best features using univariate statistical tests
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    # keep only the columns chosen by the stats test
    features_test_transformed = selector.transform(features_test_transformed).toarray()
    # do as above

    ### info on the data
    #print "no. of Chris training emails:", sum(labels_train)
    #print "no. of Sara training emails:", len(labels_train)-sum(labels_train)

    return features_train_transformed, features_test_transformed, labels_train, labels_test
def preprocess(words_file="../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """
    this function takes a pre-made list of email texts (by default word_data.pkl)
    and the corresponding authors (by default email_authors.pkl) and performs
    a number of preprocessing steps:
        -- splits into training/testing sets (10% testing)
        -- vectorizes into tfidf matrix
        -- selects/keeps most helpful features

    after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

    4 objects are returned:
        -- training/testing features
        -- training/testing labels
    """
    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "rb")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "rb")
    word_data = pickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print("no. of Chris training emails:", sum(labels_train))
    print("no. of Sara training emails:", len(labels_train) - sum(labels_train))

    return features_train_transformed, features_test_transformed, labels_train, labels_test
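The TF-IDF vectorization followed by SelectPercentile appears in several of these preprocessing helpers. A minimal sketch, assuming a modern scikit-learn install, of the same flow expressed as a Pipeline; the classifier choice and the function name are illustrative, not part of the original project:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

def fit_text_pipeline(word_data, authors):
    # word_data: list of document strings, authors: matching labels (assumed inputs)
    X_train, X_test, y_train, y_test = train_test_split(
        word_data, authors, test_size=0.1, random_state=42)
    pipeline = make_pipeline(
        TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english'),
        SelectPercentile(f_classif, percentile=10),
        MultinomialNB())
    pipeline.fit(X_train, y_train)
    return pipeline, pipeline.score(X_test, y_test)

Keeping the vectorizer and selector inside one Pipeline object avoids having to persist and reload them separately, as some of the snippets below do with joblib.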
def features_weight(X, y, ascending=False):
    '''
    Parameters
    ----------
    X: pd.DataFrame
    y: pd.Series
    '''
    selector = SelectPercentile(f_classif)
    selector.fit(X.values, y.values)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    ans = pd.Series(scores, index=X.columns)
    return ans.sort_values(ascending=ascending)
def select_percentile(self, feature_train, label_train, feature_test):
    """
    parameters:
        feature_train: array of shape [n_samples, n_features]
        feature_test: array of shape [n_samples, n_features]
    returns:
        array of shape [n_samples, n_selected_features],
        array of shape [n_samples, n_selected_features]
    """
    selector = SelectPercentile(percentile=self.__context_manager.percentile)
    selector.fit(feature_train, label_train)
    feature_train, feature_test = selector.transform(feature_train).toarray(), selector.transform(feature_test).toarray()
    return feature_train, feature_test
def select(p, x_train, x_test, y_train, y_test):
    # copy dataframes
    x_train_selected = x_train.copy()
    x_test_selected = x_test.copy()

    # p: percentage of columns to keep
    select = SelectPercentile(percentile=p)
    select.fit(x_train_selected, y_train)
    x_train_selected = select.transform(x_train_selected)
    x_test_selected = select.transform(x_test_selected)

    # train & test
    lr_selected = skl_lm.LogisticRegression()
    lr_selected.fit(x_train_selected, y_train)

    return (lr_selected.score(x_test_selected, y_test), select.get_support())
def selectFeatures(features_train, labels_train, features_test, percentile=10, runInfo=None):
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(features_train, labels_train)
    features_train_transformed = selector.transform(features_train)
    features_test_transformed = selector.transform(features_test)
    if runInfo is not None:
        runInfo["Selected Features:"] = "Perc = {}, Num = {}".format(
            percentile, len(features_train_transformed[0]))
    return features_train_transformed, features_test_transformed
def make_train_test(df_train, df_test):
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(df_train['Phrase'].values)
    Y_train = df_train['Sentiment'].values
    X_test = vectorizer.transform(df_test['Phrase'].values)

    selector = SelectPercentile(f_classif, percentile=50)
    selector.fit(X_train, Y_train)
    features_train_transformed = selector.transform(X_train)
    features_test_transformed = selector.transform(X_test)

    return features_train_transformed, Y_train, features_test_transformed
def fselect_unstat(self, prec=20):
    """Use p-values to exclude variables. prec is the percentage of features selected."""
    from sklearn.feature_selection import SelectPercentile
    select = SelectPercentile(percentile=prec)
    select.fit(self.X1, self.y1)
    self.X1 = select.transform(self.X1)
    self.X2 = select.transform(self.X2)
    if self.X3:
        self.X3 = select.transform(self.X3)
    print("Selected features using univariate model:",
          self.head[select.get_support()], len(self.head[select.get_support()]))
    self.head = self.head[select.get_support()]
def select_best_perc(selected_percentile, features_train, labels_train, features_test):
    '''
    Select features with SelectPercentile
    '''
    select = SelectPercentile(percentile=selected_percentile)
    # Fit data
    select.fit(features_train, labels_train)
    # Get feature scores
    feature_scores = np.array(select.scores_)
    mask = select.get_support()
    # Transform train and test features
    features_train_selected = select.transform(features_train)
    features_test_selected = select.transform(features_test)
    return mask, features_train_selected, features_test_selected, feature_scores
def select_percentile():
    x = [[0.6, 2, 3], [2.5, 4, 6], [3.4, 6.2, 9.4]]
    y = [1, 2, 3]
    print(x)
    selector = SelectPercentile(score_func=f_regression, percentile=100)
    selector.fit(x, y)
    print(selector.scores_)
    print(selector.pvalues_)
    print(selector.get_support(True))
    print(selector.transform(x))
def preprocess(main_file="underwriter.csv", plan_file="plan_name.csv"):  # include pkl file (if csv not working)
    """
    this function takes a pre-made list of plan names (by default underwriter.csv)
    and the corresponding authors (by default plan_name.csv) and performs
    a number of preprocessing steps:
        -- splits into training/testing sets (10% testing)
        -- vectorizes into tfidf matrix
        -- selects/keeps most helpful features

    after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

    4 objects are returned:
        -- training/testing features
        -- training/testing labels
    """
    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    plan_file_handler = open(plan_file, "r")
    plans = pickle.load(plan_file_handler)
    plan_file_handler.close()

    main_file_handler = open(main_file, "r")
    main_data = cPickle.load(main_file_handler)
    main_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        main_data, plans, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    return features_train_transformed, features_test_transformed, labels_train, labels_test
def sel_per(X_train, y_train, X_test, y_test):
    sel_per = SelectPercentile(percentile=50)  # use f_classif (the default) and the 50th percentile
    sel_per.fit(X_train, y_train)
    X_train_selected = sel_per.transform(X_train)  # select X train
    print('X_train_shape: {}'.format(X_train.shape))
    print('X_train_selected.shape: {}'.format(X_train_selected.shape))

    mask = sel_per.get_support()
    X_test_selected = sel_per.transform(X_test)

    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    print('LR score with all features: {:.3f}'.format(lr.score(X_test, y_test)))
    lr.fit(X_train_selected, y_train)
    print('LR score with selected features: {:.3f}'.format(lr.score(X_test_selected, y_test)))
    return mask
def getFeatures(self, number_of_features=10):
    # X = self.training.iloc[:,:,-1]
    y = self.training['TARGET']
    X = self.training.drop(['TARGET'], axis=1)

    # Select features according to the k highest scores.
    # selectFeatures = SelectKBest(chi2, k=number_of_features)

    # Select the best 10 percentile
    # We can use other score functions for selection as well, like chi2
    selectFeatures = SelectPercentile(f_classif, percentile=number_of_features)
    selectFeatures.fit(X, y)
    # X_select = selectFeatures.transform(X)
    features = selectFeatures.get_support(indices=True)
    # print("Best feature: " + features[0])
    return features
def compute_feature_statistics(train_X, train_Y):
    '''
    Univariate Feature Selection - see sklearn:
    http://scikit-learn.org/dev/auto_examples/plot_feature_selection.html#example-plot-feature-selection-py

    Features are not removed, only statistics computed
    '''
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(train_X, train_Y)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    return scores, selector
def prepareTrainingData():
    # Read the training data and cast it to numpy arrays
    labels = np.array(getLabels())
    features = np.array(featureLigands(getLigands(), setAllFeatures()))

    # Scale the training data with MinMaxScaler
    scaler = preprocessing.MinMaxScaler()
    scaled_features = scaler.fit_transform(features)

    # Feature selection with the 10th percentile
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(scaled_features, labels)

    # Save the selector so it can be reused later on INPUT
    joblib.dump(selector, '../selector/selector.pkl')

    selected_features = selector.transform(scaled_features)
    return selected_features, labels
class FeatureSelection:
    """
    Feature selection.
    percentile: the percentage of features to keep
    """
    def __init__(self, percentile=70):
        self.percentile = percentile

    def fit(self, x, y):
        # use the chi-squared test as the scoring function
        self.sepChi = SelectPercentile(score_func=chi2, percentile=self.percentile)
        self.sepChi.fit(x, y)

    def transform(self, x, y):
        return (self.sepChi.transform(x), y)
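A brief, hedged usage sketch of this wrapper; the toy count matrix is made up for illustration (chi2 requires non-negative features):

import numpy as np

# Hypothetical usage of FeatureSelection; the data is illustrative only.
X = np.array([[3, 0, 1, 2],
              [0, 4, 0, 1],
              [2, 0, 3, 0],
              [0, 5, 1, 1]])
y = np.array([0, 1, 0, 1])

fs = FeatureSelection(percentile=50)   # keep the top half of the columns by chi2 score
fs.fit(X, y)
X_selected, y_out = fs.transform(X, y)
print(X_selected.shape)                # (4, 2)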
def preprocess_4(article_file, lable_file):
    # article_file = "pkl/2013_article.pkl"
    # lable_file = "pkl/2013_lable.pkl"
    features = pickle.load(open(article_file))
    features = np.array(features)

    # transform non-numerical labels (as long as they are hashable and comparable) to numerical labels
    lables = pickle.load(open(lable_file))
    le = preprocessing.LabelEncoder()
    le.fit(lables)
    lables = le.transform(lables)

    ### test_size is the percentage of events assigned to the test set (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, lables, test_size=0.1, random_state=42)
    # print features_train.shape
    # print features_test[0]
    # print features_test.shape

    ### text vectorization--go from strings to lists of numbers
    t0 = time()
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)
    # print "features_train_transformed is {}".format(features_train_transformed.shape)
    # print "features_test_transformed is {}".format(features_test_transformed.shape)
    # print "vectorizer time:", round(time()-t0, 3), "s"
    # print len(vectorizer.get_feature_names())

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    t0 = time()
    selector = SelectPercentile(f_classif, percentile=30)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()
    # print "features_train_transformed is {}".format(features_train_transformed.shape)
    # print "features_test_transformed is {}".format(features_test_transformed.shape)
    # print "selector time:", round(time()-t0, 3), "s"
    # print len(vectorizer.get_feature_names())
    # print vectorizer.get_feature_names()[0:-10]
    # print len(selector.scores_)

    return features_train_transformed, features_test_transformed, labels_train, labels_test
def preprocesa(features, labels):
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.1, random_state=42)

    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
    features_train = vectorizer.fit_transform(features_train)
    features_test = vectorizer.transform(features_test)
    joblib.dump(vectorizer, 'vectorizer.pkl')

    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train, labels_train)
    joblib.dump(selector, 'selector.pkl')

    features_test = selector.transform(features_test).toarray()
    features_train = selector.transform(features_train).toarray()

    return features_train, features_test, labels_test, labels_train
def Preprocess(
        words_file="/home/mohamed/python/sherlok-tools/helpers/word_data.pkl",
        labels_file="/home/mohamed/python/sherlok-tools/helpers/label_data.pkl"):
    """
    this function takes a pre-made list of data texts (by default word_data.pkl)
    and the corresponding labels (by default label_data.pkl) and performs
    a number of preprocessing steps:
        -- splits into training/testing sets (10% testing)
        -- vectorizes into tfidf matrix
        -- selects/keeps most helpful features

    after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

    4 objects are returned:
        -- training/testing features
        -- training/testing labels
    """
    ### the words (features) and labels (positive or negative)
    word_data = pickle.load(open(words_file, "r"))
    labels = pickle.load(open(labels_file, "r"))

    ### test_size is the percentage of events assigned to the test set (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, labels, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, encoding='windows-1256')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print "no. of positive training files:", sum(labels_train)
    print "no. of negative training files:", len(labels_train) - sum(labels_train)

    return features_train_transformed, features_test_transformed, labels_train, labels_test
def compute_feature_statistics(train_X, train_Y):
    """
    Univariate Feature Selection - see sklearn:
    http://scikit-learn.org/dev/auto_examples/plot_feature_selection.html#example-plot-feature-selection-py

    Features are not removed, only statistics computed
    """
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(train_X, train_Y)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    return scores, selector
def construct_features(train_X, train_Y, test_X, have_poly=True):
    # find the most important features
    sel = SelectPercentile(f_regression, percentile=20)
    sel.fit(train_X, train_Y)
    sup = sel.get_support()
    sel_idx = np.where(sup == True)[0]
    sel_names = [names[i] for i in sel_idx]
    sel_names = [n for i, n in enumerate(sel_names) if floats[sel_idx[i]] == True]

    # feature construction
    d = {}    # training
    d_t = {}  # testing

    # construct features by combining 2 different features
    for i, n1 in enumerate(sel_names):
        for j, n2 in enumerate(sel_names):
            if i != j:
                new_col = train_X[n1] * train_X[n2]
                new_col_t = test_X[n1] * test_X[n2]
                new_name = n1 + '*' + n2
                d[new_name] = new_col
                d_t[new_name] = new_col_t
    comb_X = pandas.DataFrame(data=d)
    comb_X_t = pandas.DataFrame(data=d_t)

    if have_poly is False:
        new_X = train_X.join(comb_X)
        new_test = test_X.join(comb_X_t)
        return new_X, new_test

    # construct features by making polynomial terms
    float_names = [n for i, n in enumerate(names) if floats[i] == True]
    quad_X = train_X[float_names] ** 2
    quad_X_t = test_X[float_names] ** 2
    quad_X.columns = [n + '^2' for n in float_names]
    quad_X_t.columns = [n + '^2' for n in float_names]
    tri_X = train_X[float_names] ** 3
    tri_X_t = test_X[float_names] ** 3
    tri_X.columns = [n + '^3' for n in float_names]
    tri_X_t.columns = [n + '^3' for n in float_names]
    poly_X = quad_X.join(tri_X)
    poly_X_t = quad_X_t.join(tri_X_t)
    comb_X = comb_X.join(poly_X)
    comb_X_t = comb_X_t.join(poly_X_t)

    new_X = train_X.join(comb_X)
    new_test = test_X.join(comb_X_t)
    return new_X, new_test
def test_select_percentile_4():
    """Ensure that the TPOT select percentile outputs the same result as sklearn select percentile when 0 < percentile < 100"""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']
    training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)
    training_class_vals = training_testing_data.loc[training_testing_data['group'] == 'training', 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        selector = SelectPercentile(f_classif, percentile=42)
        selector.fit(training_features, training_class_vals)
        mask = selector.get_support(True)

    mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

    assert np.array_equal(tpot_obj._select_percentile(training_testing_data, 42), training_testing_data[mask_cols])
def test_select_percentile_float(self):
    model = SelectPercentile()
    X = np.array(
        [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
        dtype=np.float32)
    y = np.array([0, 1, 0, 1])
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, "select percentile",
        [("input", FloatTensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET)
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X, model, model_onnx,
        basename="SklearnSelectPercentile")
def percentile_k_features(df, k=20):
    X = df.drop(['SalePrice'], axis=1)
    y = df['SalePrice']
    select_p = SelectPercentile(f_regression, percentile=k)
    select_p.fit(X, y)

    d = dict()
    for n, s in zip(X.columns, select_p.scores_):
        d[n] = s
    sorted_data = sorted(d.items(), key=lambda kv: kv[1], reverse=True)

    features = [f for (f, v) in sorted_data[:7]]
    print(features)
    return features
def feature_selection_regression(data):
    """This function finds the important features using mutual information
    regression at a given percentile.

    Input:
        data: The dataframe.

    Output:
        Returns the features that fall in the top 30th percentile.
    """
    X = data.drop('How are you feeling right now?', axis=1)
    y = data['How are you feeling right now?']
    select = SelectPercentile(mutual_info_regression, percentile=30)
    select.fit(X, y)
    return X.columns[select.get_support()]
def feature_selection_linear(data):
    """This function finds the important features using mutual information
    regression at a given percentile. It is only for Linear_Regression.ipynb.

    Input:
        data: The dataframe.

    Output:
        Returns the features that fall in the top 30th percentile.
    """
    X = data.drop('On a scale of 1-100, how would you express this feeling?', axis=1)
    y = data['On a scale of 1-100, how would you express this feeling?']
    select = SelectPercentile(mutual_info_regression, percentile=30)
    select.fit(X, y)
    return X.columns[select.get_support()]
def trainingPreprocess(words_file, authors_file):
    """
    this function takes a pre-made list of email texts (by default word_data.pkl)
    and the corresponding authors (by default email_authors.pkl) and performs
    a number of preprocessing steps:
        -- splits into training/testing sets (10% testing)
        -- vectorizes into tfidf matrix
        -- selects/keeps most helpful features

    after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

    6 objects are returned:
        -- training/testing features
        -- training/testing labels
        -- a fitted vectorizer
        -- a fitted selector
    """
    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "r")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "r")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    return features_train_transformed, features_test_transformed, labels_train, labels_test, vectorizer, selector
def main():
    # set the timer
    start = time.time()

    # load the data
    trainX = np.load('trainX.npy')
    testX = np.load('testX.npy')
    trainY = np.load('trainY.npy')
    testY = np.load('testY.npy')
    print('\n!!! Data Loading Completed !!!\n')

    # get the 1st digit zero and plot it
    zero = trainX[14].reshape(28, 28)
    plt.imshow(zero, cmap=cm.Greys_r)
    plt.savefig("original" + str(trainY[14]) + ".png")
    #plt.show()

    # apply kpca
    kpca = KernelPCA(kernel='rbf', gamma=1, fit_inverse_transform=True)
    kpca.fit(trainX[0:3000])
    trainX_kpca = kpca.transform(trainX)
    testX_kpca = kpca.transform(testX)

    # do inverse transform and plot the result
    orig = kpca.inverse_transform(trainX_kpca)
    img = orig[14].reshape(28, 28)
    plt.imshow(img, cmap=cm.Greys_r)
    plt.savefig("reconstructed" + str(trainY[14]) + ".png")
    #plt.show()

    selector = SelectPercentile(f_classif, percentile=5)
    selector.fit(trainX_kpca, trainY)
    trainX = selector.transform(trainX_kpca)
    testX = selector.transform(testX_kpca)

    # fit a classifier
    parameters = {'n_neighbors': list(np.arange(15) + 1)}
    clf = GridSearchCV(KNeighborsClassifier(weights='distance', n_jobs=-1), parameters)
    clf.fit(trainX, trainY)

    pred = clf.predict(testX)
    print accuracy_score(testY, pred)
    print confusion_matrix(testY, pred)
    #print(clf.best_params_)
    print('total : %d, correct : %d, incorrect : %d\n' % (len(pred), np.sum(pred == testY), np.sum(pred != testY)))
    print('Test Time : %f Minutes\n' % ((time.time() - start) / 60))
def make_dataset(list_of_vocabs, emp_name_abs, df_without_outliers, employees_w_email_dir):
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        list_of_vocabs, emp_name_abs, test_size=0.1)

    vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english', max_df=0.5)
    features_train_transformed = vectorizer.fit_transform(X_train)
    features_test_transformed = vectorizer.transform(X_test)
    features_names = np.array(vectorizer.get_feature_names())

    selector = SelectPercentile(f_classif, percentile=0.01)
    selector.fit(features_train_transformed, y_train)
    important_features = selector.get_support(indices=False)
    scores = selector.scores_
    scores = scores[important_features]

    features_train_transformed = selector.transform(features_train_transformed)
    features_test_transformed = selector.transform(features_test_transformed)
    features_train_transformed = features_train_transformed.toarray()
    features_test_transformed = features_test_transformed.toarray()

    features = np.concatenate((features_train_transformed, features_test_transformed))
    labels = np.concatenate((y_train, y_test))

    scaler = preprocessing.MinMaxScaler()
    rescaled_weight = scaler.fit_transform(features)

    features_of_interest = features_names[important_features]
    f_length = len(features_of_interest)
    scores_report = {features_of_interest[i]: scores[i] for i in xrange(f_length)}

    return features, labels, features_of_interest, scores_report
def get_feature_args(self, x, y, percentile=80, k=40):
    if self.feature_selection == 'info':
        info_score = mutual_info_classif(x, y)
        self.features_to_use = np.argwhere(info_score > 0).ravel()
        if len(self.features_to_use) <= 1:
            self.features_to_use = np.argwhere(x.std(axis=0) > 0).ravel()
    elif self.feature_selection == 'percentile':
        selector = SelectPercentile(percentile=percentile)
        selector.fit(x, y)
        self.features_to_use = np.argwhere(selector.get_support()).ravel()
    elif self.feature_selection == 'kbest':
        k = np.min([int(np.ceil(percentile * x.shape[1] / 100)), k])
        selector = SelectKBest(k=k).fit(x, y)
        self.features_to_use = np.argwhere(selector.get_support()).ravel()
    else:
        self.features_to_use = np.argwhere(x.std(axis=0) > 0).ravel()
def test_select_percentile_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the percentile heuristic
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(f_regression, mode='percentile',
                                   param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
    # Check inverse_transform respects dtype
    assert_array_equal(X_2.astype(bool),
                       univariate_filter.inverse_transform(X_r.astype(bool)))
def test_select_percentile_float(self):
    model = SelectPercentile()
    X = np.array([[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]])
    y = np.array([0, 1, 0, 1])
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, 'select percentile',
        [('input', FloatTensorType([1, X.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X, model, model_onnx,
        basename="SklearnSelectPercentile",
        allow_failure="StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.4')")
def test_select_percentile_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(n_samples=200, n_features=20, n_informative=3,
                               n_redundant=2, n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='percentile',
                                   param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def test_select_percentile_classif_sparse():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(n_samples=200, n_features=20, n_informative=3,
                               n_redundant=2, n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)
    X = sparse.csr_matrix(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='percentile',
                                   param=25).fit(X, y).transform(X)
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

    X_r2inv = univariate_filter.inverse_transform(X_r2)
    assert_true(sparse.issparse(X_r2inv))
    support_mask = safe_mask(X_r2inv, support)
    assert_equal(X_r2inv.shape, X.shape)
    assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
    # Check other columns are empty
    assert_equal(X_r2inv.getnnz(), X_r.getnnz())
def automatic_feature_selection_select_percentile(data):
    # print data
    x_train, x_test, y_train, y_test = train_test_split(
        data.drop(['target', 'bi_gram_score', 'type', 'source', 'curID', 'rawClue'], axis=1),
        data.target_label, random_state=0, test_size=0.5)
    select = SelectPercentile(percentile=50)
    select.fit(x_train, y_train)
    x_train_selected = select.transform(x_train)
    for k, v in enumerate(select.get_support()):
        if v == True:
            print x_train.columns[k]
    print x_train_selected.shape
def get_top_chi2_candidate_ngrams(queries, f_extractor, percentile):
    """Get top ngram features according to chi2."""
    ngrams_dict = dict()
    features, labels = construct_examples(queries, f_extractor)
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(features)
    # ch2 = SelectKBest(chi2, k=n_features)
    ch2 = SelectPercentile(chi2, percentile=percentile)
    ch2.fit(X, labels)
    indices = ch2.get_support(indices=True)
    for i in indices:
        ngrams_dict[vec.feature_names_[i]] = 1
    return ngrams_dict
def percentile_k_features(data, k=20):
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    selector = SelectPercentile(f_regression, percentile=k)
    selector.fit(X, y)
    # e.g. ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'FullBath']
    return X.columns[selector.get_support()].tolist()
def _feature_selection(matrix, method="PCA", target=None):
    print("--Selecting features with {} ".format(method))
    target_components = 300

    if method == "PCA":
        from sklearn.decomposition import PCA
        pca = PCA(n_components=target_components)
        reduced_matrix = pca.fit_transform(matrix)

    if method == "SVD":
        from sklearn.decomposition import TruncatedSVD
        lsa = TruncatedSVD(n_components=target_components)
        reduced_matrix = lsa.fit_transform(matrix)

    if method == "SelectKBest":
        if target is None:
            raise Exception("No target found on supervised _feature_selection")
        from sklearn.feature_selection import SelectKBest, chi2
        X, y = matrix, target
        reduced_matrix = SelectKBest(chi2, k=target_components).fit_transform(X, y)

    if method == "LinearSVC":
        if target is None:
            raise Exception("No target found on supervised _feature_selection")
        from sklearn.svm import LinearSVC
        from sklearn.feature_selection import SelectFromModel
        X, y = matrix, target
        lsvc = LinearSVC(C=0.5, penalty="l1", dual=False).fit(X, y)
        model = SelectFromModel(lsvc, prefit=True)
        reduced_matrix = model.transform(X)

    if method == "SelectPercentile":
        from sklearn.feature_selection import SelectPercentile, f_classif
        X, y = matrix, target
        selector = SelectPercentile(f_classif, percentile=10)
        reduced_matrix = selector.fit_transform(X, y)

    return reduced_matrix
class Feature_Selection:
    def __init__(self, n_features, problem_type, scoring):
        valid_scoring = dict()
        if problem_type == 'classification':
            valid_scoring['f_classif'] = f_classif
            valid_scoring['chi2'] = chi2
            valid_scoring['mutual_info_classif'] = mutual_info_classif
        else:
            valid_scoring['f_regression'] = f_regression
            valid_scoring['mutual_info_regression'] = mutual_info_regression

        if scoring not in valid_scoring:
            raise Exception('Invalid Scoring Type')

        if isinstance(n_features, int):
            self.selection = SelectKBest(valid_scoring[scoring], k=n_features)
        elif isinstance(n_features, float):
            self.selection = SelectPercentile(valid_scoring[scoring], percentile=int(100 * n_features))
        else:
            raise Exception('Invalid Type of Feature')

    def fit(self, X, y):
        return self.selection.fit(X, y)

    def transform(self, X):
        return self.selection.transform(X)

    def fit_transform(self, X, y):
        return self.selection.fit_transform(X, y)
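A short, hedged usage sketch of this class, assuming it lives in a module that already imports the sklearn scoring functions it references; the synthetic regression data is for illustration only:

from sklearn.datasets import make_regression

# Hypothetical usage of Feature_Selection on synthetic data.
X, y = make_regression(n_samples=100, n_features=20, n_informative=5, random_state=0)

# A float n_features routes to SelectPercentile: 0.25 -> keep 25% of the 20 columns.
fs = Feature_Selection(n_features=0.25, problem_type='regression', scoring='f_regression')
X_new = fs.fit_transform(X, y)
print(X_new.shape)   # (100, 5)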
def test_select_percentile_classif_sparse():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X = sparse.csr_matrix(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="percentile", param=25).fit(X, y).transform(X)
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

    X_r2inv = univariate_filter.inverse_transform(X_r2)
    assert_true(sparse.issparse(X_r2inv))
    support_mask = safe_mask(X_r2inv, support)
    assert_equal(X_r2inv.shape, X.shape)
    assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
    # Check other columns are empty
    assert_equal(X_r2inv.getnnz(), X_r.getnnz())
def test_select_percentile_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="percentile", param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def test_select_percentile_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the percentile heuristic
    """
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='percentile', param=25).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
    # Check inverse_transform respects dtype
    assert_array_equal(X_2.astype(bool),
                       univariate_filter.inverse_transform(X_r.astype(bool)))
def get_combined_separate_fsets(feature_sets, fs_fn='pct', ptile=10, nFeatures=5, score_fn=f_classif):
    df_lst = []
    for fset_name, df in feature_sets.items():
        X_train = df[df.partition == 'train'].drop(['partition', 'fatality_ind'], axis=1)
        y_train = df[df.partition == 'train'].fatality_ind
        df_X = df.drop(['partition', 'fatality_ind'], axis=1)

        if fs_fn == 'pct':
            featureSelector = SelectPercentile(score_func=score_fn, percentile=ptile)
        else:
            featureSelector = SelectKBest(score_func=score_fn, k=nFeatures)
        featureSelector.fit(X_train, y_train)

        fs = featureSelector.transform(df.drop(['partition', 'fatality_ind'], axis=1))
        cols_fs = df_X.columns[list(featureSelector.get_support(indices=True))]
        cols_fs_ref = [fset_name + ' ' + c for c in cols_fs]
        df_fs = pd.DataFrame(fs, index=df_X.index, columns=cols_fs_ref)
        df_lst.append(df_fs)

    df_comb = df[['partition', 'fatality_ind']].join(pd.concat(df_lst, axis=1))
    return df_comb
def feature_transform(features_train, labels_train, features_test, top_percent=1):
    """ Function to apply the Bag of Words feature creator with TfIdf statistic
        normalisation.
        The input is the train text with its labels, the test text, and an optional
        parameter 'top_percent' giving the percentage of the super high dimensional
        text feature space to keep (default is 1%).
        The output is the transformed train and test feature vectors, suitable for
        use with sklearn classifiers.
    """
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### Feature selection, because text is super high dimensional
    selector = SelectPercentile(f_classif, percentile=top_percent)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    return features_train_transformed, features_test_transformed