def select_features(X, y):
    from sklearn.feature_selection import SelectPercentile
    from sklearn.feature_selection import f_classif, chi2
    from sklearn.preprocessing import Binarizer, scale

    # First select features based on chi2 and f_classif
    p = 3

    X_bin = Binarizer().fit_transform(scale(X))
    selectChi2 = SelectPercentile(chi2, percentile=p).fit(X_bin, y)
    selectF_classif = SelectPercentile(f_classif, percentile=p).fit(X, y)

    chi2_selected = selectChi2.get_support()
    chi2_selected_features = [f for i, f in enumerate(X.columns) if chi2_selected[i]]
    print('Chi2 selected {} features {}.'.format(chi2_selected.sum(), chi2_selected_features))
    f_classif_selected = selectF_classif.get_support()
    f_classif_selected_features = [f for i, f in enumerate(X.columns) if f_classif_selected[i]]
    print('F_classif selected {} features {}.'.format(f_classif_selected.sum(), f_classif_selected_features))
    selected = chi2_selected & f_classif_selected
    print('Chi2 & F_classif selected {} features'.format(selected.sum()))
    features = [f for f, s in zip(X.columns, selected) if s]
    print(features)
    return features
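# A minimal, self-contained sketch of the intersection pattern above, on
# synthetic data (all names here are illustrative, not from the snippet):
# chi2 requires non-negative inputs, which is why the scaled matrix is
# binarized before chi2 scoring, while f_classif runs on the raw features.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectPercentile, chi2, f_classif
from sklearn.preprocessing import Binarizer, scale

X_arr, y_demo = make_classification(n_samples=100, n_features=10, random_state=0)
X_df = pd.DataFrame(X_arr, columns=['f%d' % i for i in range(10)])

X_bin = Binarizer().fit_transform(scale(X_df))  # non-negative input for chi2
chi2_mask = SelectPercentile(chi2, percentile=30).fit(X_bin, y_demo).get_support()
f_mask = SelectPercentile(f_classif, percentile=30).fit(X_df, y_demo).get_support()

# keep only the features both tests agree on
both = [f for f, keep in zip(X_df.columns, chi2_mask & f_mask) if keep]
print(both)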
def univariant_feature_selection(self, method, X, y, percentile):
    test = SelectPercentile(method, percentile=percentile).fit(X, y)
    support = test.get_support()
    print("The number of features selected by", method, "is:", support.sum())
    for i in range(len(self.X_train.columns)):
        if support[i]:
            print(self.X_train.columns[i])
    return support
def main():
    parser = argparse.ArgumentParser(description='Feature Selection')
    required = parser.add_argument_group('required options')

    required.add_argument('-x', '--scaledfeaturelist', required=True, help='File containing feature values')
    required.add_argument('-y', '--targetdata', required=True, help='File containing target data')
    required.add_argument('-z', '--fetpercentile', required=True, type=int, help='Percentile to select highest scoring percentage of features')

    args = parser.parse_args()

    X = np.loadtxt(args.scaledfeaturelist)
    Y = np.genfromtxt(args.targetdata, dtype='str')

    #result = SelectPercentile(f_classif, percentile=args.fetpercentile).fit_transform(X,Y)
    sel = SelectPercentile(f_classif, percentile=args.fetpercentile)
    result = sel.fit_transform(X, Y)

    # selecting features for test programs
    if os.path.isfile('variancefeatures.txt'):
        varianceFeature = np.genfromtxt("variancefeatures.txt", dtype='str')
        featureFromSelectPercentile = sel.get_support(indices=True)
        featureFileforSelectPercentile = open("featuresToTestPrograms", "w")
        for i in featureFromSelectPercentile:
            featureFileforSelectPercentile.write(varianceFeature[i])
            featureFileforSelectPercentile.write("\n")
        featureFileforSelectPercentile.close()
        # remove variancefeatures.txt as we don't need it anymore
        os.remove('variancefeatures.txt')

    np.savetxt('featurelist', result, fmt='%.2f', delimiter='\t')
def test_select_percentile_regression(): """ Test whether the relative univariate feature selection gets the correct items in a simple regression problem with the percentile heuristic """ X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0) univariate_filter = SelectPercentile(f_regression, percentile=25) X_r = univariate_filter.fit(X, y).transform(X) assert_best_scores_kept(univariate_filter) X_r2 = GenericUnivariateSelect( f_regression, mode='percentile', param=25).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.zeros(20) gtruth[:5] = 1 assert_array_equal(support, gtruth) X_2 = X.copy() X_2[:, np.logical_not(support)] = 0 assert_array_equal(X_2, univariate_filter.inverse_transform(X_r)) # Check inverse_transform respects dtype assert_array_equal(X_2.astype(bool), univariate_filter.inverse_transform(X_r.astype(bool)))
def selectFeatures(features, labels, features_list):
    '''
    Select features according to the 20th percentile of the highest scores.
    Return a list of features selected and a dataframe showing the ranking
    of each feature related to their p values
    features: numpy array with the features to be used to test sklearn models
    labels: numpy array with the real output
    features_list: a list of names of each feature
    '''
    #feature selection
    selector = SelectPercentile(f_classif, percentile=20)
    selector.fit(features, labels)
    features_transformed = selector.transform(features)
    #filter names to be returned
    l_rtn = [x for x, t in zip(features_list, list(selector.get_support())) if t]
    # pd.DataFrame(features_transformed, columns = l_labels2).head()
    #calculate scores
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    df_rtn = pd.DataFrame(pd.Series(dict(zip(features_list, scores))))
    df_rtn.columns = ["pValue_Max"]
    df_rtn = df_rtn.sort_values("pValue_Max", ascending=False)
    # df_rtn["different_from_zero"]=((df!=0).sum()*1./df.shape[0])
    return l_rtn, df_rtn
def test_select_percentile_classif(): # Test whether the relative univariate feature selection # gets the correct items in a simple classification problem # with the percentile heuristic X, y = make_classification( n_samples=200, n_features=20, n_informative=3, n_redundant=2, n_repeated=0, n_classes=8, n_clusters_per_class=1, flip_y=0.0, class_sep=10, shuffle=False, random_state=0, ) univariate_filter = SelectPercentile(f_classif, percentile=25) X_r = univariate_filter.fit(X, y).transform(X) X_r2 = GenericUnivariateSelect(f_classif, mode="percentile", param=25).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.zeros(20) gtruth[:5] = 1 assert_array_equal(support, gtruth)
def test_select_percentile_classif_sparse(): # Test whether the relative univariate feature selection # gets the correct items in a simple classification problem # with the percentile heuristic X, y = make_classification( n_samples=200, n_features=20, n_informative=3, n_redundant=2, n_repeated=0, n_classes=8, n_clusters_per_class=1, flip_y=0.0, class_sep=10, shuffle=False, random_state=0, ) X = sparse.csr_matrix(X) univariate_filter = SelectPercentile(f_classif, percentile=25) X_r = univariate_filter.fit(X, y).transform(X) X_r2 = GenericUnivariateSelect(f_classif, mode="percentile", param=25).fit(X, y).transform(X) assert_array_equal(X_r.toarray(), X_r2.toarray()) support = univariate_filter.get_support() gtruth = np.zeros(20) gtruth[:5] = 1 assert_array_equal(support, gtruth) X_r2inv = univariate_filter.inverse_transform(X_r2) assert_true(sparse.issparse(X_r2inv)) support_mask = safe_mask(X_r2inv, support) assert_equal(X_r2inv.shape, X.shape) assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray()) # Check other columns are empty assert_equal(X_r2inv.getnnz(), X_r.getnnz())
def test_boundary_case_ch2(): # Test boundary case, and always aim to select 1 feature. X = np.array([[10, 20], [20, 20], [20, 30]]) y = np.array([[1], [0], [0]]) scores, pvalues = chi2(X, y) assert_array_almost_equal(scores, np.array([4.0, 0.71428571])) assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472])) filter_fdr = SelectFdr(chi2, alpha=0.1) filter_fdr.fit(X, y) support_fdr = filter_fdr.get_support() assert_array_equal(support_fdr, np.array([True, False])) filter_kbest = SelectKBest(chi2, k=1) filter_kbest.fit(X, y) support_kbest = filter_kbest.get_support() assert_array_equal(support_kbest, np.array([True, False])) filter_percentile = SelectPercentile(chi2, percentile=50) filter_percentile.fit(X, y) support_percentile = filter_percentile.get_support() assert_array_equal(support_percentile, np.array([True, False])) filter_fpr = SelectFpr(chi2, alpha=0.1) filter_fpr.fit(X, y) support_fpr = filter_fpr.get_support() assert_array_equal(support_fpr, np.array([True, False])) filter_fwe = SelectFwe(chi2, alpha=0.1) filter_fwe.fit(X, y) support_fwe = filter_fwe.get_support() assert_array_equal(support_fwe, np.array([True, False]))
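# Hedged side note: the expected scores in the boundary-case test above can
# be reproduced by hand. sklearn's chi2 compares the per-class feature sums
# against the sums expected from the class frequencies alone.
import numpy as np

X = np.array([[10, 20], [20, 20], [20, 30]], dtype=float)
y = np.array([1, 0, 0])

observed = np.array([X[y == 0].sum(axis=0), X[y == 1].sum(axis=0)])
class_prob = np.array([2.0 / 3, 1.0 / 3]).reshape(-1, 1)  # P(y=0), P(y=1)
expected = class_prob * X.sum(axis=0)
chi2_scores = ((observed - expected) ** 2 / expected).sum(axis=0)
print(chi2_scores)  # -> [4.0, 0.71428571], matching the assertion above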
def test(X, y):
    ###############################################################################
    # Univariate feature selection with F-test for feature scoring
    # We keep the 20% most significant features
    selector = SelectPercentile(f_regression, percentile=20)
    selector.fit(X, y)
    print([zero_based_index for zero_based_index in list(selector.get_support(indices=True))])
def feature_selection(self, mode='F'):
    print 'Feature Selection...'
    print 'Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    X = self.train.copy()
    y = self.train_label['label'].values.copy()
    test = self.test.copy()

    if mode.upper() == 'M':
        mi = mutual_info_classif(X.values, y)
    elif mode.upper() == 'F':
        F, pval = f_classif(X.values, y)
    elif mode.upper() == 'C':
        chi, pval = chi2(X.values, y)

    features = self.train.columns.copy()
    fs_features = features.copy().tolist()

    if mode.upper() == 'M':
        fs_V = mi.copy().tolist()
    elif mode.upper() == 'F':
        fs_V = F.copy().tolist()
    elif mode.upper() == 'C':
        fs_V = chi.copy().tolist()

    if mode.upper() == 'M':
        selector = SelectPercentile(mutual_info_classif, percentile=80)
    elif mode.upper() == 'F':
        selector = SelectPercentile(f_classif, percentile=80)
    elif mode.upper() == 'C':
        selector = SelectPercentile(chi2, percentile=80)

    X_new = selector.fit_transform(X, y)
    selected = selector.get_support()
    for i in xrange(len(features)):
        if not selected[i]:
            fs_features.remove(features[i])

    fs_V = np.array(fs_V)
    fs_features = np.array(fs_features)
    self.train = pd.DataFrame(X_new, columns=fs_features.tolist())
    self.test = test[fs_features]
    self.fs_features = fs_features
    feas = pd.DataFrame()
    feas['feature'] = fs_features

    print 'End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return X_new, feas
def main(path, filename):
    #batchs = ['histogramaByN','histogramaColor','patrones2x2ByN','patrones3x3ByN','patronesCirculaesByN_2_5','patronesCirculaesByN_2_9','patronesCirculaesByN_3_9','patronesCirculaesByN_5_9','patronesCirculaesByN_3_5','patronesCirculaesByN_6_12','patronesCirculaesByN_8_12','patronesCirculaesByN_10_12']
    batchs = ['patronesCirculaesByN_2_5','patronesCirculaesByN_2_9','patronesCirculaesByN_3_9','patronesCirculaesByN_5_9','patronesCirculaesByN_3_5','patronesCirculaesByN_6_12','patronesCirculaesByN_8_12','patronesCirculaesByN_10_12']
    #batchs = ['histogramaByN','histogramaColor','patrones2x2ByN','patronesCircularesByN_2_5','patronesCircularesByN_2_9','patronesCircularesByN_3_9','patronesCircularesByN_5_9','patronesCircularesByN_3_5']
    #batchs = ['patrones2x2ByN','patrones3x3ByN','patronesCirculaesByN_2_5','patronesCirculaesByN_3_5']
    percentil = 20
    X = []
    y = []
    lens = []
    load_batch(y, path, 'clases', filename)
    y = [j for i in y for j in i]
    for batch in batchs:
        load_batch(X, path, batch, filename)
        lens.append(len(X[0]))
    total = [lens[0]]
    for i in xrange(1, len(lens)):
        total.append(lens[i] - lens[i-1])
    print 'Number of attributes per batch'
    print total
    sp = SelectPercentile(chi2, percentil)
    X_new = sp.fit_transform(X, y)
    sup = sp.get_support(True)
    #print sup
    res = [0] * len(batchs)
    for i in sup:
        for j in xrange(0, len(lens)):
            if i <= lens[j]:
                res[j] += 1
                break
    porcentajes = []
    for i in xrange(0, len(lens)):
        porcentajes.append((1.0 * res[i]) / total[i])
    print 'Number of variables selected in the univariate ' + str(percentil) + ' percentile'
    print res
    print 'Percentage of variables selected in the univariate ' + str(percentil) + ' percentile'
    print porcentajes
    clf = ExtraTreesClassifier()
    clf = clf.fit(X, y)
    fi = clf.feature_importances_
    res2 = [0] * len(batchs)
    for i in xrange(0, len(fi)):
        for j in xrange(0, len(lens)):
            if i <= lens[j]:
                res2[j] += fi[i]
                break
    print 'Cumulative importance per batch from the multivariate selection'
    print res2
    porcentajes2 = []
    for i in xrange(0, len(lens)):
        porcentajes2.append((1.0 * res2[i]) / total[i])
    print 'Average importance per variable from the multivariate selection'
    print porcentajes2
def train_type_model(): globals.read_configuration('config.cfg') parser = globals.get_parser() scorer_globals.init() datasets = ["webquestions_split_train", ] parameters = translator.TranslatorParameters() parameters.require_relation_match = False parameters.restrict_answer_type = False feature_extractor = FeatureExtractor(False, False, n_gram_types_features=True) features = [] labels = [] for dataset in datasets: queries = get_evaluated_queries(dataset, True, parameters) for index, query in enumerate(queries): tokens = [token.lemma for token in parser.parse(query.utterance).tokens] n_grams = get_grams_feats(tokens) answer_entities = [mid for answer in query.target_result for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)] correct_notable_types = set(filter(lambda x: x, [KBEntity.get_notable_type(entity_mid) for entity_mid in answer_entities])) other_notable_types = set() for candidate in query.eval_candidates: entities = [mid for entity_name in candidate.prediction for mid in KBEntity.get_entityid_by_name(entity_name, keep_most_triples=True)] other_notable_types.update(set([KBEntity.get_notable_type(entity_mid) for entity_mid in entities])) incorrect_notable_types = other_notable_types.difference(correct_notable_types) for type in correct_notable_types.union(incorrect_notable_types): if type in correct_notable_types: labels.append(1) else: labels.append(0) features.append(feature_extractor.extract_ngram_features(n_grams, [type, ], "type")) with open("type_model_data.pickle", 'wb') as out: pickle.dump((features, labels), out) label_encoder = LabelEncoder() labels = label_encoder.fit_transform(labels) vec = DictVectorizer(sparse=True) X = vec.fit_transform(features) feature_selector = SelectPercentile(chi2, percentile=5).fit(X, labels) vec.restrict(feature_selector.get_support()) X = feature_selector.transform(X) type_scorer = SGDClassifier(loss='log', class_weight='auto', n_iter=1000, alpha=1.0, random_state=999, verbose=5) type_scorer.fit(X, labels) with open("type-model.pickle", 'wb') as out: pickle.dump((vec, type_scorer), out)
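# Hedged sketch of the DictVectorizer.restrict(...) pattern used above, on
# toy data (names illustrative). restrict() drops the unselected features
# from the vectorizer itself, so later vec.transform(...) calls stay aligned
# with the matrix produced by the fitted selector.
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectPercentile, chi2

docs = [{'a': 1, 'b': 2}, {'b': 3, 'c': 1}, {'a': 2, 'c': 4}]
labels = [0, 1, 1]

vec = DictVectorizer(sparse=True)
X = vec.fit_transform(docs)
sel = SelectPercentile(chi2, percentile=50).fit(X, labels)

vec.restrict(sel.get_support())
X_reduced = sel.transform(X)
print(vec.feature_names_, X_reduced.shape)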
def selectFeatures(Model, X, y):
    model = Model()
    fsel = SelectPercentile(score_func=f_classif, percentile=5)
    fsel.fit(X, y)
    arr = fsel.get_support()
    print "features: ", np.where(arr)[0]
    # fit the model so predict() below works
    model.fit(X, y)
    plt.hist(model.predict(X))
    plt.hist(y)
    plt.show()
def featureSelection(reduced_features,labels,clnd_features,percentile,n_components,results=False):
    '''
    Parameters:
        reduced_features = Unique feature names in a python list after dropping
            non-numeric features.
        labels = ground truth labels for the data points.
        clnd_features = data point features in numpy array format corresponding
            to the labels.
        percentile = the percentile of features to keep with SelectPercentile;
            an integer between 0 and 100.
        n_components = the n_components for the pca.
        results = False returns a python list of the selected features. If True,
            returns the metrics of the feature selectors (F-statistic and
            p-values from f_classif) and the explained variance of the top 'n'
            principal components.

    Output:
       Resulting list of features from the SelectPercentile function and the
       number of principal components used. If results = True, the statistics
       of the SelectPercentile method using f_classif are printed. In addition
       the explained variance of the top 'n' principal components is printed.
    '''
    from sklearn.feature_selection import SelectPercentile, f_classif
    from sklearn.decomposition import PCA
    from itertools import compress

    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit_transform(clnd_features, labels)

    pca = PCA(n_components = n_components)
    pca.fit_transform(clnd_features, labels)

    if results == True:
        f_stat = sorted(zip(reduced_features[1:],f_classif(clnd_features,labels)[0]),\
                 key = lambda x: x[1], reverse=True)
        p_vals = sorted(zip(reduced_features[1:],f_classif(clnd_features,labels)[1]),\
                 key = lambda x: x[1])
        expl_var = pca.explained_variance_ratio_
        return f_stat,p_vals,expl_var
    else:
        ## return a boolean index of the retained features
        retained_features = selector.get_support()
        ## index the original features by the boolean index of top x% features
        ## return a python list of the features to be used for training
        features_list = list(compress(reduced_features[1:],retained_features))
        ## add back in the 'poi' to the first position in the final features list
        features_list.insert(0,'poi')
        return features_list
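# Hedged sketch (not part of the function above): if the goal is to feed a
# model both the top-percentile features and the PCA components, sklearn's
# FeatureUnion can combine the two transforms into a single estimator.
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.pipeline import FeatureUnion

X, y = make_classification(n_samples=100, n_features=20, random_state=0)
union = FeatureUnion([('pct', SelectPercentile(f_classif, percentile=10)),
                      ('pca', PCA(n_components=3))])
X_combined = union.fit_transform(X, y)
print(X_combined.shape)  # e.g. 2 selected columns + 3 components -> (100, 5)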
def univariate_feature_selection(dataset, features): # load the dataset spreadsheet = Spreadsheet('../../Downloads/ip/project data.xlsx') data = Data(spreadsheet) targets = data.targets X = dataset y = data.targets ############################################################################### plt.figure(1) plt.clf() X_indices = np.arange(X.shape[-1]) ############################################################################### # Univariate feature selection with F-test for feature scoring # We use the default selection function: the 10% most significant features selector = SelectPercentile(f_classif, percentile=10) selector.fit(X, y) scores = -np.log10(selector.pvalues_) scores /= scores.max() plt.bar(X_indices - .45, scores, width=.2, label=r'Univariate score ($-Log(p_{value})$)', color='g') ############################################################################### # Compare to the weights of an SVM clf = svm.SVC(kernel='linear') clf.fit(X, y) svm_weights = (clf.coef_ ** 2).sum(axis=0) svm_weights /= svm_weights.max() plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight', color='r') clf_selected = svm.SVC(kernel='linear') clf_selected.fit(selector.transform(X), y) svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0) svm_weights_selected /= svm_weights_selected.max() plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected, width=.2, label='SVM weights after selection', color='b') x = np.arange(0, len(features)) plt.title("Comparing feature selection") plt.xlabel('Feature number') plt.xticks(x, features, rotation=45) plt.yticks(()) #plt.axis('tight') plt.legend(loc='upper right') plt.show()
def PredictionScore (X_train,X_test,y_train,y_test,header): outFile = open('output.txt', 'a') from sklearn.svm import SVC from sklearn.feature_extraction import DictVectorizer vec = DictVectorizer() names = ["Linear SVM","Nearest Neighbors", "RBF SVM", "Decision Tree", "Random Forest", "AdaBoost", "Naive Bayes"] # names = ["Linear SVM","Linear SVM","Linear SVM","Linear SVM"] classifiers = [ SVC(kernel="linear", C=0.025), KNeighborsClassifier(3), SVC(gamma=2, C=1), DecisionTreeClassifier(max_depth=5), RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), AdaBoostClassifier(), GaussianNB()] # classifiers = [ # SVC(kernel="linear", C=0.025), # SVC(kernel="linear", C=0.02), # SVC(kernel="linear", C=0.01) # ] for name, clf in zip(names, classifiers): try: accuracy = 0.0 vec = DictVectorizer() fit = vec.fit(X_train) select = SelectPercentile(score_func=chi2,percentile=10).fit(fit.transform(X_train),y_train) fit.restrict (select.get_support()) X_train_counts = fit.transform(X_train) X_test_counts = fit.transform(X_test) # clf = SVC(kernel="linear", C=0.025) try: clf.fit(X_train_counts.toarray(), y_train) #predict = clf.predict(X_test_counts.toarray()) accuracy += clf.score(X_test_counts.toarray(),y_test) # coef = clf._get_coef() # print(np.argsort(coef)[-20:]) #for i in range(0,len(X_test)): #print (X_test[i]['ID']+"\t"+y_test[i]+"\t"+predict[i]) except BaseException as b: print (b) print (name+"\t"+"\t"+str(accuracy)) outFile.write(name+"\t"+"\t"+str(accuracy)+"\n") except BaseException as b: print (b) outFile.close()
def fTestFeatureSelection(train_files, train_labels, test_files, test_labels): design_matrix, features, _ = vectorizeTrain(train_files, None, 0, False, 0, None) classifier = LogisticRegression() for p in range(10): percentile = 100-p*10 print 'Selecting {0}% of features'.format(percentile) feat_sel = SelectPercentile(f_regression, percentile) X_sel = feat_sel.fit_transform(design_matrix, train_labels) f_inds = feat_sel.get_support(indices=True) print 'Using {0} features'.format(len(f_inds)) classifier.fit(X_sel, train_labels) test(test_files, test_labels, classifier, [features[d] for d in f_inds], None, 0, False, 0, None)
def test_select_percentile_regression_full(): # Test whether the relative univariate feature selection # selects all features when '100%' is asked. X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0) univariate_filter = SelectPercentile(f_regression, percentile=100) X_r = univariate_filter.fit(X, y).transform(X) assert_best_scores_kept(univariate_filter) X_r2 = GenericUnivariateSelect(f_regression, mode="percentile", param=100).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.ones(20) assert_array_equal(support, gtruth)
def select_with_chi2_and_f_classif(train):
    p = 3
    train = train.drop('ID', 1)
    train_y = train['TARGET']
    train_X = train.drop('TARGET', 1)
    X_bin = Binarizer().fit_transform(scale(train_X))
    selectChi2 = SelectPercentile(chi2, percentile=p).fit(X_bin, train_y)
    selectF_classif = SelectPercentile(f_classif, percentile=p).fit(train_X, train_y)

    chi2_selected = selectChi2.get_support()
    chi2_selected_features = [ f for i,f in enumerate(train_X.columns) if chi2_selected[i]]
    print('Chi2 selected {} features {}.'.format(chi2_selected.sum(), chi2_selected_features))

    f_classif_selected = selectF_classif.get_support()
    f_classif_selected_features = [ f for i,f in enumerate(train_X.columns) if f_classif_selected[i]]
    print('F_classif selected {} features {}.'.format(f_classif_selected.sum(), f_classif_selected_features))

    selected = chi2_selected & f_classif_selected
    print('Chi2 & F_classif selected {} features'.format(selected.sum()))
    features = [ f for f,s in zip(train_X.columns, selected) if s]
    return features
def getFeatures(self, number_of_features=10):
    # X = self.training.iloc[:,:,-1]
    y = self.training['TARGET']
    X = self.training.drop(['TARGET'], axis=1)
    # Select features according to the highest scores.
    # A fixed-count variant would be: SelectKBest(chi2, k=number_of_features)
    # Here `number_of_features` is used as a percentile: keep the best
    # `number_of_features` percent of features, scored with f_classif
    # (other score functions such as chi2 can be used as well).
    selectFeatures = SelectPercentile(f_classif, percentile=number_of_features)
    selectFeatures.fit(X, y)
    # X_select = selectFeatures.transform(X)
    features = selectFeatures.get_support(indices=True)
    return(features)
def test_select_percentile_4(): """Ensure that the TPOT select percentile outputs the same result as sklearn select percentile when 0 < percentile < 100""" tpot_obj = TPOT() non_feature_columns = ['class', 'group', 'guess'] training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1) training_class_vals = training_testing_data.loc[training_testing_data['group'] == 'training', 'class'].values with warnings.catch_warnings(): warnings.simplefilter('ignore', category=UserWarning) selector = SelectPercentile(f_classif, percentile=42) selector.fit(training_features, training_class_vals) mask = selector.get_support(True) mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns assert np.array_equal(tpot_obj._select_percentile(training_testing_data, 42), training_testing_data[mask_cols])
def get_top_chi2_candidate_ngrams(queries, f_extractor, percentile): """Get top ngrams features according to chi2. """ ngrams_dict = dict() features, labels = construct_examples(queries, f_extractor) label_encoder = LabelEncoder() labels = label_encoder.fit_transform(labels) vec = DictVectorizer(sparse=True) X = vec.fit_transform(features) # ch2 = SelectKBest(chi2, k=n_features) ch2 = SelectPercentile(chi2, percentile=percentile) ch2.fit(X, labels) indices = ch2.get_support(indices=True) for i in indices: ngrams_dict[vec.feature_names_[i]] = 1 return ngrams_dict
def featureSelection(self,X,y):
    '''
    Univariate feature selection with the chi2 test, keeping the top 10%
    of features.
    :param: a. X the training matrix.
            b. y the labels column corresponding to X.
    :return: a. The transformed training matrix.
             b. The boolean mask of the selected top 10% features.
    '''
    print np.shape(X)
    selector = SelectPercentile(chi2, percentile=10)
    X_new = selector.fit_transform(X, y)
    return X_new, selector.get_support()
def feature_reduction_percent(percentage, train_data_df, train_labels_df):
    # TODO: everything
    X = train_data_df.iloc[:,1:]
    y = np.array(train_labels_df.iloc[:,1])
    # find the top percent variance features.
    fsp = SelectPercentile(chi2, percentage)
    X_reduced = fsp.fit_transform(X,y)
    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + 1
    data_trimmed = train_data_df.iloc[:,selected_names]
    data_fnames = pd.DataFrame(train_data_df['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    data_reduced.to_csv('data/sorted-train-malware-features-asm-50percent.csv', index=False)
    return
def determine_percentile():
    max_snr = list()
    for i in np.arange(5, 20):
        select = SelectPercentile(chi2, percentile=i)
        select.fit(np.abs(traces), np.reshape(Y, (Queries,)))
        indexes = select.get_support(True)
        print(indexes)
        filter_traces = np.zeros(np.shape(traces), np.complex128)
        filter_traces[:, indexes] = traces[:, indexes]
        filter_traces = np.fft.ifft(filter_traces, n=SAMPLES, axis=1)
        snr_t = np.abs(SNR.SNR(filter_traces, Y, 256, SAMPLES, np.complex128))
        max_snr.append(np.max(snr_t[300:1300]))
    fig, ax = plt.subplots()
    # x axis matches the percentiles tried in the loop above
    ax.plot(np.arange(5, 20), max_snr)
    ax.set_title("max SNR vs feature selection FFT percentile")
    ax.set_xlabel("percentile")
    plt.show()
def best_features(train, test, perc):
    temp_trans = OrdinalEncoder(dtype='int')
    train[['protocol_type', 'service', 'flag', 'target']] = temp_trans.fit_transform(
        train[['protocol_type', 'service', 'flag', 'target']])
    feature_cols = train.drop('target', axis='columns').columns
    trans = SelectPercentile(f_classif, percentile=perc)
    trans.fit(train[feature_cols], train['target'])
    train[['protocol_type', 'service', 'flag', 'target']] = temp_trans.inverse_transform(
        train[['protocol_type', 'service', 'flag', 'target']])
    # get_support() is True for the columns that were kept
    support = trans.get_support()
    bad_features = []
    for i in range(len(support)):
        if not support[i]:
            bad_features.append(feature_cols[i])
    train.drop(bad_features, axis='columns', inplace=True)
    test.drop(bad_features, axis='columns', inplace=True)
    return train, test
def percentile_k_features(df, k=20):
    y = df['SalePrice']
    X = df.loc[:, df.columns != 'SalePrice']
    kpsec = SelectPercentile(score_func=f_regression, percentile=k)
    percentileCols = kpsec.fit_transform(X, y)
    getIndices = np.asarray(kpsec.get_support(indices=True))
    scores = kpsec.scores_
    sorted_scores = np.argsort(scores)[::-1]
    list_cols = []
    for ind in sorted_scores:
        if (ind in getIndices):
            list_cols.append(X.columns[ind])
    #print(list_cols)
    return list_cols
def get_combined_separate_fsets(feature_sets, fs_fn='pct', ptile=10, nFeatures=5, score_fn=f_classif): df_lst = [] for fset_name, df in feature_sets.items(): X_train = df[df.partition == 'train'].drop(['partition', 'fatality_ind'], axis=1) y_train = df[df.partition == 'train'].fatality_ind df_X = df.drop(['partition', 'fatality_ind'], axis=1) if fs_fn == 'pct': featureSelector = SelectPercentile(score_func=score_fn, percentile=ptile) else: featureSelector = SelectKBest(score_func=score_fn, k=nFeatures) featureSelector.fit(X_train, y_train) fs = featureSelector.transform(df.drop(['partition', 'fatality_ind'], axis=1)) cols_fs = df_X.columns[list(featureSelector.get_support(indices=True))] cols_fs_ref = [fset_name + ' ' + c for c in cols_fs] df_fs = pd.DataFrame(fs, index=df_X.index, columns=cols_fs_ref) df_lst.append(df_fs) df_comb = df[['partition', 'fatality_ind']].join(pd.concat(df_lst, axis=1)) return df_comb
def addTagsMatrix(df, **params):
    '''
    Convert the user tags into a sparse matrix
    '''
    startTime = datetime.now()
    cv = CountVectorizer(min_df=0.001, max_df=0.8, binary=True)
    cv.fit(df['user_tags'].dropna())
    tagSelecter = SelectPercentile(chi2, percentile=10)
    tagSelecter.fit(cv.transform(df[df.flag >= 0]['user_tags'].fillna("")),
                    df[df.flag >= 0]['click'])
    tagsMatrix = tagSelecter.transform(cv.transform(df['user_tags'].fillna("")))
    tagsName = np.array(cv.get_feature_names())[tagSelecter.get_support()]
    tempDf = pd.DataFrame(tagsMatrix.toarray(),
                          columns=['tag_' + str(x) for x in tagsName],
                          index=df.index)
    df = pd.concat([df, tempDf], axis=1)
    print('tag matrix time:', datetime.now() - startTime)
    return df
def data(): d1 = inputParser("data-2018-01-14-neworleans.csv") d2 = inputParser("data-2018-04-01-birmingham.csv") splitData(d1) splitData(d2) eliminator = SelectPercentile(mutual_info_classif, percentile=30) #eliminator = SelectKBest(mutual_info_classif, k=1) newDataMat = eliminator.fit_transform(dataMat, classLabel) used = (eliminator.get_support()) output = "[" for x in range(len(used)): if used[x]: output += colLabel[x + len(rowId[0])] + ", " print("factors used:") print(output[0:-2] + "]") print() filter(used) return [colLabelRelevent, dataRelevant, classLabel]
def select_features(X, y, keep_percentage, features_outfile=None): feature_names = X.columns feature_finder = SelectPercentile(chi2, percentile=keep_percentage) X = feature_finder.fit_transform(X, y) support = feature_finder.get_support() feature_names = feature_names[support] if features_outfile is not None: scores = feature_finder.scores_ pvals = feature_finder.pvalues_ feature_scores = scores[support] feature_pvals = pvals[support] feature_data = zip(feature_names, feature_scores, feature_pvals) # Sort features by score. ranked = sorted(feature_data, key=lambda x: x[1], reverse=True) with open(features_outfile, 'w') as outF: for feat in ranked: outF.write("{} :: {:g} :: {:g}\n".format(*feat)) print("Saved features to {}".format(features_outfile)) return X, feature_names
def get_drop_columns_on_percentile_based_feature_selection(
        train, percent, all_cols):
    print('Percentile based feature selection:', percent)
    X = train.drop(["isFraud", 'TransactionID', 'TransactionDT'], axis=1)
    y = train["isFraud"]
    selector_f = SelectPercentile(f_classif, percentile=percent)
    X_best = selector_f.fit_transform(X, y)
    support = np.asarray(selector_f.get_support())
    # features passing the percentile threshold
    features = np.asarray(X.columns.values)
    features_with_support = features[support]
    # F-scores of the retained features
    fscores = np.asarray(selector_f.scores_)
    fscores_with_support = fscores[support]
    # p-values of the retained features
    pvalues = np.asarray(selector_f.pvalues_)
    pvalues_with_support = pvalues[support]
    top_features = pd.DataFrame(
        {
            "F-Score": fscores_with_support,
            "P-Value": pvalues_with_support
        },
        index=features_with_support)
    print(
        "Top features best associated with y\n Number of features",
        len(features_with_support))
    top_features.sort_values(by='P-Value', ascending=True, inplace=True)
    print(top_features)
    print('Done!')
    final_features = top_features.index.values.tolist()
    droppable_cols = []
    for j in range(len(all_cols)):
        if all_cols[j] not in final_features:
            droppable_cols.append(all_cols[j])
    print(len(droppable_cols))
    print("Columns to drop:\n", droppable_cols)
    return droppable_cols
def fit(self, X, y):
    '''
    Inputs:
    -------
    X: a dataframe
    y: a series
    '''
    relevance = SelectPercentile(f_classif, percentile=self.percentile)
    feature_relevant = relevance.fit_transform(X, y.values.ravel())
    idx_most_relevant = relevance.get_support()
    names_most_relevant = X.columns[idx_most_relevant]
    scores = -np.log10(relevance.pvalues_[idx_most_relevant])
    scores /= scores.max()
    self.scores = scores
    self.relevant_features = names_most_relevant
    return self
def percentile_k_features(df, k=20):
    x = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    selector = SelectPercentile(f_regression, percentile=k).fit(x, y)
    ids = selector.get_support(indices=True)
    # order the selected columns by descending F-score
    ranked = sorted(ids, key=lambda i: selector.scores_[i], reverse=True)
    k_features = [x.columns[i] for i in ranked]
    return k_features
def _select_percentile(self, input_df, percentile): """Uses Scikit-learn's SelectPercentile feature selection to learn the subset of features that belong in the highest `percentile` according to a given scoring function Parameters ---------- input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']} Input DataFrame to perform feature selection on percentile: int The features that belong in the top percentile to keep from the original set of features in the training data Returns ------- subsetted_df: pandas.DataFrame {n_samples, n_filtered_features + ['guess', 'group', 'class']} Returns a DataFrame containing the best features in the given `percentile` """ training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1) training_class_vals = input_df.loc[input_df['group'] == 'training', 'class'].values if percentile < 0: percentile = 0 elif percentile > 100: percentile = 100 if len(training_features.columns.values) == 0: return input_df.copy() with warnings.catch_warnings(): # Ignore warnings about constant features warnings.simplefilter('ignore', category=UserWarning) selector = SelectPercentile(f_classif, percentile=percentile) selector.fit(training_features, training_class_vals) mask = selector.get_support(True) mask_cols = list(training_features.iloc[:, mask].columns) + ['guess', 'class', 'group'] return input_df[mask_cols].copy()
def PreProcessing(train, test, relation, prun_off_threshold, percent, ex_fac): #including instance weighting, necessary data preparation # assign label for each kind of relation, compose balanced data set train, instance_weight = Data_preparation(train, True, ex_fac, relation) test = Data_preparation(test, False, ex_fac, relation) #print "data preparation done" # convert feature list to feature string, for both baseline feature and my own feature. X_train, y_train = [x["features"] for x in train], [x["Sense"] for x in train] X_test, y_test = [x["features"] for x in test], [ x["clf_label"] for x in test ] # y_test is actually useless, I put it here just to fit the nltk data structure #print "feature string done" vectorizer = CountVectorizer(min_df=prun_off_threshold, token_pattern='[^ ]{1,}', binary=True, lowercase=False) vectorizer.fit_transform(X_train) X_train_vec = vectorizer.transform(X_train) X_test_vec = vectorizer.transform(X_test) #print "data transformation done" #here, we can do some feature selection selection = SelectPercentile(chi2, percent).fit(X_train_vec, y_train) X_train_selected = selection.transform(X_train_vec) X_test_selected = selection.transform(X_test_vec) selected_index = selection.get_support(True) #print "feature selection done" feature_list = vectorizer.get_feature_names() selected_feature_list = [feature_list[x] for x in selected_index] #print "original data scale: "+ str(X_train_vec.shape) #print "feature dimension after selection: "+ str(len(selected_feature_list)) #155570 return X_train_selected, y_train, X_test_selected, instance_weight, X_train, X_test, y_test, selected_feature_list, test
def tfidf_features(data, y, keep_percentage=100, ngram_range=(1, 1), binary=False): sentences = data["SENTENCE"].values y = y.ravel() vectorizer = TfidfVectorizer(ngram_range=ngram_range, stop_words="english", token_pattern=r'(?u)\b[\w-][\w-]+\b', binary=binary) X = vectorizer.fit_transform(sentences).toarray() feature_names = np.array(vectorizer.get_feature_names()) print(X.shape) if keep_percentage < 100: feature_finder = SelectPercentile(f_classif, percentile=keep_percentage) X = feature_finder.fit_transform(X, y) print("After feature selection: {}".format(X.shape)) support = feature_finder.get_support() feature_names = feature_names[support] feature_names = ['"{}"'.format(f) for f in feature_names] return X, feature_names
def final_feature_set_reduction(reduced_feature_file_name, final_file_name, train_label_file): sorted_train_data = pd.read_csv('data/' + reduced_feature_file_name) y = [] X = sorted_train_data.iloc[:, 1:] fip = open('data/' + train_label_file) lines = fip.readlines() for line in lines: line = line.rstrip() y.append(int(line)) print("Final feature reduction: {:s}".format(reduced_feature_file_name)) print("Training labels length: {:d}".format(len(y))) print("X Feature set dimensionality: {:d} {:d}".format( X.shape[0], X.shape[1])) print("In Feature set dimensionality: {:d} {:d}".format( sorted_train_data.shape[0], sorted_train_data.shape[1])) # find the top 10 percent variance features, from ~1000 -> ~100 features fsp = SelectPercentile(chi2, 10) X_new_10 = fsp.fit_transform(X, y) print("Final 10 Percent Dimensions: {:d} {:d}".format( X_new_10.shape[0], X_new_10.shape[1])) selected_names = fsp.get_support(indices=True) selected_names = selected_names + 1 #data_reduced = sorted_train_data.iloc[:,[0] + selected_names] #Does not put the file_name as the first column. data_trimmed = sorted_train_data.iloc[:, selected_names] data_fnames = pd.DataFrame(sorted_train_data['file_name']) data_reduced = data_fnames.join(data_trimmed) data_reduced.to_csv('data/' + final_file_name, index=False) print("Completed reduction in {:s}".format(final_file_name)) return
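# Hedged sketch of the index-offset idiom above, on toy data (names
# illustrative): when column 0 of the frame is an identifier such as
# 'file_name', a selector fitted on df.iloc[:, 1:] yields indices that must
# be shifted by +1 before indexing back into the full frame.
import pandas as pd
from sklearn.feature_selection import SelectPercentile, chi2

df = pd.DataFrame({'file_name': ['a', 'b', 'c', 'd'],
                   'f1': [0, 1, 0, 1],
                   'f2': [5, 0, 4, 0],
                   'f3': [1, 1, 1, 0]})
y = [0, 1, 0, 1]

fsp = SelectPercentile(chi2, percentile=50).fit(df.iloc[:, 1:], y)
kept = fsp.get_support(indices=True) + 1  # shift past 'file_name'
reduced = df[['file_name']].join(df.iloc[:, kept])
print(reduced.columns.tolist())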
class UnivariateFeatureSelection:
    def __init__(self, n_features, problem_type, scoring):
        if problem_type == "classification":
            valid_scoring = {
                "f_classif": f_classif,
                "chi2": chi2,
                "mutual_info_classif": mutual_info_classif
            }
        elif problem_type == "regression":
            valid_scoring = {
                "f_regression": f_regression,
                "mutual_info_regression": mutual_info_regression
            }

        if scoring not in valid_scoring:
            raise Exception(
                f"Invalid scoring. Options are: {list(valid_scoring.keys())}")

        if isinstance(n_features, int):
            self.selection = SelectKBest(valid_scoring[scoring], k=n_features)
        elif isinstance(n_features, float):
            self.selection = SelectPercentile(valid_scoring[scoring],
                                              percentile=int(n_features * 100))
        else:
            raise Exception("Invalid n_features. It should be float or int.")

    def fit(self, X, y):
        return self.selection.fit(X, y)

    def transform(self, X):
        return self.selection.transform(X)

    def fit_transform(self, X, y):
        return self.selection.fit_transform(X, y)

    def get_support(self):
        return self.selection.get_support()
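# Brief usage sketch for the wrapper above (synthetic data; assumes the
# class and its sklearn imports are in scope): an int n_features routes to
# SelectKBest, a float routes to SelectPercentile.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=30, random_state=0)

ufs_k = UnivariateFeatureSelection(n_features=10, problem_type="classification", scoring="f_classif")
print(ufs_k.fit_transform(X, y).shape)  # (200, 10)

ufs_p = UnivariateFeatureSelection(n_features=0.25, problem_type="classification", scoring="f_classif")
print(ufs_p.fit_transform(X, y).shape)  # top 25% of the 30 features kept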
def train(self, train_data, train_labels, classes, feature_selection=False, percentile=100, batch_size=1000):
    if feature_selection:
        selector = SelectPercentile(chi2, percentile=percentile)
        X = selector.fit_transform(
            self.vectorizer.fit_transform(train_data), train_labels)
        # keep only the vocabulary terms that survive the percentile filter
        new_vocab = list(
            np.array(self.vectorizer.get_feature_names())[selector.get_support()])
        self.vectorizer = TfidfVectorizer(dtype=np.float32, vocabulary=new_vocab)
        print(len(new_vocab))
    for i in range(0, train_data.size, batch_size):
        print(i)
        data = train_data[i:i + batch_size]
        X = self.vectorizer.fit_transform(data).toarray()
        # self.clf.partial_fit(X, train_labels[i:i+batch_size], classes=classes)
        self.clf.partial_fit(X, train_labels[i:i + batch_size], classes=classes)
def learning_of_bread(data):
    # Binarize the star rating
    data[['Star']] = pre.Binarizer(threshold=39).transform(data[['Star']])
    # print('Star binarization result:\n', data['Star'].values)
    # Convert the cuisine categorical variable to a numeric one
    data['Cuisine'] = pre.LabelEncoder().fit_transform(data['Cuisine'])
    # print('Cuisine converted to numeric:\n', data['Cuisine'].values)
    # Pick the features and the label
    features = data[[
        'Cuisine', 'Comments', 'Per_Consumption', 'Taste', 'Environment',
        'Service'
    ]].values
    label = data['Star'].values
    # Keep the most informative features
    fea_select = SP(percentile=85)
    fea_select.fit(features, label)
    # print(fea_select.get_support())
    # print(fea_select.scores_)
    fea_new = features[:, fea_select.get_support()]
    # print(fea_new)
    # Scale the features to [0, 1]
    stand_fea = pre.MinMaxScaler().fit_transform(fea_new)
    # print(stand_fea)
    return stand_fea, label
def features_selection(x_train, y_train, featurs_selection, percent):
    # select features
    vectorizer = TfidfVectorizer()
    vectors_train = vectorizer.fit_transform(x_train)
    features_names = vectorizer.get_feature_names()
    if percent == 1.0:
        return features_names
    # num_features_selected = int(vectors_train.shape[1] * 0.05)
    if featurs_selection in ['chi2', 'mutual_info_classif']:
        selection = SelectPercentile(eval(featurs_selection),
                                     percentile=int(percent * 100))
        selection.fit(vectors_train, y_train)
        features_names_selected =\
            [features_names[k] for k in selection.get_support(indices=True)]
    elif featurs_selection in ['WLLR', 'IG', 'MI']:
        features_names_selected = prepocessing_bugs.feature_selection(
            [doc.split() for doc in x_train], y_train, featurs_selection,
            percent)
    print('sklearn select features: %d' % len(features_names_selected))
    print(features_names_selected[:10])
    return features_names_selected
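# Hedged alternative to the eval(featurs_selection) call above: an explicit
# name-to-function mapping avoids evaluating arbitrary strings and makes the
# accepted options self-documenting (sketch only, same behaviour assumed).
from sklearn.feature_selection import chi2, mutual_info_classif

SCORE_FUNCS = {'chi2': chi2, 'mutual_info_classif': mutual_info_classif}
# usage inside features_selection():
#     selection = SelectPercentile(SCORE_FUNCS[featurs_selection],
#                                  percentile=int(percent * 100))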
def select_features(self, method): if method == 'SelectPercentile': select = SelectPercentile(percentile=50) elif method == 'SelectKBest': select = SelectKBest(chi2, k=2) elif method == 'VarianceThreshold': select = VarianceThreshold(threshold=(.8 * (1 - .8))) elif method == 'TreeBased': select = SelectFromModel(RandomForestClassifier(), threshold='median') elif method == 'L1Based': select = SelectFromModel(LinearSVC(C=0.01, penalty='l1', dual=False), threshold='median') else: sys.exit('Method name not valid') # Fit the selector select.fit(self.data, self.labels) # Apply to features self.data = select.transform(self.data) print('Feature selection using %s method:' % method) mask = select.get_support() print(mask) self.mu = self.mu[mask] self.sigma = self.sigma[mask] self.features = self.features[mask]
### Used feature selection through the SelectKBest and SelectPercentile methods.
### I used f_classif since I used features and labels and classification models.
X = features
y = labels
K = 5
P = 50
print "ORIGINAL FEATURES:"
print features_list[1:]
print
print "INTELLIGENT FEATURE SELECTION:"
selector1 = SelectPercentile(f_classif, percentile=P)
selector1.fit(X, y)
mask1 = selector1.get_support()
new_features1 = []
for bool, feature in zip(mask1, features_list[1:]):
    if bool:
        new_features1.append(feature)
print "SelectPercentile (percentile=50):", new_features1
selector2 = SelectKBest(f_classif, k=K)
selector2.fit(X, y)
mask2 = selector2.get_support()
new_features2 = []
for bool, feature in zip(mask2, features_list[1:]):
    if bool:
        new_features2.append(feature)
print "SelectKBest (k=5):", new_features2
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()

rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))
X_w_noise = np.hstack([cancer.data, noise])

X_train, X_test, y_train, y_test = train_test_split(X_w_noise, cancer.target, random_state=0, test_size=.5)

select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
X_train_selected = select.transform(X_train)

mask = select.get_support()
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.xlabel("Feature index")
plt.show()

from sklearn.linear_model import LogisticRegression

X_test_selected = select.transform(X_test)

lr = LogisticRegression()
lr.fit(X_train, y_train)
print("All features: {:.3f}".format(lr.score(X_test, y_test)))
lr.fit(X_train_selected, y_train)
print("Selected features: {:.3f}".format(lr.score(X_test_selected, y_test)))
def use_pipeline_with_feature_selection(self): ##################### # Build a classifier pipeline and carry out feature selection and grid search for best classif parameters ##################### pipeline = Pipeline([("selector", SelectPercentile()), ('clf', SGDClassifier(random_state=42))]) # Build a grid search to find the best parameter # Fit the pipeline on the training set using grid search for the parameters parameters = { 'selector__score_func': (chi2, f_classif), 'selector__percentile': (85, 95, 100), 'clf__loss': ('hinge', 'log'), 'clf__penalty': ('l2', 'l1', 'elasticnet'), 'clf__n_iter': (5, 10), 'clf__alpha': (0.001, 0.0001, 0.0005), } ################# # Exhaustive search over specified parameter values for an estimator, use cv to generate data to be used # implements the usual estimator API: when “fitting” it on a dataset all the possible combinations # of parameter values are evaluated and the best combination is retained. ################# cv = StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.2, random_state=42) grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=cv, n_jobs=-1) clf_gs = grid_search.fit(x_train, y_train) ############### # print the cross-validated scores for the each parameters set explored by the grid search ############### best_parameters, score, _ = max(clf_gs.grid_scores_, key=lambda x: x[1]) for param_name in sorted(parameters.keys()): print("%s: %r" % (param_name, best_parameters[param_name])) print("Score for gridsearch is %0.2f" % score) ############### # run the classifier again with the best parameters # in order to get 'clf' for get_important_feature function! ############### score_func = best_parameters['selector__score_func'] percentile = best_parameters['selector__percentile'] loss = best_parameters['clf__loss'] penalty = best_parameters['clf__penalty'] alpha = best_parameters['clf__alpha'] ################# # feature selection ################# selector = SelectPercentile(score_func=score_func, percentile=percentile) combined_features = Pipeline([("feat_select", selector)]) X_features = combined_features.fit_transform(x_train, y_train) X_test_features = combined_features.transform(x_test) print("Shape of train data after feature selection is " + str(X_features.shape)) print("Shape of test data after feature selection is " + str(X_test_features.shape)) # run classifier on selected features clf = SGDClassifier(loss=loss, penalty=penalty, alpha=alpha, random_state=42).fit(X_features, y_train) # get the features which are selected and write to file feature_boolean = selector.get_support(indices=False) f = open(path_to_store_feature_selection_boolean_file, 'w') for fb in feature_boolean: f.write(str(fb) + '\n') f.close() ################## # get cross validation score ################## scores = cross_val_score(clf, X_features, y_train, cv=10, scoring='f1_weighted') print("Cross validation score: " + str(scores)) # Get average performance of classifier on training data using 10-fold CV, along with standard deviation print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) ################## # run classifier on test data ################## y_predicted = clf.predict(X_test_features) # print the mean accuracy on the given test data and labels print("Classifier score on test data is: %0.2f " % clf.score(X_test_features, y_test)) # Print and plot the confusion matrix print(metrics.classification_report(y_test, y_predicted)) cm = metrics.confusion_matrix(y_test, y_predicted) print(cm) return clf
def train_classifier_use_feature_selection(self): ################# # feature selection ################# selector = SelectPercentile(score_func=_score_func, percentile=_percentile) print("Fitting data with feature selection ...") selector.fit(x_train, y_train) # get how many features are left after feature selection x_features = selector.transform(x_train) print("Shape of array after feature selection is " + str(x_features.shape)) clf = SGDClassifier(loss=_loss, penalty=_penalty, alpha=_alpha, n_iter=_n_iter, random_state=42).fit(x_features, y_train) # get the features which are selected and write to file feature_boolean = selector.get_support(indices=False) ################## # get cross validation score ################## scores = cross_val_score(clf, x_features, y_train, cv=10, scoring='f1_weighted') print("Cross validation score: " + str(scores)) # Get average performance of classifier on training data using 10-fold CV, along with standard deviation print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) #################### # test clf on test data #################### # apply feature selection on test data too x_test_selector = selector.transform(x_test) print("Shape of array for test data after feature selection is " + str(x_test_selector.shape)) y_predicted = clf.predict(x_test_selector) # print the mean accuracy on the given test data and labels print("Classifier score on test data is: %0.2f " % clf.score(x_test_selector, y_test)) print(metrics.classification_report(y_test, y_predicted)) cm = metrics.confusion_matrix(y_test, y_predicted) print(cm) return clf
x = pd.DataFrame(x.todense(), columns=ft)
del X_test['Voter_name']
X_test = X_test.join(x, rsuffix='2', lsuffix='1')
X_test = X_test.fillna(0).to_sparse(fill_value=0)

from sklearn.decomposition import PCA

# pca=PCA(n_components=140)
# pca.fit(X_train)
# X_train=pca.transform(X_train)
# X_test=pca.transform(X_test)

from sklearn.feature_selection import SelectPercentile, f_classif

percentile = SelectPercentile(percentile=20)
X_train = percentile.fit_transform(X_train, Y_train)
selected_features = percentile.get_support(True)
X_test = X_test.iloc[:, selected_features]
print(selected_features)

# import xgboost as xgb
# mod = xgb.XGBClassifier()
# mod.fit(X_train, Y_train)
# print('xgb', mod.score(X_test, Y_test))
# exit(0)

# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#
# mod =AdaBoostClassifier()
# mod.fit(X_train,Y_train)
# Import various select transforms along with the f_regression mode
from sklearn.feature_selection import SelectPercentile, f_regression

# Implement SelectPercentile; we will take 50% of the 6 independent
# features, i.e., 3
selectorP = SelectPercentile(score_func=f_regression, percentile=50)
x_p = selectorP.fit_transform(X, Y)

# Get the F-scores and p-values for all features
f_score = selectorP.scores_
p_values = selectorP.pvalues_

# Print the table of Features, F-Score and P-values
columns = list(X.columns)

print("\n\n ")
print("    Features    ", "F-Score    ", "P-Values")
print("    -----------   ---------  ---------")

for i in range(0, len(columns)):
    f1 = "%4.2f" % f_score[i]
    p1 = "%2.6f" % p_values[i]
    print("    ", columns[i].ljust(12), f1.rjust(8), "  ", p1.rjust(8))

cols = selectorP.get_support(indices=True)
selectedCols = X.columns[cols].to_list()
print(selectedCols)
X = training.iloc[:,:-1] y = training.TARGET X['n0'] = (X == 0).sum(axis=1) from sklearn.feature_selection import SelectPercentile from sklearn.feature_selection import f_classif,chi2 from sklearn.preprocessing import Binarizer, scale p = 30 X_bin = Binarizer().fit_transform(scale(X)) selectChi2 = SelectPercentile(chi2, percentile=p).fit(X_bin, y) selectF_classif = SelectPercentile(f_classif, percentile=p).fit(X, y) chi2_selected = selectChi2.get_support() chi2_selected_features = [ f for i,f in enumerate(X.columns) if chi2_selected[i]] print('Chi2 selected {} features {}.'.format(chi2_selected.sum(), chi2_selected_features)) f_classif_selected = selectF_classif.get_support() f_classif_selected_features = [ f for i,f in enumerate(X.columns) if f_classif_selected[i]] print('F_classif selected {} features {}.'.format(f_classif_selected.sum(), f_classif_selected_features)) selected = chi2_selected & f_classif_selected print('Chi2 & F_classif selected {} features'.format(selected.sum())) features = [ f for f,s in zip(X.columns, selected) if s] print (features) X_sel = X[features] # X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_sel,
# Compare to the weights of an SVM
clf = svm.SVC(kernel="linear")
clf.fit(X, y)

svm_weights = (clf.coef_ ** 2).sum(axis=0)
svm_weights /= svm_weights.max()

plt.bar(X_indices - 0.25, svm_weights, width=0.2, label="SVM weight",
        color="navy")

clf_selected = svm.SVC(kernel="linear")
clf_selected.fit(selector.transform(X), y)

svm_weights_selected = (clf_selected.coef_ ** 2).sum(axis=0)
svm_weights_selected /= svm_weights_selected.max()

plt.bar(
    X_indices[selector.get_support()] - 0.05,
    svm_weights_selected,
    width=0.2,
    label="SVM weights after selection",
    color="c",
)

plt.title("Comparing feature selection")
plt.xlabel("Feature number")
plt.yticks(())
plt.axis("tight")
plt.legend(loc="upper right")
plt.show()
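This fragment assumes X, y, X_indices and a fitted selector from earlier in its source. A hedged sketch of that setup, loosely following scikit-learn's univariate feature selection example; the dataset, noise features, and percentile are illustrative assumptions, not the original's:

import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile, f_classif

iris = load_iris()
rng = np.random.RandomState(42)
# append uninformative noise columns so selection has something to reject
X = np.hstack((iris.data, rng.uniform(size=(len(iris.data), 20))))
y = iris.target

X_indices = np.arange(X.shape[-1])
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(X, y)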
def _univariate_feature_screening(
        X, y, mask, is_classif, screening_percentile, smoothing_fwhm=2.):
    """Select the most important features via a univariate test.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Design matrix.

    y : ndarray, shape (n_samples,)
        Response vector.

    mask : ndarray of booleans, shape (nx, ny, nz)
        Mask defining brain ROIs.

    is_classif : bool
        Flag telling whether the learning task is classification or
        regression.

    screening_percentile : float in the closed interval [0., 100.]
        Only the `screening_percentile` percent highest-scoring voxels
        will be retained.

    %(smoothing_fwhm)s
        Default=2.

    Returns
    -------
    X_ : ndarray, shape (n_samples, n_features_)
        Reduced design matrix with only the columns corresponding to
        the voxels retained after screening.

    mask_ : ndarray of booleans, shape (nx, ny, nz)
        Mask with support reduced to the voxels retained after
        screening.

    support : ndarray of booleans, shape (n_features,)
        Support of the screened mask, as a subset of the support of the
        original mask.
    """
    # smooth the data (with an isotropic Gaussian kernel) before screening
    if smoothing_fwhm > 0.:
        sX = np.empty(X.shape)
        for sample in range(sX.shape[0]):
            sX[sample] = ndimage.gaussian_filter(
                _unmask_from_to_3d_array(X[sample].copy(),  # avoid modifying X
                                         mask),
                (smoothing_fwhm, smoothing_fwhm, smoothing_fwhm))[mask]
    else:
        sX = X

    # do the feature screening proper
    selector = SelectPercentile(f_classif if is_classif else f_regression,
                                percentile=screening_percentile).fit(sX, y)
    support = selector.get_support()

    # erode and then dilate the mask, thus obtaining a "cleaner" version of
    # the mask on which a spatial prior actually makes sense
    mask_ = mask.copy()
    mask_[mask] = (support > 0)
    mask_ = ndimage.binary_dilation(ndimage.binary_erosion(
        mask_)).astype(bool)
    mask_[np.logical_not(mask)] = 0
    support = mask_[mask]
    X = X[:, support]

    return X, mask_, support
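The erode-then-dilate step is a morphological "opening": isolated voxels that survived screening are removed, and only spatially contiguous clusters remain. A tiny 2D illustration on a toy mask (the array here is purely illustrative):

import numpy as np
from scipy import ndimage

m = np.zeros((7, 7), dtype=bool)
m[2:5, 2:5] = True   # a solid 3x3 block survives opening
m[0, 0] = True       # an isolated voxel does not
opened = ndimage.binary_dilation(ndimage.binary_erosion(m)).astype(bool)
print(opened[0, 0], opened[3, 3])  # False True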
def train(data):
    test_data = data

    ## aggregates the predictions of the classifiers over the four corpora into one
    target_psfl = test_data.loc[:, test_data.columns == 'psfl']
    target_zh = test_data.loc[:, test_data.columns == 'zh']
    target_wiki = test_data.loc[:, test_data.columns == 'wiki']
    target_brescola = test_data.loc[:, test_data.columns == 'brescola']

    # ensures the lengths of the target data line up
    length = len(target_brescola)
    if length != len(target_zh) or length != len(target_wiki) or length != len(
            target_psfl):
        print("ERROR: Lengths of the four targets are not all the same.")

    ## aggregates the results into one column
    difficulties = []
    # for each observation...
    for i in range(length):
        results = [
            target_psfl.values[i], target_zh.values[i], target_wiki.values[i],
            target_brescola.values[i]
        ]
        # takes the majority label, defaulting to "difficult" in case of a tie
        s = 0
        d = 0
        for res in results:
            if res == 'd':
                d += 1
            elif res == 's':
                s += 1
            else:
                print('ERROR: Target value is neither \'d\' nor \'s\'.')
        # TO-WRITE:
        # if using the weighted system, tie-breaking in favor of 'd' brings down
        # overall accuracy but balances between labels; doing the opposite brings
        # up accuracy and f1 but heavily leans toward predicting 's'.
        # just using psfl brings up both scores -> DISCUSS THIS -> why might this
        # be? maybe those docs are all from psfl???
        # TODO - Plot the differences here!
        if d >= s:
            difficulties.append('d')
        else:
            difficulties.append('s')

    ## partitions data into features and target, dropping the old target columns
    ## as well as the docid column
    data = test_data.drop("psfl", axis=1)
    data = data.drop("zh", axis=1)
    data = data.drop("wiki", axis=1)
    data = data.drop("brescola", axis=1)
    data = data.drop("docid", axis=1)

    # sets the target data according to user input
    print(
        'Please select the corpus to use as ground truth: 0 - PSFL, 1 - ZH, 2 - Wiki, 3 - BrEscola, 4 - Weighted Average'
    )
    temp = float(input())
    if (temp == 0):
        target = target_psfl.values.reshape(-1, ).tolist()
    elif (temp == 1):
        target = target_zh.values.reshape(-1, ).tolist()
    elif (temp == 2):
        target = target_wiki.values.reshape(-1, ).tolist()
    elif (temp == 3):
        target = target_brescola.values.reshape(-1, ).tolist()
    elif (temp == 4):
        target = difficulties

    # partitions data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(data, target,
                                                        test_size=0.2)

    # partitions the training set into a general set (for use in the model) and a
    # tuning set (for use in refining other components)
    X_general, X_tuning, y_general, y_tuning = train_test_split(X_train, y_train,
                                                                test_size=0.3)
    X_tuning_copy = X_tuning.copy()

    # uses GridSearchCV to tune the random forest hyper-parameters
    model = RandomForestClassifier()
    grid = GridSearchCV(estimator=model,
                        param_grid=dict(n_estimators=[10, 50, 100, 200],
                                        criterion=['gini', 'entropy'],
                                        max_depth=[None, 1, 10, 100]))
    grid.fit(X_tuning, y_tuning)
    op_estims = grid.best_estimator_.n_estimators
    op_crit = grid.best_estimator_.criterion
    op_depth = grid.best_estimator_.max_depth
    print('Optimal Params (FOREST):', op_estims, op_crit, op_depth)

    ## performs feature selection using other scikit-learn utilities
    # TODO - Average all metrics out over many rounds for better selection
    # TODO - Select only a couple of metrics
    print('FEATURE SELECTION METRICS (FOREST):')
    for j in range(20, 80, 5):
        index = X_tuning.index.tolist()
        cols = X_tuning.columns.tolist()

        # creates a selector for the given percentile of features
        selector = SelectPercentile(percentile=j)
        X_new = selector.fit_transform(X_tuning, y_tuning)
        temp = selector.get_support(True)
        X_tuning = pd.DataFrame(data=X_tuning, index=index,
                                columns=[cols[i] for i in temp])

        # splits the tuning set into training and testing data
        X_train_iter, X_test_iter, y_train_iter, y_test_iter = train_test_split(
            X_tuning, y_tuning, test_size=0.2)

        # fits a random forest model on the data
        forest = RandomForestClassifier(n_estimators=op_estims, criterion=op_crit,
                                        max_depth=op_depth)
        fitted = forest.fit(X_train_iter, y_train_iter)

        # calculates the accuracy, precision, recall, and f1-measure
        y_pred = fitted.predict(X_test_iter).tolist()
        y_true = y_test_iter
        # NOTE - Order in lists is 'd', 's'
        accuracy = accuracy_score(y_true, y_pred)
        precisions_by_label = [
            precision_score(y_true, y_pred, average='binary', pos_label='d'),
            precision_score(y_true, y_pred, average='binary', pos_label='s')
        ]
        precision_global = precision_score(y_true, y_pred, average='micro')
        recalls_by_label = recall_score(y_true, y_pred, average=None)
        recall_global = recall_score(y_true, y_pred, average='micro')
        f1s_by_label = f1_score(y_true, y_pred, average=None)
        f1_global = f1_score(y_true, y_pred, average='micro')

        print(j / 100, 'acc:', round(accuracy, 2),
              'precisions_by_label', [round(x, 2) for x in precisions_by_label],
              'precision_global', round(precision_global, 2),
              'recalls_by_label', [round(x, 2) for x in recalls_by_label],
              'recall_global', round(recall_global, 2),
              'f1s_by_label', [round(x, 2) for x in f1s_by_label],
              'f1_global', round(f1_global, 2))

        X_tuning = X_tuning_copy

    # TODO - Plots for all of the chosen metrics - do it for just one model to give
    # the reader an idea of the range in performance depending on the selected
    # features + how we visualized it
    #ax = plt.gca()
    #ax.plot(percs, [x[1] for x in nrmses])
    #plt.xlabel('percentage of features included')
    #plt.ylabel('NRMSE')
    #plt.title('NMRSE change over feature inclusion thresholds')
    #plt.axis('tight')
    #plt.show(block=False)
    #plt.savefig('linear_feature_threshold_nrmses.png')
    #plt.clf()

    print('Please input a reasonable decimal threshold for feature selection:')
    #thresh = float(input())
    thresh = 1

    # uses the percent threshold to perform feature selection, applying it to the
    # training and test sets
    index_test = X_test.index.tolist()
    index_train = X_general.index.tolist()
    cols = X_test.columns.tolist()
    selector = SelectPercentile(percentile=(thresh * 100))
    X_new = selector.fit_transform(X_general, y_general)
    temp = selector.get_support(True)
    X_general = pd.DataFrame(data=X_general, index=index_train,
                             columns=[cols[i] for i in temp])
    X_test = pd.DataFrame(data=X_test, index=index_test,
                          columns=[cols[i] for i in temp])

    # fits a random forest model to the training data
    forest = RandomForestClassifier(n_estimators=op_estims, criterion=op_crit,
                                    max_depth=op_depth)
    fitted = forest.fit(X_general, y_general)
    # `n_features_` was deprecated in scikit-learn 1.0; use `n_features_in_`
    print(fitted.n_features_in_)

    # prints evaluation metrics
    y_pred = fitted.predict(X_test).tolist()
    y_true = y_test
    accuracy = accuracy_score(y_true, y_pred)
    precisions_by_label = [
        precision_score(y_true, y_pred, average='binary', pos_label='d'),
        precision_score(y_true, y_pred, average='binary', pos_label='s')
    ]
    precision_global = precision_score(y_true, y_pred, average='micro')
    recalls_by_label = recall_score(y_true, y_pred, average=None)
    recall_global = recall_score(y_true, y_pred, average='micro')
    f1s_by_label = f1_score(y_true, y_pred, average=None)
    f1_global = f1_score(y_true, y_pred, average='micro')

    # TODO - Print metrics more intelligently
    print('FOREST:', 'acc:', round(accuracy, 2),
          'precisions_by_label', [round(x, 2) for x in precisions_by_label],
          'precision_global', round(precision_global, 2),
          'recalls_by_label', [round(x, 2) for x in recalls_by_label],
          'recall_global', round(recall_global, 2),
          'f1s_by_label', [round(x, 2) for x in f1s_by_label],
          'f1_global', round(f1_global, 2))

    # TODO - Plot all evaluation metrics!

    # visualizes the residuals
    #plt.xlabel('Predicted Value')
    #plt.ylabel('Residual')
    #plt.title('Residuals (Linear Regression)')
    #plt.axis('tight')
    #plt.savefig('linear_residuals.png')
    #plt.show()
    #plt.clf()

    # saves the model to a file
    #filename = 'forest.sav'
    #pickle.dump(fitted, open(filename, 'wb'))

    # prints the mean accuracy
    #loaded_model = pickle.load(open(filename, 'rb'))
    #result = loaded_model.score(X_test, y_test)
    #print('Mean Accuracy: ', result)

    return fitted
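The loop above refits the selector and the forest by hand for each percentile. A hedged alternative sketch folds the percentile into a single grid search via a Pipeline; the synthetic data and grid values below are illustrative assumptions:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

Xd, yd = make_classification(n_samples=200, n_features=30, random_state=0)
pipe = Pipeline([
    ("select", SelectPercentile()),          # f_classif by default
    ("forest", RandomForestClassifier(random_state=0)),
])
grid = GridSearchCV(pipe, param_grid={
    "select__percentile": list(range(20, 80, 5)),
    "forest__n_estimators": [10, 50, 100],
})
grid.fit(Xd, yd)
print(grid.best_params_)

This also re-runs the selection inside every CV split, so the percentile choice is evaluated without the tuning set leaking into the scoring.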
# Concatenate the non-categorical and categorical data
X_train1 = numpy.concatenate((X_train_temp, X_train.iloc[:, 10:c-1]), axis=1)
X_test1 = numpy.concatenate((X_test_temp, X_test.iloc[:, 10:c-1]), axis=1)

scaled_features_train_df = pd.DataFrame(X_train1, index=X_train.index,
                                        columns=X_train.columns)
scaled_features_test_df = pd.DataFrame(X_test1, index=X_test.index,
                                       columns=X_test.columns)

#----------------------------------------------------------------
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif

# Write your solution here:
skb = SelectPercentile(score_func=f_classif, percentile=20)
predictors = skb.fit_transform(X_train1, Y_train)
# the univariate scores live on the selector, not on the transformed array
scores = skb.scores_.tolist()
#print(scores)
top_k_index = skb.get_support(indices=True)
print(top_k_index)
# map the selected column indices back to feature names (the original indexed
# rows of the transformed array, which is not what the name suggests)
top_k_predictors = [scaled_features_train_df.columns[i] for i in top_k_index]
print(top_k_predictors)

#---------------------------------------------------------------
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score

clf = OneVsRestClassifier(LogisticRegression())
clf1 = OneVsRestClassifier(LogisticRegression())

model_fit_all_features = clf1.fit(X_train, Y_train)
predictions_all_features = clf1.predict(X_test)
score_all_features = accuracy_score(Y_test, predictions_all_features)
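The snippet cuts off before clf (the selected-features model) is used. A self-contained sketch of the comparison it appears to be building toward, with synthetic data and all names below being illustrative assumptions rather than the original's:

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

Xs, ys = make_classification(n_samples=400, n_features=25, n_informative=5,
                             n_classes=3, random_state=1)
X_tr, X_te, y_tr, y_te = train_test_split(Xs, ys, random_state=1)

skb = SelectPercentile(score_func=f_classif, percentile=20).fit(X_tr, y_tr)

# one model on all features, one on the top-20% selected columns
all_feats = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X_tr, y_tr)
top_feats = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(
    skb.transform(X_tr), y_tr)

print("all features:", accuracy_score(y_te, all_feats.predict(X_te)))
print("top 20%:     ", accuracy_score(y_te, top_feats.predict(skb.transform(X_te))))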