def check(self, train, dropped_columns_):
    """Compare a submitted set of dropped columns with a reference selection.

    The reference keeps the 40 best features of ``train`` according to the
    ANOVA F-test (``f_classif``), ignoring the ``click_time``,
    ``attributed_time`` and ``is_attributed`` columns, and uses
    ``is_attributed`` as the target.

    Args:
        train: Training DataFrame.
        dropped_columns_: The submitted dropped columns; must have 51 entries.

    Raises:
        AssertionError: If the submission does not hold 51 columns or does
            not match the reference element-wise.
    """
    n_reported = dropped_columns_.shape[0]
    assert n_reported == 51, "Please choose to keep 40 columns"

    non_features = ['click_time', 'attributed_time', 'is_attributed']
    feature_cols = train.columns.drop(non_features)

    # The selector is fitted on the training data only.
    kbest = SelectKBest(f_classif, k=40)
    kept_values = kbest.fit_transform(train[feature_cols],
                                      train['is_attributed'])

    # inverse_transform restores the original width and fills every
    # unselected column with zeros.
    zero_padded = pd.DataFrame(kbest.inverse_transform(kept_values),
                               index=train.index,
                               columns=feature_cols)

    # An all-zero column has zero variance; those are the dropped ones.
    has_no_variance = zero_padded.var() == 0
    dropped_columns = zero_padded.columns[has_no_variance]

    message = (
        "Somethings not right with your result. Be sure to use the train "
        "dataset for the feature selection")
    assert (dropped_columns == dropped_columns_).all(), message
def features_selection(self):
    """Return the indices of the feature columns to use.

    If ``features_number`` is non-zero and differs from
    ``dataset_features_number``, the ``features_number`` best columns are
    picked with ``SelectKBest(mutual_info_classif)`` fitted on the first
    ``dataset_features_number`` columns of ``training_set`` against
    ``ground_through``.  Otherwise every column is used when
    ``packets_number`` is 0, else the first ``packets_number`` columns of
    each half of the feature vector.

    Returns:
        A list of ints, or an integer ``numpy`` array in the
        ``packets_number`` case.
    """
    n_features = self.dataset_features_number

    if self.features_number != 0 and self.features_number != n_features:
        selector = SelectKBest(mutual_info_classif, k=self.features_number)
        selector.fit(self.training_set[:, :n_features], self.ground_through)
        # Ask the selector which columns it kept.  Comparing reconstructed
        # columns against the selected ones gives wrong indices when a
        # selected column is all zeros, and can walk past the last column.
        return selector.get_support(indices=True).tolist()

    if self.packets_number == 0:
        return list(range(n_features))

    # Floor division keeps the slice bounds integral; with "/" Python 3
    # yields floats and np.r_ returns a float array unusable as an index.
    half = n_features // 2
    return np.r_[0:self.packets_number, half:half + self.packets_number]
def features_selection(self, node):
    """Return the indices of the feature columns to use for ``node``.

    If ``node.features_number`` is non-zero and differs from
    ``dataset_features_number``, the best columns are picked with
    ``SelectKBest(mutual_info_classif)`` fitted on the encoded training rows
    of the node (``node.train_index``) against the encoded labels of level
    ``node.level``.  Otherwise every column is used when
    ``node.packets_number`` is 0, else the first ``node.packets_number``
    columns of each half of the feature vector.

    Returns:
        A list of ints, or an integer ``numpy`` array in the
        ``packets_number`` case.
    """
    n_features = self.dataset_features_number

    if node.features_number != 0 and node.features_number != n_features:
        selector = SelectKBest(mutual_info_classif, k=node.features_number)
        selector.fit(
            node.encoder.transform(
                self.training_set[node.train_index, :n_features]),
            node.encoder.transform(
                self.ground_truth[node.train_index, node.level], dtype=int))
        # The selector already knows which columns it kept.  Matching
        # encoded columns against decoded, zero-padded ones only works when
        # the encoder round-trip is the identity, and runs past the last
        # column when it is not.
        return selector.get_support(indices=True).tolist()

    if node.packets_number == 0:
        return list(range(n_features))

    # Floor division keeps the slice bounds integral; with "/" Python 3
    # yields floats and np.r_ returns a float array unusable as an index.
    half = n_features // 2
    return np.r_[0:node.packets_number, half:half + node.packets_number]
def features_selection(self):
    '''
    Return the indices of the feature columns to use.

    If ``features_number`` is non-zero and differs from
    ``dataset_features_number``, the ``features_number`` best columns are
    picked with ``SelectKBest(mutual_info_classif)``.  Otherwise every column
    is used when ``packets_number`` is 0, else the first ``packets_number``
    columns of each half of the feature vector.

    The selected indices are read from the fitted selector, so the result no
    longer relies on the earlier assumption that constant features are never
    selected and are always zeroed.
    '''
    n_features = self.dataset_features_number

    if self.features_number != 0 and self.features_number != n_features:
        selector = SelectKBest(mutual_info_classif, k=self.features_number)
        selector.fit(self.training_set[:, :n_features], self.ground_through)
        # get_support is exact; comparing reconstructed columns with the
        # selected ones breaks on all-zero selected columns.
        return selector.get_support(indices=True).tolist()

    if self.packets_number == 0:
        return list(range(n_features))

    # Floor division keeps the slice bounds integral; with "/" Python 3
    # yields floats and np.r_ returns a float array unusable as an index.
    half = n_features // 2
    return np.r_[0:self.packets_number, half:half + self.packets_number]
def features_selection(self, node):
    """Return the indices of the feature columns to use for ``node``.

    If ``node.features_number`` is non-zero and differs from
    ``dataset_features_number``, the best columns are picked with
    ``SelectKBest(mutual_info_classif)`` fitted on the node's training rows
    (``node.train_index``) against the labels of level ``node.level``.
    Otherwise every column is used when ``node.packets_number`` is 0, else
    the first ``node.packets_number`` columns of each half of the feature
    vector.

    Returns:
        A list of ints, or an integer ``numpy`` array in the
        ``packets_number`` case.
    """
    n_features = self.dataset_features_number

    if node.features_number != 0 and node.features_number != n_features:
        selector = SelectKBest(mutual_info_classif, k=node.features_number)
        selector.fit(self.training_set[node.train_index, :n_features],
                     self.ground_through[node.train_index, node.level])
        # get_support is exact; comparing reconstructed columns with the
        # selected ones breaks on all-zero selected columns.
        return selector.get_support(indices=True).tolist()

    if node.packets_number == 0:
        return list(range(n_features))

    # Floor division keeps the slice bounds integral; with "/" Python 3
    # yields floats and np.r_ returns a float array unusable as an index.
    half = n_features // 2
    return np.r_[0:node.packets_number, half:half + node.packets_number]
def feature_selection_univariate(Xtrain, ytrain, keep=5):
    """Split the columns of ``Xtrain`` into selected and dropped ones.

    The ``keep`` best features are chosen with
    ``SelectKBest(mutual_info_classif)``.

    Args:
        Xtrain: Training features as a DataFrame.
        ytrain: Training target.
        keep: Number of features to keep.

    Returns:
        A ``(selected_columns, dropped_columns)`` pair of column indexes.
    """
    selector = SelectKBest(mutual_info_classif, k=keep)
    selector.fit(Xtrain, ytrain)

    # The support mask is exact.  Testing the variance of the zero-padded
    # inverse transform reports a selected constant column as dropped.
    kept = selector.get_support()
    return Xtrain.columns[kept], Xtrain.columns[~kept]
def chiSquare(train_data, train_classes, topK):
    """Keep the ``topK`` chi-squared-best features of dict-encoded samples.

    Args:
        train_data: Samples in the form accepted by ``DictVectorizer``.
        train_classes: Class label of each sample.
        topK: Number of features to keep; capped at the number of
            vectorized features.

    Returns:
        The samples mapped back through the selector and the vectorizer.
    """
    vectorizer = DictVectorizer()
    features = vectorizer.fit_transform(train_data)

    # Never request more features than the vectorizer produced.
    n_keep = min(topK, features.shape[1])

    selector = SelectKBest(chi2, k=n_keep)
    reduced = selector.fit_transform(features, train_classes)

    restored = selector.inverse_transform(reduced)
    return vectorizer.inverse_transform(restored)
def feature_selectionKbest(data, y, num_feature):
    """Zero out every column of ``data`` outside the ``num_feature`` best.

    Both inputs are sorted by their ``pid`` column first.  The label is the
    column of ``y`` named by ``sep``, a name defined outside this function.
    Scoring uses the ANOVA F-test.

    Returns:
        A DataFrame with the index and columns of the sorted ``data`` in
        which unselected columns are filled with zeros.
    """
    ordered = data.sort_values('pid')
    features = ordered.values
    labels = y.sort_values('pid')[sep].values

    selector = SelectKBest(f_classif, k=num_feature)
    selector.fit(features, labels)

    # transform followed by inverse_transform keeps the original width and
    # blanks the columns that were not selected.
    zero_padded = selector.inverse_transform(selector.transform(features))

    return pd.DataFrame(zero_padded,
                        index=ordered.index,
                        columns=ordered.columns)
def feature_selection_anova_f_value(data, features_count):
    """Return the names of the best features by ANOVA F-value.

    :param data: DataFrame holding the features and a ``TARGET`` column
    :param features_count: the number of top features to keep
    :return: column index of the selected features
    """
    feature_cols = data.columns.drop('TARGET')

    selector = SelectKBest(f_classif, k=features_count)
    selector.fit(data[feature_cols], data['TARGET'])

    # The support mask is exact.  A variance test on the zero-padded inverse
    # transform would discard a selected constant column.
    return feature_cols[selector.get_support()]
def get_data(features=None):
    """Return the requested columns, or the ``k`` best ones by F-test.

    Reads the outer names ``data``, ``X``, ``y`` and ``k``.

    Args:
        features: Column labels to return from ``data``.  When omitted or
            empty, the ``k`` best columns of ``X`` with respect to ``y`` are
            selected with ``SelectKBest(f_classif)``.

    Returns:
        A DataFrame restricted to the requested or selected columns.
    """
    if features is not None and len(features) > 0:
        return data[features]

    # k is keyword-only in current scikit-learn releases.
    selector = SelectKBest(f_classif, k=k)
    X_new = selector.fit_transform(X, y)

    # Label the kept values with their original names.  The earlier code
    # applied a per-column variance mask as a row indexer, which does not
    # select columns.
    kept_columns = X.columns[selector.get_support()]
    return pd.DataFrame(X_new, index=X.index, columns=kept_columns)
def univariate_selection(X_train, y_train, features, k_feat, method):
    """Select the ``k_feat`` features most related to the target.

    Each feature in ``features`` is scored against the target on its own
    (one at a time) with the scoring function ``method``, e.g. ``f_classif``
    (F score), ``chi2`` or ``mutual_info_classif`` (which also captures
    non-linear relations).  The suitable scoring functions differ between
    classification and regression.

    Args:
        X_train: Training DataFrame.
        y_train: Training target.
        features: Labels of the candidate columns of ``X_train``.
        k_feat: Number of features to keep.
        method: Scoring function handed to ``SelectKBest``.

    Returns:
        A ``pandas.Index`` with the labels of the kept columns.
    """
    from sklearn.feature_selection import SelectKBest

    # Create the selector, keeping k features, and fit it on the candidates.
    selector = SelectKBest(method, k=k_feat)
    selector.fit(X_train[features], y_train)

    # The support mask is exact.  A variance test on the zero-padded inverse
    # transform would discard a selected constant column.
    return pd.Index(features)[selector.get_support()]
def SelectKBestFeatures(self, k):
    """Keep the ``k`` best features by ANOVA F-test.

    Stores the reduced feature matrix in ``self.features_new`` and the
    matching columns of ``self.train`` in ``self.k_selected_columns``.

    Args:
        k: Number of best features you want from the model.

    Returns:
        The selected columns of ``self.train``, or ``None`` (after printing
        an error) when ``k`` exceeds ``self.track``.
    """
    if k > self.track:
        print(
            'Error, Number of features to be selected are more than the number of features in the dataset'
        )
        return None

    selector = SelectKBest(f_classif, k=k)
    self.features_new = selector.fit_transform(self.features, self.target)

    # The support mask is exact.  A variance test on the zero-padded inverse
    # transform would discard a selected constant column.
    kept = self.features.columns[selector.get_support()]
    self.k_selected_columns = self.train[kept]

    print('Top {} features : \n{}'.format(k, self.k_selected_columns.columns))
    return self.k_selected_columns
def Best125():
    """Load the UCI HAR files and keep the 125 best features by F-test.

    Every file is read from a fixed location under
    ``/Users/user/Desktop/UCI_HAR_Dataset``.  The selector is fitted on the
    training data and then applied to the test data.

    Returns:
        ``(data_train, data_test, output_train, output_test,
        unscaled_data_test, activities_labels, feature_labels,
        groups_at_training, groups_at_testing)``.
    """
    def _values(path, **read_kwargs):
        # Read a header-less file and hand back its values as an ndarray.
        return pd.read_csv(path, header=None, **read_kwargs).to_numpy()

    activities_labels = list(_values(
        "/Users/user/Desktop/UCI_HAR_Dataset/activity_labels.csv").flatten())

    feature_labels = pd.read_csv(
        "/Users/user/Desktop/UCI_HAR_Dataset/features.csv",
        header=None,
        delim_whitespace=True)

    # Subject identifiers, flattened to one dimension.
    groups_at_training = _values(
        "/Users/user/Desktop/UCI_HAR_Dataset/train/subject_train.txt")
    groups_at_training = groups_at_training.reshape(len(groups_at_training))

    groups_at_testing = _values(
        "/Users/user/Desktop/UCI_HAR_Dataset/test/subject_test.txt")
    groups_at_testing = groups_at_testing.reshape(len(groups_at_testing))

    # Feature matrices.
    data_train = _values(
        "/Users/user/Desktop/UCI_HAR_Dataset/train/X_train.csv",
        delim_whitespace=True)
    data_test = _values(
        "/Users/user/Desktop/UCI_HAR_Dataset/test/X_test.csv",
        delim_whitespace=True)

    # Targets; the training target is shaped by the training-group count.
    output_train = _values(
        "/Users/user/Desktop/UCI_HAR_Dataset/train/y_train.csv",
        delim_whitespace=True)
    output_test = _values(
        "/Users/user/Desktop/UCI_HAR_Dataset/test/y_test.csv",
        delim_whitespace=True)
    output_train = output_train.reshape(len(groups_at_training))
    output_test = output_test.reshape(len(output_test))

    selector = SelectKBest(f_classif, k=125)
    data_train = selector.fit_transform(data_train, output_train)
    data_test = selector.transform(data_test)
    unscaled_data_test = data_test

    # NOTE(review): the result is not used below, and feature_labels is a
    # DataFrame rather than a list of labels -- confirm this is intended.
    selected_features = pd.DataFrame(selector.inverse_transform(data_train),
                                     columns=feature_labels)
    #print(selected_features.head())
    # Dropped columns have values of all 0s, so var is 0, drop them
    #selected_columns = selected_features.columns[selected_features.var() != 0]
    #print(selected_columns)
    #scaler = MinMaxScaler()
    #data_train= scaler.fit_transform(data_train)
    #data_test = scaler.transform(data_test)

    print("Data_train", np.shape(data_train),
          "Data_test", np.shape(data_test),
          "Output_train", np.shape(output_train),
          "Output_test", np.shape(output_test),
          "activities_labels", np.shape(activities_labels),
          "groups_at_training", np.shape(groups_at_training),
          "groups_at_testing", np.shape(groups_at_testing))
    print("################################")

    return (data_train, data_test, output_train, output_test,
            unscaled_data_test, activities_labels, feature_labels,
            groups_at_training, groups_at_testing)
# we can plug them together in a *pipeline* that performs the two operations # successively: from sklearn.pipeline import Pipeline anova_svc = Pipeline([('anova', feature_selection), ('svc', clf)]) ### Fit and predict ########################################################### anova_svc.fit(X, y) y_pred = anova_svc.predict(X) ### Visualisation ############################################################# ### Look at the discriminating weights svc = clf.support_vectors_ # reverse feature selection svc = feature_selection.inverse_transform(svc) # reverse masking svc = nifti_masker.inverse_transform(svc[0]) # We use a masked array so that the voxels at '-1' are displayed # transparently act = np.ma.masked_array(svc.get_data(), svc.get_data() == 0) plot_haxby(act, 'SVC') pl.savefig('haxby_svm.pdf') pl.savefig('haxby_svm.eps') ############################################################################### # # # Searchlight # # #
# Best DNetPRO signature
dnet_signature = dnet.selected_signature

# Report the DNetPRO result
print('Signature DNetPRO: {}'.format(sorted(dnet_signature)))
print('DNetPRO score: {:.3f}'.format(dnet.score(X_test, y_test)))
print('Informative found: {:d} / {:d}'.format(
    len([x for x in dnet_signature if x < Ninformative]), Ninformative))

# Compare against the K best features, with K taken from new_probe
filter_kbest = SelectKBest(k=new_probe)
# extract the filtered datasets
Kbest_data = filter_kbest.fit_transform(X_train, y_train)
# full-width copy with the unselected features set to zero
Kbest_filtered = filter_kbest.inverse_transform(Kbest_data)
# Read the selected column indices from the selector itself.  Looking for
# non-zero columns misses a selected feature whose values are all zero and
# then trips the length check below.
Kbest_signature = set(filter_kbest.get_support(indices=True))

# Just to be sure that everything goes right...
if len(Kbest_signature) != len(dnet_signature):
    raise ValueError(
        'Inconsistent length of features between the two methods')

# Report the K-best result
print('Signature Kbest: {}'.format(sorted(Kbest_signature)))
print('Kbest score: {:.3f}'.format(
    classifier.fit(Kbest_data,
                   y_train).score(filter_kbest.transform(X_test), y_test)))
print('Informative found: {:d} / {:d}'.format(
    len([x for x in Kbest_signature if x < Ninformative]), Ninformative))
def create_coef_maps(subid, X, ev_labels, func_masker, mask_name, paths,
                     save_img=True, show_img=False, calc_A=False,
                     univariate_fsel_k=None):
    """Create coef maps, output niftis and pngs

    Parameters
    ----------
    subid : str
        Subject ID (e.g., 'ap01')
    X : 2D numpy array
        Selected BOLD data (sample x voxel) for classification
    ev_labels : list/array of strings
        condition labels (length = # of samples)
    func_masker : NiftiMasker object
        created from mask image, use to convert 2D back to 3D
    mask_name : str
        mask label inserted into the output file names
    paths : dict
        filepaths to the mean functional (to use as background for png)
        & output directory
    save_img : bool
        save out niftis and png images?
    show_img : bool
        show images? (only consulted when save_img is false)
    calc_A : bool
        calculate activation patterns? This is just covariance matrix of
        data multiplied by coefficients. Might crash computer if X is too
        big?
        http://www.sciencedirect.com/science/article/pii/S1053811913010914
    univariate_fsel_k : int
        include univariate feature selection (using ANOVA) for k best
        features before training classifier

    Returns
    -------
    d_coef : DataFrame
        one column of classifier coefficients per category
    """
    # if using feature selection, do it now
    fsel = None
    if univariate_fsel_k:
        fsel = SelectKBest(f_classif, k=univariate_fsel_k).fit(X, ev_labels)
        X = fsel.transform(X)

    classifier = LogisticRegression(penalty='l2', C=1.)
    classifier.fit(X, ev_labels)
    print(classifier.classes_)

    # With two classes there is a single coefficient row, reported under
    # the first class label.
    multiclass = len(classifier.classes_) > 2
    if multiclass:
        cats = classifier.classes_
    else:
        cats = [classifier.classes_[0]]

    d_coef = pd.DataFrame(columns=cats)

    print('Categories: ')
    print(cats)

    # The covariance matrix does not depend on the category: compute it once.
    data_cov = None
    if calc_A:  # careful, might crash computer...
        print('computing cov mat...')
        data_cov = np.cov(X.T)

    for category in cats:
        print(category)

        # Get coefficients for the category of interest, save in d_coef df
        if multiclass:
            coef = classifier.coef_[classifier.classes_ == category]
        else:
            coef = classifier.coef_
        d_coef[category] = pd.Series(coef.squeeze())

        if calc_A:
            # The product is taken in the feature space the classifier was
            # trained in, so the shapes agree with data_cov even when
            # feature selection is active.
            print('multiplying matrices...')
            weights = np.matmul(data_cov, coef.T).T
            filename = "{mask_name}_activationpattern_{category}"
        else:
            weights = coef
            filename = "{mask_name}_coef_{category}"

        # reverse feature selection if necessary
        if fsel is not None:
            weights = fsel.inverse_transform(weights)

        # Transform activation patterns or coefs to native space
        if calc_A:
            print('transforming to epi space')
        weight_img = func_masker.inverse_transform(weights)

        # Save output
        if save_img:
            filepath = paths['outnifti'].format(subid=subid)
            if not op.exists(filepath):
                os.makedirs(filepath)

            weight_img.to_filename(
                op.join(filepath, filename + '.nii.gz').format(
                    mask_name=mask_name, category=category))

            plotting.plot_stat_map(
                weight_img,
                paths['meanfile'].format(subid=subid, run_id=1),
                title=category,
                output_file=op.join(filepath, filename + '.png').format(
                    mask_name=mask_name, category=category))
        elif show_img:
            print('plot stat map')
            plotting.plot_stat_map(
                weight_img,
                paths['meanfile'].format(subid=subid, run_id=1),
                title=category)

    return d_coef
feature_cols = baseline_data.columns.drop('outcome')
train, valid, _ = get_data_splits(baseline_data)

# Keep 5 features
selector = SelectKBest(f_classif, k=5)

## fit only with train data to avoid data leakage
X_new = selector.fit_transform(train[feature_cols], train['outcome'])
X_new

## To obtain dropped features:
# Get back the features we've kept, zero out all other features.
# This returns a DataFrame with the same index and columns as the training
# set, but all the dropped columns are filled with zeros.
selected_features = pd.DataFrame(
    selector.inverse_transform(X_new),
    index=train.index,
    columns=feature_cols)
selected_features.head()

# Read the kept columns from the selector's support mask.  A variance test
# on selected_features would also discard a selected constant column.
selected_columns = feature_cols[selector.get_support()]

# Get the valid dataset with the selected features.
# NOTE(review): .head() keeps only the first rows of the validation set --
# confirm this is meant for display only.
valid_new = valid[selected_columns].head()

# L1 reg:
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

from preprocessing import df_all

"""
A large number of features can lead to overfitting. Optimizing hyperparameters
and training algorithms could also take longer. That is why we want to pick
the most relevant features from the beginning.
Univariate Feature Selection: It calculates how strongly the output feature
depends on each feature from the dataset using statistical tests (like χ2).
SelectKBest accepts several statistical tests as its scoring function; its
default, also used here, is the ANOVA F-test (f_classif).
"""

# Candidate features
df_f_selection = pd.DataFrame()
df_f_selection['Inspired'] = df_all['Inspired']
df_f_selection['Envious'] = df_all['Envious']
df_f_selection['Angry'] = df_all['Angry']

# Target
df_age = pd.DataFrame()
df_age['Age'] = df_all['Age']

# NOTE(review): k=3 keeps all three candidate columns, so nothing is
# filtered out -- confirm the intended k.
selector = SelectKBest(f_classif, k=3)
# The target is passed as a 1-D Series; a one-column DataFrame makes
# scikit-learn warn about a column-vector y.
selected_data = selector.fit_transform(df_f_selection, df_age['Age'])
print(selected_data)

# Full-width frame in which unselected columns would be filled with zeros.
selected_features = pd.DataFrame(selector.inverse_transform(selected_data),
                                 index=df_f_selection.index,
                                 columns=df_f_selection.columns)
selected_columns = selected_features.columns
print(selected_features[selected_columns].head())
# we can plug them together in a *pipeline* that performs the two operations # successively: from sklearn.pipeline import Pipeline anova_svc = Pipeline([('anova', feature_selection), ('svc', clf)]) ### Fit and predict ########################################################### anova_svc.fit(X, y) y_pred = anova_svc.predict(X) ### Visualisation ############################################################# ### Look at the discriminating weights svc = clf.support_vectors_ # reverse feature selection svc = feature_selection.inverse_transform(svc) # reverse masking niimg = nifti_masker.inverse_transform(svc[0]) # We use a masked array so that the voxels at '-1' are displayed # transparently act = np.ma.masked_array(niimg.get_data(), niimg.get_data() == 0) ### Create the figure import pylab as pl pl.axis('off') pl.title('SVM vectors') pl.imshow(np.rot90(mean_img[..., 27]), cmap=pl.cm.gray, interpolation='nearest') pl.imshow(np.rot90(act[..., 27]), cmap=pl.cm.hot, interpolation='nearest')
# We have our predictor (SVR), our feature selection (SelectKBest), and now, # we can plug them together in a *pipeline* that performs the two operations # successively: from sklearn.pipeline import Pipeline anova_svr = Pipeline([('anova', feature_selection), ('svr', svr)]) ### Fit and predict anova_svr.fit(gm_maps_masked, age) age_pred = anova_svr.predict(gm_maps_masked) ### Visualisation ### Look at the SVR's discriminating weights coef = svr.coef_ # reverse feature selection coef = feature_selection.inverse_transform(coef) # reverse masking weight_niimg = nifti_masker.inverse_transform(coef) # We use a masked array so that the voxels at '-1' are transparent weights = np.ma.masked_array(weight_niimg.get_data(), weight_niimg.get_data() == 0) ### Create the figure background_img = nibabel.load(dataset_files.gray_matter_maps[0]).get_data() picked_slice = 36 plt.figure(figsize=(5.5, 5.5)) data_for_plot = weights[:, :, picked_slice, 0] vmax = max(np.min(data_for_plot), np.max(data_for_plot)) * 0.5 plt.imshow(np.rot90(background_img[:, :, picked_slice]), cmap=plt.cm.gray, interpolation='nearest')
# apply log transformation plt.hist(np.log(ks.goal), range=(0,25), bins=50); # select the five best features based on F-value # feature selection should use training data only to avoid leakage # (1) drop the target column from the dataset # (2) split the dataset into training, validation and testing # (3) create a feature selector # (4) apply the feature selector to the training dataset # (5) get a dataframe with the same index and columns as the training data but the unselected columns are filled with zeros # (6) find selected columns by choosing features with nonzero variance feature_cols = data.columns.drop('outcome') train, valid, test = get_data_splits(data) selector = SelectKBest(f_classif, k=6) X_new = selector.fit_transform(train[feature_cols], train['outcome']) selected_features = pd.DataFrame(selector.inverse_transform(X_new), index=train.index, columns=feature_cols) selected_columns = selected_features.columns[selected_features.var()!=0] # L1 regularization # feature selection using L1 regularization should use training data only # (1) split the data into training, validation and testing # (2) drop the target column # (3) fit a logistic regressio model to the training dataset (the smaller the parameter C the more penalty) # (4) select the nonzero coefficients using .SelectFromModel method # (5) select features based on the nonzero coefficients # (6) get a dataframe with the same index and columns as the training data but the unselected columns are filled with zeros # (7) find selected columns by choosing features with nonzero variance train, valid, test = get_data_splits(data) X, y = train[train.columns.drop("outcome")], train['outcome']
# # clf = Pipeline([
# #   ('feature_selection', LinearSVC(penalty="l1",dual=False)),
# #   ('classification', LinearSVC(penalty='l2'))
# # ])
# # clf.fit(X, y)

feature_selection = SelectKBest(f_classif, k=50)  # take the best 50

# to make life easier we will create a pipeline object
pipe = Pipeline([('anova', feature_selection), ('svc', clf)])

# Per fold: fit on the training part, score the accuracy on the test part
# and accumulate the classifier coefficients mapped back to full width.
for ii, (train, test) in enumerate(cv):
    pipe.fit(X[train], y[train])
    y_pred = pipe.predict(X[test])
    scores[ii] = np.sum(y_pred == y[test]) / float(len(y[test]))
    feature_weights += feature_selection.inverse_transform(clf.coef_)

# Function-call form so the line runs on Python 2 and Python 3 alike.
print('Average prediction accuracy: %0.3f | standard deviation: %0.3f' %
      (scores.mean(), scores.std()))

# from sklearn import cross_validation
# from sklearn.feature_selection import SelectPercentile, f_classif
# loocv = cross_validation.LeaveOneOut(len(y))
# clf = RandomForestClassifier(n_estimators=500, max_features=np.sqrt(X.shape[1]), max_depth=None, min_samples_split=1, compute_importances=True)
# top_features = 50
# # computing feature importance
# acc = 0
# cnt = .0
# for train_index, test_index in loocv:
# we will use an ANOVA f-test to preselect relevant spatio-temporal units feature_selection = SelectKBest(f_classif, k=500) # take the best 500 # to make life easier we will create a pipeline object anova_svc = Pipeline([('anova', feature_selection), ('svc', clf)]) # initialize score and feature weights result arrays scores = np.zeros(n_splits) feature_weights = np.zeros([n_vertices, n_times]) # hold on, this may take a moment for ii, (train, test) in enumerate(cv): anova_svc.fit(X[train], y[train]) y_pred = anova_svc.predict(X[test]) y_test = y[test] scores[ii] = np.sum(y_pred == y_test) / float(len(y_test)) feature_weights += feature_selection.inverse_transform(clf.coef_) \ .reshape(n_vertices, n_times) print('Average prediction accuracy: %0.3f | standard deviation: %0.3f' % (scores.mean(), scores.std())) # prepare feature weights for visualization feature_weights /= (ii + 1) # create average weights # create mask to avoid division error feature_weights = np.ma.masked_array(feature_weights, feature_weights == 0) # normalize scores for visualization purposes feature_weights /= feature_weights.std(axis=1)[:, None] feature_weights -= feature_weights.mean(axis=1)[:, None] # unmask, take absolute values, emulate f-value scale feature_weights = np.abs(feature_weights.data) * 10
# NOTE(review): this if/elif reads a loop variable i whose loop header lies
# outside the visible source -- restore its indentation when applying.
if y[i] == 0:
    plt.scatter(X1[i, 0], X1[i, 1], color='r')
elif y[i] == 1:
    plt.scatter(X1[i, 0], X1[i, 1], color='b')
plt.title('visualization of data in 2D ( )-> tic-tac toe dataset')
plt.show()

#####################
# Sweep the number of kept features and record, for each k, the mean
# chi-squared score of the kept features and the reconstruction error of the
# zero-padded inverse transform.
r1 = []
x1 = []
e1 = []
for i in range(1, 10, 2):
    sk = SelectKBest(chi2, k=i)
    X1 = sk.fit_transform(X, y)
    # chi2 returns (statistics, p-values); average the statistics only
    # instead of mixing both arrays into one mean.
    chi2_scores, _ = chi2(X1, y)
    e1.append(np.mean(chi2_scores))
    X2 = sk.inverse_transform(X1)
    rmse = np.sqrt(mean_squared_error(X, X2))
    x1.append(i)
    r1.append(rmse)

r1 = np.array(r1)
x1 = np.array(x1)
e1 = np.array(e1)
print(e1)
print(r1)

plt.figure()
plt.subplot(2, 1, 1)
plt.plot(x1, e1)
plt.xlabel('number of components')
plt.ylabel('chi squared score')
For each feature, measure how strongly the target depends on the feature using a statistical test like χ2 or ANOVA. From the scikit-learn feature selection module, feature_selection.SelectKBest returns the K best features given some scoring function. For our classification problem, the module provides three different scoring functions: χ2 , ANOVA F-value, and the mutual information score. The F-value measures the linear dependency between the feature variable and the target. This means the score might underestimate the relation between a feature and the target if the relationship is nonlinear. The mutual information score is nonparametric and so can capture nonlinear relationships. from sklearn.feature_selection import SelectKBest, f_classif feature_cols = baseline_data.columns.drop('outcome') train, valid, _ = get_data_splits(baseline_data) # Keep 5 features selector = SelectKBest(f_classif, k=5) X_new = selector.fit_transform(train[feature_cols], train['outcome']) # Get back the features we've kept, zero out all other features selected_features = pd.DataFrame(selector.inverse_transform(X_new), index=train.index, columns=feature_cols) # Dropped columns have values of all 0s, so var is 0, drop them selected_columns = selected_features.columns[selected_features.var() != 0] ================================================================================= L1 regularization Univariate methods consider only one feature at a time when making a selection decision. Instead, we can make our selection using all of the features by including them in a linear model with L1 regularization. This type of regularization (sometimes called Lasso) penalizes the absolute magnitude of the coefficients, as compared to L2 (Ridge) regression which penalizes the square of the coefficients. As the strength of regularization is increased, features which are less important for predicting the target are set to 0. 
This allows us to perform feature selection by adjusting the regularization parameter. We choose the parameter by finding the best performance on a hold-out set, or decide ahead of time how many features to keep. from sklearn.linear_model import LogisticRegression
# # clf = Pipeline([
# #   ('feature_selection', LinearSVC(penalty="l1",dual=False)),
# #   ('classification', LinearSVC(penalty='l2'))
# # ])
# # clf.fit(X, y)

feature_selection = SelectKBest(f_classif, k=50)  # take the best 50

# to make life easier we will create a pipeline object
pipe = Pipeline([("anova", feature_selection), ("svc", clf)])

# Per fold: fit on the training part, score the accuracy on the test part
# and accumulate the classifier coefficients mapped back to full width.
for ii, (train, test) in enumerate(cv):
    pipe.fit(X[train], y[train])
    y_pred = pipe.predict(X[test])
    scores[ii] = np.sum(y_pred == y[test]) / float(len(y[test]))
    feature_weights += feature_selection.inverse_transform(clf.coef_)

# Function-call form so the line runs on Python 2 and Python 3 alike.
print("Average prediction accuracy: %0.3f | standard deviation: %0.3f" %
      (scores.mean(), scores.std()))

# from sklearn import cross_validation
# from sklearn.feature_selection import SelectPercentile, f_classif
# loocv = cross_validation.LeaveOneOut(len(y))
# clf = RandomForestClassifier(n_estimators=500, max_features=np.sqrt(X.shape[1]), max_depth=None, min_samples_split=1, compute_importances=True)
# top_features = 50
# # computing feature importance
# acc = 0
# cnt = .0
# for train_index, test_index in loocv:
#     selector = SelectPercentile(f_classif, percentile=10)
#get the selected/most important features and extract from validation set selected_feats = pd.DataFrame(svc_mod.inverse_transform(x_train_new), index=x_train.index, columns=x_train.columns) selected_cols = selected_feats.columns[selected_feats.var() != 0] x_devel_new = x_devel[selected_cols] #now train and test a decision tree using these selected features print('L1 regularization:') train_eval_tree(x_train_new, y_train, x_devel_new, y_devel) #METHOD 2: SelectKBest using the f_classif score select_feats = SelectKBest(f_classif, k=10) x_train_new = select_feats.fit_transform(x_train, y_train) selected_feats = pd.DataFrame(select_feats.inverse_transform(x_train_new), index=x_train.index, columns=x_train.columns) selected_cols = selected_feats.columns[selected_feats.var() != 0] x_devel_new = x_devel[selected_cols] print('Univariate feature selection (f_classif):') train_eval_tree(x_train_new, y_train, x_devel_new, y_devel) #METHOD 3: RANDOM FOREST forest = RandomForestClassifier(n_estimators=1000, random_state=0) forest.fit(x_train, y_train) selector = SelectFromModel(forest, threshold=0.10) selector.fit(x_train, y_train) for important_feats in selector.get_support(indices=True): print(x_train.columns[important_feats]) x_train_new = selector.transform(x_train)
# predict samples' classes for TRAINING dataset y_pred = anova_svc.predict(X) precision_X = precision_score(y, y_pred) print('train dataset precision: %.2f' % (precision_X)) # predict samples' classes for TESTING dataset y_pred_t = anova_svc.predict(X_t) precision_X_t = precision_score(y_t, y_pred_t) print('test dataset precision: %.2f' % (precision_X_t)) # ### Visualisation (SVC) ##################################################### import numpy as np # ### Look at the discriminating weights coef = clf.coef_ # reverse feature selection coef = feature_selection.inverse_transform(coef) # reverse masking coef = masking.unmask(coef[0], mask) # # We use a masked array so that the voxels at '-1' are displayed # # transparently act = np.ma.masked_array(coef, coef == 0) plot_haxby(act, bg_img, 'SVC', slice=29) # save statistical map as nifti image img = nibabel.Nifti1Image(act, np.eye(4)) img.to_filename('output_stats_svc.nii.gz')
def lesson_4():
    """Run the feature-selection lesson on the Kickstarter projects data.

    Reads the CSV at ``ks_projects_file_path``, builds the baseline feature
    table (timestamp parts, label-encoded categoricals, pairwise categorical
    interactions, a 7-day launch count and the time since the previous
    project in the same category), and then demonstrates two selection
    techniques: univariate selection with ``SelectKBest(f_classif, k=5)`` and
    L1-regularised logistic regression wrapped in ``SelectFromModel``.

    Everything is reported through ``print_``; the function returns ``None``.
    """
    print_("Lesson 4: Feature Selection", 0, 1)

    ks = pd.read_csv(ks_projects_file_path,
                     parse_dates=['deadline', 'launched'])

    # Drop live projects
    ks = ks.query('state != "live"')

    # Add outcome column, "successful" == 1, others are 0
    ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

    # Timestamp features
    ks = ks.assign(hour=ks.launched.dt.hour,
                   day=ks.launched.dt.day,
                   month=ks.launched.dt.month,
                   year=ks.launched.dt.year)

    # Label encoding (apply() refits the encoder on each column in turn)
    cat_features = ['category', 'currency', 'country']
    encoder = LabelEncoder()
    encoded = ks[cat_features].apply(encoder.fit_transform)

    data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
    baseline_data = ks[data_cols].join(encoded)

    # Interaction features: one label-encoded column per pair of categoricals
    interactions = pd.DataFrame(index=ks.index)
    for col1, col2 in itertools.combinations(cat_features, 2):
        new_col_name = '_'.join([col1, col2])
        # Convert to strings and combine
        new_values = ks[col1].map(str) + "_" + ks[col2].map(str)
        label_enc = LabelEncoder()
        interactions[new_col_name] = label_enc.fit_transform(new_values)
    baseline_data = baseline_data.join(interactions)

    # Rolling count over a 7-day window ending at each launch; "- 1" removes
    # the project itself from its own count.
    launched = pd.Series(ks.index, index=ks.launched,
                         name="count_7_days").sort_index()
    count_7_days = launched.rolling('7d').count() - 1
    # Swap the timestamp index back for the original row labels so the
    # feature can be joined onto baseline_data.
    count_7_days.index = launched.values
    count_7_days = count_7_days.reindex(ks.index)
    baseline_data = baseline_data.join(count_7_days)

    def time_since_last_project(series):
        """Return the gap to the previous entry of *series*, in hours."""
        return series.diff().dt.total_seconds() / 3600.

    df = ks[['category', 'launched']].sort_values('launched')
    timedeltas = df.groupby('category').transform(time_since_last_project)
    # The first project of each category has no predecessor (NaN); fill the
    # gap with the largest observed value.
    timedeltas = timedeltas.fillna(timedeltas.max())
    baseline_data = baseline_data.join(
        timedeltas.rename({'launched': 'time_since_last_project'}, axis=1))

    def get_data_splits(dataframe, valid_fraction=0.1):
        """Split *dataframe* positionally into train, valid and test parts.

        The last two chunks, each ``int(len(dataframe) * valid_fraction)``
        rows long, become the validation and test sets (in that order);
        everything before them is the training set.
        """
        n_rows = len(dataframe)
        valid_size = int(n_rows * valid_fraction)
        # Explicit non-negative bounds: the negative-index form
        # ``dataframe[:-0]`` would return an empty training set whenever
        # valid_size rounds down to 0.
        valid_start = max(n_rows - 2 * valid_size, 0)
        test_start = n_rows - valid_size

        train = dataframe.iloc[:valid_start]
        # valid size == test size, last two sections of the data
        valid = dataframe.iloc[valid_start:test_start]
        test = dataframe.iloc[test_start:]
        return train, valid, test

    def train_model(train, valid):
        """Train a LightGBM binary classifier and print its validation AUC.

        Not called by this lesson.
        NOTE(review): ``early_stopping_rounds`` / ``verbose_eval`` are
        presumably rejected by newer LightGBM releases in favour of
        callbacks — confirm against the installed version before using this.
        """
        feature_cols = train.columns.drop('outcome')

        dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
        dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

        param = {
            'num_leaves': 64,
            'objective': 'binary',
            'metric': 'auc',
            'seed': 7
        }
        print("Training model!")
        bst = lgb.train(param, dtrain, num_boost_round=1000,
                        valid_sets=[dvalid], early_stopping_rounds=10,
                        verbose_eval=False)

        valid_pred = bst.predict(valid[feature_cols])
        valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)
        print(f"Validation AUC score: {valid_score:.4f}")
        return bst

    # ----------------------------
    # Univariate Feature Selection
    # ----------------------------
    feature_cols = baseline_data.columns.drop('outcome')

    # Keep 5 features
    selector = SelectKBest(f_classif, k=5)

    # NOTE: we should select features using only a training set, not the whole
    # dataset we are doing here (which will be fixed next)
    X_new = selector.fit_transform(baseline_data[feature_cols],
                                   baseline_data['outcome'])
    print_("X_new (after selecting 5 best features)", 0)
    print_(X_new)

    # Fix: select features using only a training set
    train, valid, _ = get_data_splits(baseline_data)

    # Keep 5 features
    selector = SelectKBest(f_classif, k=5)
    X_new = selector.fit_transform(train[feature_cols], train['outcome'])
    print_("X_new FIXED [Using Train Only]", 0)
    print_(X_new)

    # Get back the features we've kept, zero out all other features
    selected_features = pd.DataFrame(selector.inverse_transform(X_new),
                                     index=train.index,
                                     columns=feature_cols)
    print_(
        "First 5 rows from the train set including the 5 best features only (others set at 0)",
        0)
    print_(selected_features.head())

    # Ask the selector which columns it kept. Testing for non-zero variance
    # would also discard a selected feature that is constant in the train set.
    selected_columns = feature_cols[selector.get_support()]

    # Get the valid dataset with the selected features.
    print_("Valid dataset with the selected features only", 0)
    print_(valid[selected_columns].head())

    # -----------------
    # L1 regularization
    # -----------------
    # baseline_data has not changed, so the train/valid split above is reused.
    X, y = train[feature_cols], train['outcome']

    # Set the regularization parameter C=1
    logistic = LogisticRegression(C=1, penalty="l1", solver='liblinear',
                                  random_state=7).fit(X, y)
    model = SelectFromModel(logistic, prefit=True)

    X_new = model.transform(X)
    print_("X_new with L1 regularization", 0)
    print_(X_new)

    # Columns kept by the L1 model, read from its support mask; the rest are
    # reported as rejected.
    selected_columns = X.columns[model.get_support()]
    print_("Rejected columns: {}".format(
        X.columns.difference(selected_columns).to_list()))

    # Get the valid dataset with the selected features.
    print_("Valid dataset with the selected features using L1 regularization",
           0)
    print_(valid[selected_columns].head())