def feature_selection_k(k, X, y, model_name=None, dataset_name=None):
    """Fit a k-feature selector and report its runtime and reconstruction error.

    Parameters
    ----------
    k : int
        Number of features to keep.
    X, y : array-like
        Training data and labels (X is assumed to support elementwise ops,
        e.g. a DataFrame or ndarray).
    model_name : str, optional
        "RFE" for recursive feature elimination or "FSFS" for forward
        sequential feature selection; both wrap a DecisionTreeClassifier.
    dataset_name : str, optional
        Label recorded verbatim in the returned summary dict.

    Returns
    -------
    tuple
        (summary dict with runtime and reconstruction error, fitted selector).

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported selectors.
    """
    if model_name == "RFE":
        model = RFE(n_features_to_select=k, estimator=DecisionTreeClassifier())
    elif model_name == "FSFS":
        model = SequentialFeatureSelector(
            n_features_to_select=k,
            estimator=DecisionTreeClassifier(),
            direction="forward",
        )
    else:
        # Previously an unrecognized name fell through and crashed later with
        # an UnboundLocalError on ``model``; fail fast with a clear message.
        raise ValueError(f"Unsupported model_name: {model_name!r}")

    # CPU time only (process_time excludes sleep/IO wait).
    start = time.process_time()
    X_trans = model.fit_transform(X, y)
    end = time.process_time()
    runtime = end - start

    # Reconstruction error: map back to the original feature space (dropped
    # columns come back zero-filled) and take the mean squared error.
    X_re = model.inverse_transform(X_trans)
    error = ((X_re - X) ** 2).mean().mean()

    return {
        "dataset": dataset_name,
        "model": model_name,
        "k": k,
        "runtime": runtime,
        "reconstruction_error": error,
    }, model
def rfe(dataset, n_components, save_to_file=False):
    """Run RFE (linear SVR) on the chosen dataset.

    Loads the creditcard data when ``dataset == 'creditcard'``, otherwise the
    cancer data. Returns the mean absolute kurtosis of the selected features
    and the per-sample reconstruction error; optionally writes the reduced
    data plus labels to ``OUTPUT_DIR/<dataset>.csv``.
    """
    if dataset == 'creditcard':
        X, y = load_data.load_creditcard_data()
    else:
        X, y = load_data.load_cancer_data()

    selector = RFE(SVR(kernel="linear"), n_features_to_select=n_components, step=1)
    selector = selector.fit(X, y)
    reduced = selector.transform(X)

    # Average magnitude of the kurtosis across the retained feature columns.
    kurt = pd.DataFrame(reduced).kurt(axis=0).abs().mean()

    # Reconstruction error: Frobenius norm of the residual, scaled per sample.
    restored = selector.inverse_transform(reduced)
    reconstruction_error = np.linalg.norm(X - restored) / X.shape[0]

    if save_to_file:
        frame = pd.DataFrame(reduced)
        frame['label'] = y.values
        frame.to_csv(OUTPUT_DIR + os.sep + dataset + '.csv')

    return kurt, reconstruction_error
def myFS(data, act_labels, output_folder, experiment_name, avg='binary',
         max_features=25):
    """Sweep RFE component counts, plot recall/reconstruction curves, refit best k.

    For k = 1..min(max_features, n_features), fits RFE(DecisionTree) on a 90/10
    train/test split, recording test recall and train-set reconstruction MSE.
    Saves an error-bar plot of each metric to ``output_folder``, then refits RFE
    once at the voted-best k and times that final fit.

    Parameters
    ----------
    data : DataFrame
        Feature matrix (``.values`` is used for the MSE computation).
    act_labels : array-like
        Target labels.
    output_folder, experiment_name : str
        Where and under what prefix to save the two PNG plots.
    avg : str
        ``average`` argument forwarded to ``recall_score``.
    max_features : int
        Upper bound on the number of components swept (default 25, matching
        the previous hard-coded cap).

    Returns
    -------
    tuple
        (fitted RFE selector at the chosen k, wall-clock seconds of that fit).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data, act_labels, test_size=0.1, random_state=13)

    # Sweep k = 1..num_features; clamp the cap so we never ask RFE for more
    # features than the data actually has.
    num_features = min(max_features, data.shape[1])
    values = list(range(1, num_features + 1))
    rn = np.random.RandomState(13)
    random_seeds = list(rn.randint(1, 1000000, 1))

    # One fitted RFE per (seed, k) gives us both metrics — the old code fit
    # every identical model twice (once per metric loop).
    recalls = []
    errors = []
    for seed in random_seeds:
        recall_row = []
        mse_row = []
        for k in values:
            estimator = DecisionTreeClassifier(random_state=seed)
            fs = RFE(estimator, n_features_to_select=k, step=1).fit(X_train, y_train)
            recall_row.append(recall_score(y_test, fs.predict(X_test), average=avg))
            rec_data = fs.inverse_transform(fs.transform(X_train))
            mse_row.append(MSE(rec_data, X_train.values))
        recalls.append(recall_row)
        errors.append(mse_row)

    avg_recall = np.mean(np.array(recalls), axis=0)
    recall_std = np.std(np.array(recalls), axis=0)

    # Plot the average recall for each k with error bars. The title reports
    # the real iteration count (the old one hard-coded "20").
    plt.errorbar(values, avg_recall, recall_std)
    plt.xticks(ticks=values, labels=values)
    plt.xlabel('# Components')
    plt.ylabel('Recall Score')
    plt.title('Average Recall Score for K Components Over %d Iterations'
              % len(random_seeds))
    plt.savefig(output_folder + '/' + experiment_name + '_fs_component_recall_score.png')
    plt.close()
    plt.figure()

    avg_errors = np.mean(np.array(errors), axis=0)
    std_errors = np.std(np.array(errors), axis=0)

    # Plot reconstruction errors; ticks now match the 1-based k values
    # (previously the ticks were 0-based while the labels were 1-based).
    plt.errorbar(values, avg_errors, std_errors)
    plt.xticks(ticks=values, labels=values)
    plt.xlabel('# Components')
    plt.ylabel('Reconstruction Error')
    plt.title('Average Reconstruction Error for K Components')
    plt.savefig(output_folder + '/' + experiment_name + '_fs_component_reconstruction_error.png')
    plt.close()
    plt.figure()

    # Vote on the best k: the argmax of the mean recall and of its +/- one-std
    # envelopes each cast one vote; ties go to the smallest index via argmax.
    votes = [
        np.argmax(avg_recall),
        np.argmax(avg_recall + recall_std),
        np.argmax(avg_recall - recall_std),
    ]
    results = np.zeros(num_features)
    for v in votes:
        results[v] = results[v] + 1
    # argmax yields a 0-based index into ``values`` — add 1 to recover the
    # actual component count (the old code passed the raw index to RFE,
    # selecting one feature too few and crashing when the index was 0).
    k = np.argmax(results) + 1

    # Refit once at the chosen k and time the fit (wall-clock).
    estimator = DecisionTreeClassifier(random_state=13)
    start_time = time.time()
    fs = RFE(estimator, n_features_to_select=k, step=1).fit(X_train, y_train)
    end_time = time.time()
    final_time = end_time - start_time
    return fs, final_time
print('\n')
# Split df into features (all but last column) and target (last column).
X = df.iloc[:, 0:-1].values
y = df.iloc[:, -1].values
regr = linear_model.LinearRegression()
estimator = linear_model.LinearRegression()

# Feature ranking with recursive feature elimination.
# n_features_to_select is keyword-only in modern scikit-learn; the old
# positional form RFE(estimator, 4) no longer works.
selector = RFE(estimator, n_features_to_select=4)
selector.fit(X, y)
print(selector.n_features_)
print('\n')
print(selector.support_)   # boolean mask of the selected features
print('\n')
print(selector.ranking_)   # rank 1 == selected; higher == eliminated earlier
print('\n')
p = selector.transform(X)
q = selector.inverse_transform(p)

# Pair each feature name with its RFE rank, then sort so the best-ranked
# (rank 1) features come first.
l = []
for i in range(len(df.columns) - 1):
    l.append((selector.ranking_[i], df.columns[i]))
print(l)
print('\n')
l.sort()
print(l)
print('\n')
# Print the names of the 4 selected features.
for i in range(4):
    print(l[i][1])
print('\n')

# Using SelectKBest
print("Using SelectKBest:")
import numpy
from sklearn.feature_selection import SelectKBest