import numpy as np
from skrebate import ReliefF

def relieff(X_std_train, X_std_test, y_train, n_features, colNames, features):
    '''
    Feature selection using ReliefF.

    :param ndarray X_std_train: Training data
    :param ndarray X_std_test: Validation data
    :param ndarray y_train: Response to the training data
    :param int n_features: Number of features to be selected
    :param colNames: List with the names of the columns/features
    :param features: List that the selected features will be added to
    :return: The training data and validation data with only the selected
        features, and the list with the features
    '''
    relieff = ReliefF(n_features_to_select=n_features, n_neighbors=20)
    relieff.fit(X_std_train, y_train)
    importances = relieff.feature_importances_
    # Rank all features from most to least important.
    indices = np.argsort(importances)[::-1]
    feature_names = []
    for f in range(X_std_train.shape[1]):
        feature_names.append(colNames[indices[f]])
    print(feature_names[0:n_features])
    # Keep only the n_features highest-ranked columns.
    X_std_train = X_std_train[:, indices[0:n_features]]
    X_std_test = X_std_test[:, indices[0:n_features]]
    features.append(feature_names[0:n_features])
    return (X_std_train, X_std_test, features)
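# A minimal usage sketch (not from the original source) for relieff() above,
# on synthetic data; all names below are illustrative assumptions.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
col_names = ['f%d' % i for i in range(X.shape[1])]
X_tr_sel, X_te_sel, kept = relieff(X_tr, X_te, y_tr, 5, col_names, [])
print(kept)  # a list holding the list of 5 selected feature names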
def predict_features(self, df_features, df_target, idx=0, **kwargs):
    # .as_matrix() was removed in recent pandas; use .to_numpy() instead.
    X = df_features.to_numpy()
    y = df_target.to_numpy()[:, 0]
    rr = ReliefF()
    rr.fit(X, y)
    return rr.feature_importances_
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from skrebate import ReliefF

def svm_ga(X, y, rfe=True, paramgrid=None):
    # Feature selection: either ReliefF wrapped in recursive feature
    # elimination, or plain ReliefF.
    fltr = (RFE(ReliefF(), n_features_to_select=5, step=0.5) if rfe
            else ReliefF(n_features_to_select=5, n_neighbors=3))
    clf = SVC()
    param_grid = {
        'svc__kernel': ['rbf'],
        'svc__C': [10e-2, 10e-1, 10, 10e1, 10e2, 10e3, 10e4],
        'svc__gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 1, 1.1],
    } if paramgrid is None else paramgrid
    # Make pipeline: standardize, select features, classify.
    pipe = make_pipeline(preprocessing.StandardScaler(), fltr, clf)
    # EvolutionaryAlgorithmSearchCV comes from the sklearn-deap package.
    from evolutionary_search import EvolutionaryAlgorithmSearchCV
    cv = EvolutionaryAlgorithmSearchCV(estimator=pipe, params=param_grid,
                                       scoring="accuracy", cv=10, verbose=1,
                                       population_size=50,
                                       gene_mutation_prob=0.1,
                                       gene_crossover_prob=0.8,
                                       tournament_size=10,
                                       generations_number=25)
    cv.fit(X, y)
    print(cv.best_params_)
    print(cv.best_score_)
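# A hypothetical call to svm_ga() on synthetic data (rfe=False skips the RFE
# wrapper). Assumes the sklearn-deap package providing evolutionary_search is
# installed; the data here is illustrative only.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, n_features=20, n_informative=5,
                           random_state=0)
svm_ga(X, y, rfe=False)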
import pandas as pd

def Relief(df, n, n_jobs, save_name):
    """Feature selection using Relief on the whole dataframe."""
    from skrebate import ReliefF
    X_all = df.drop('Class', axis=1).values
    Y_all = df.loc[:, 'Class'].values
    Y_all = Y_all.astype('int')
    feature_names = list(df)
    feature_names.remove('Class')
    print("=====* Running relief/rebate based feature selection *=====")
    # Set selection to relief
    fs = ReliefF(n_jobs=int(n_jobs))
    fs.fit(X_all, Y_all)
    imp = pd.DataFrame(fs.feature_importances_, index=feature_names,
                       columns=['relief_imp'])
    imp_top = imp.sort_values(by='relief_imp', ascending=False)
    # Save a top-n feature list for every requested size in n.
    for n_size in n:
        keep = imp_top.index.values[0:int(n_size)]
        print("Features selected using Relief from rebate: %s" % str(keep))
        save_name2 = save_name + "_" + str(n_size)
        SaveTopFeats(keep, save_name2)
def predict_features(self, df_features, df_target, idx=0, **kwargs):
    X = df_features.values
    y = df_target.values[:, 0]
    rr = ReliefF()
    rr.fit(X, y)
    return rr.feature_importances_
import numpy as np
from skrebate import ReliefF

def ReliefF_Method(X, y, n):
    X = np.array(X)
    y = np.asarray(y)
    y = y[:, 0]
    clf = ReliefF(n_features_to_select=n, n_neighbors=100)
    # fit_transform returns X reduced to the n top-ranked features.
    Reresult = clf.fit_transform(X, y)
    np.savetxt("ReliefF_out.csv", Reresult, delimiter=",")
    return None
from skrebate import ReliefF

def relf(n_neb, n_feat, trainx, trainy, testx):
    # testx is accepted but not used here.
    fs = ReliefF(n_features_to_select=n_feat, n_neighbors=n_neb,
                 discrete_threshold=10, n_jobs=1)
    fs.fit(trainx, trainy)
    # Note: transform() returns the reduced feature matrix, not indices.
    ind = fs.transform(trainx)
    return ind
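# A small sketch calling relf() above on synthetic discrete data; despite the
# name `ind`, the returned value is the reduced training matrix. Names and
# sizes are illustrative assumptions.
import numpy as np

rng = np.random.default_rng(0)
trainx = rng.integers(0, 3, size=(100, 12)).astype(float)
trainy = rng.integers(0, 2, size=100)
testx = rng.integers(0, 3, size=(20, 12)).astype(float)
reduced = relf(5, 4, trainx, trainy, testx)
print(reduced.shape)  # (100, 4)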
import numpy as np
import pandas as pd
from skrebate import ReliefF

def ReliefF_Method(X, y, n):
    X = np.array(X)
    y = np.array(y)
    y = y[:, 0]
    clf = ReliefF(n_features_to_select=n, n_neighbors=50)
    Reresult = clf.fit_transform(X, y)
    Reresult = pd.DataFrame(Reresult)
    Reresult.to_csv("ReliefF_out.csv")
    return None
def test_relief(self):
    n = 10
    x = np.random.randint(n, size=(n, 6))
    y = np.random.randint(n, size=n)
    # print(y)
    print(_DefaultMeasures.reliefF_measure(x, y, 6))
    # skrebate
    R = ReliefF()
    R.fit(x, y)
    print(R.feature_importances_)
import math
import numpy as np
from skrebate import ReliefF

def select_relieff(X, y, percentile=10):
    unique, counts = np.unique(y, return_counts=True)
    # Number of features to keep: `percentile` percent of the row count.
    num = math.ceil(X.shape[0] * percentile / 100)
    # Use the smallest class size as the neighbor count, capped at 100.
    k = np.min(counts)
    if k > 100:
        k = 100
    selector = ReliefF(n_features_to_select=num, n_neighbors=k,
                       discrete_threshold=3, n_jobs=-1)
    selector.fit(X, y)
    return selector
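# Sketch of using the fitted selector returned by select_relieff(); its
# transform() keeps only the top-ranked features. Synthetic, illustrative data.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, n_features=40, random_state=0)
sel = select_relieff(X, y, percentile=10)
print(sel.transform(X).shape)  # (300, 30): ceil(300 * 10 / 100) features kept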
import random
import pandas as pd
from skrebate import ReliefF

def importance_relieff(X, y, n_features_to_select, n_neighbors, sample_rows,
                       encoder=None, plot=True):
    """Apply the ReliefF algorithm to our dataframe.

    Args:
        X (DataFrame): Independent variables
        y (Series): Dependent variable or target
        n_features_to_select (int): Number of features to be in the resulting DataFrame
        n_neighbors (int): Number of neighbors to be considered for the model
        sample_rows (int): Number of sample rows
        encoder (obj, optional): Object of type 'ReliefF'. Defaults to None.
        plot (bool, optional): Controls whether to show 'plot_importance'. Defaults to True.

    Returns:
        tuple: The reduced DataFrame and the fitted encoder
    """
    # Work on a random subsample of rows to keep ReliefF tractable.
    sample = random.sample(list(X.index), sample_rows)
    sample_features = X.iloc[sample, :].to_numpy()
    sample_labels = y.iloc[sample].to_numpy()
    if encoder is None:
        encoder = ReliefF(n_features_to_select=n_features_to_select,
                          n_neighbors=n_neighbors)
        encoder.fit(sample_features, sample_labels)
    my_important_features = encoder.transform(sample_features)
    print("No. of tuples, No. of Columns before ReliefF : " +
          str(sample_features.shape) +
          "\nNo. of tuples, No. of Columns after ReliefF : " +
          str(my_important_features.shape))
    # Plot the importances, taken from the `encoder` variable.
    # (plot_importance is a project helper defined elsewhere.)
    if plot:
        plot_importance(X.columns, abs(encoder.feature_importances_))
    # Get the most important column names
    my_important_features_names = [X.columns[i] for i in encoder.top_features_]
    # Create a DataFrame
    X = pd.DataFrame(
        X, columns=my_important_features_names[:my_important_features.shape[1]])
    return X, encoder
class skrebateTransformer(BaseEstimator, TransformerMixin, Transformation):
    def __init__(self, number_parent_features, output_dimensions):
        Transformation.__init__(self, 'skrebate',
                                number_parent_features,
                                output_dimensions=output_dimensions,
                                parent_feature_order_matters=False,
                                parent_feature_repetition_is_allowed=False)
        # self.model = MultiSURF(n_features_to_select=output_dimensions)
        # self.model = SURF(n_features_to_select=output_dimensions)
        self.model = ReliefF(n_features_to_select=output_dimensions,
                             n_neighbors=100)

    def fit(self, X, y=None):
        return self.model.fit(X, y)

    def transform(self, data):
        return self.model.transform(data)
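# The custom Transformation base class is not shown in this snippet, so this
# sketch exercises only the wrapped skrebate model, which is an ordinary
# fit/transform estimator. Data is synthetic and illustrative.
from skrebate import ReliefF
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, n_features=20, random_state=0)
model = ReliefF(n_features_to_select=5, n_neighbors=100)
model.fit(X, y)
print(model.transform(X).shape)  # (300, 5)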
import numpy as np
from skrebate import ReliefF

def relieff(X_std_train, X_std_test, y_train, n_features, NyNames):
    relieff = ReliefF(n_features_to_select=n_features, n_neighbors=20)
    relieff.fit(X_std_train, y_train)
    importances = relieff.feature_importances_
    # Rank features from most to least important.
    indices = np.argsort(importances)[::-1]
    feature_names = []
    for f in range(X_std_train.shape[1]):
        feature_names.append(NyNames[indices[f]])
    print('Features', feature_names[0:n_features])
    X_std_train = X_std_train[:, indices[0:n_features]]
    X_std_test = X_std_test[:, indices[0:n_features]]
    return (X_std_train, X_std_test)
import numpy as np
from skrebate import ReliefF

def relieff_fs(X_df, X_train_all, X_test_all, y_train):
    '''ReliefF for feature selection'''
    fs = ReliefF(discrete_threshold=1000, n_jobs=1)
    fs.fit(X_train_all, y_train)
    feature_scores = fs.feature_importances_
    # Keep every feature with a non-negative ReliefF score.
    feature_ids = np.where(feature_scores >= 0)[0]
    selected_features = np.array(X_df.columns[feature_ids])
    # New X_train and X_test matrices
    X_train = X_train_all[:, feature_ids]
    X_test = X_test_all[:, feature_ids]
    return selected_features, feature_scores, X_train, X_test
def fit(self, X, y=None, **kwargs):
    X, y = self.check_X_y(X, y)
    self.check_params(X, y)
    selector = ReliefF(
        n_neighbors=self.num_neighbors,
        n_features_to_select=self.num_features,
    )
    selector.fit(X, y)
    # top_features_ is sorted by importance; keep the leading indices.
    _support = selector.top_features_[:self.num_features]
    self.support = self.check_support(_support)
    return self
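# Standalone sketch of the selection step inside fit() above: rank features
# with ReliefF and keep the indices of the highest-scoring ones. The check_*
# helpers belong to the surrounding class and are omitted here.
from skrebate import ReliefF
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=15, random_state=0)
selector = ReliefF(n_neighbors=20, n_features_to_select=4)
selector.fit(X, y)
support = selector.top_features_[:4]  # indices of the 4 top-ranked features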
def test_relieff_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): ReliefF works with pandas DataFrame and Series inputs"""
    np.random.seed(49082)
    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features_df, labels_s, cv=3, n_jobs=-1)) > 0.7

def test_relieffpercent_pipeline():
    """Check: Data (Binary Endpoint, Discrete Features): ReliefF with % neighbors works in a sklearn pipeline"""
    np.random.seed(49082)
    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=0.1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features, labels, cv=3, n_jobs=-1)) > 0.7

def test_relieff_pipeline_mixed_attributes():
    """Check: Data (Mixed Attributes): ReliefF works in a sklearn pipeline"""
    np.random.seed(49082)
    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features_mixed_attributes,
                                   labels_mixed_attributes, cv=3, n_jobs=-1)) > 0.7

def test_relieffpercent_pipeline_parallel():
    """Ensure that ReliefF with % neighbors works in a sklearn pipeline where cross_val_score is parallelized"""
    np.random.seed(49082)
    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=0.1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features, labels, cv=3, n_jobs=-1)) > 0.7

def test_relieff_pipeline_cont_endpoint():
    """Check: Data (Continuous Endpoint): ReliefF works in a sklearn pipeline"""
    np.random.seed(49082)
    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
                        RandomForestRegressor(n_estimators=100, n_jobs=-1))
    assert abs(np.mean(cross_val_score(clf, features_cont_endpoint,
                                       labels_cont_endpoint, cv=3, n_jobs=-1))) < 0.5

def test_relieff_pipeline_parallel():
    """Check: Data (Binary Endpoint, Discrete Features): ReliefF works in a sklearn pipeline when ReliefF is parallelized"""
    # Note that the rebate algorithm cannot be parallelized with both the
    # random forest and the cross-validation all at once. If the rebate
    # algorithm is parallelized, the cross-validation scoring cannot be.
    np.random.seed(49082)
    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10, n_jobs=-1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features, labels, cv=3)) > 0.7
def predict_features(self, df_features, df_target, idx=0, **kwargs):
    """For one variable, predict its neighbouring nodes.

    Args:
        df_features (pandas.DataFrame): input features
        df_target (pandas.Series): target variable
        idx (int): (optional) for printing purposes
        kwargs (dict): additional options for algorithms

    Returns:
        list: scores of each feature relative to the target
    """
    X = df_features.values
    y = df_target.values[:, 0]
    rr = ReliefF()
    rr.fit(X, y)
    return rr.feature_importances_
from operator import itemgetter
from skrebate import ReliefF

def processing_relieff(df, n_components):
    features_selected = ReliefF()
    x, y = df.drop('DX', axis=1).values, df['DX'].values
    features_selected.fit(x, y)
    # Map each feature name to its ReliefF importance score.
    relief_dict = dict(zip(df.drop('DX', axis=1).columns,
                           features_selected.feature_importances_))
    top_features = dict(
        sorted(relief_dict.items(), key=itemgetter(1),
               reverse=True)[:n_components]).keys()
    top_features = list(top_features)
    if 'DX' not in top_features:
        top_features.append('DX')
    return df[top_features], top_features
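# Toy call to processing_relieff() above, assuming a DataFrame whose label
# column is named 'DX'; the feature names here are made up.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
df = pd.DataFrame(rng.normal(size=(400, 5)), columns=list('ABCDE'))
df['DX'] = rng.integers(0, 2, size=400)
df_top, top_features = processing_relieff(df, n_components=3)
print(top_features)  # three top-ranked columns plus 'DX'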
def test_relieffpercent_pipeline_missing_values():
    """Ensure that ReliefF with % neighbors works in a sklearn pipeline with missing values"""
    np.random.seed(49082)
    # sklearn's Imputer was removed in 0.22; SimpleImputer is its replacement.
    clf = make_pipeline(
        ReliefF(n_features_to_select=2, n_neighbors=0.1, n_jobs=-1),
        SimpleImputer(),
        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(
        cross_val_score(
            clf, features_missing_values, labels_missing_values, cv=3)) > 0.7
def test_relieff_pipeline_multiclass():
    """Ensure that ReliefF works in a sklearn pipeline with a multiclass endpoint"""
    np.random.seed(49082)
    # sklearn's Imputer was removed in 0.22; SimpleImputer is its replacement.
    clf = make_pipeline(
        ReliefF(n_features_to_select=2, n_neighbors=10, n_jobs=-1),
        SimpleImputer(),
        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(
        cross_val_score(clf, features_multiclass, labels_multiclass, cv=3)) > 0.7
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from skrebate import ReliefF

def svm_cv(X, y, rfe=True, paramgrid=None):
    """
    :param X: feature matrix
    :param y: target labels
    :param rfe: unused in this implementation; the RFE wrapper is always applied
    :param paramgrid: optional custom parameter grid for the SVC
    :return: None; the best parameters and scores are printed
    """
    norm = preprocessing.StandardScaler()
    # feature selection
    fltr = RFE(ReliefF(), n_features_to_select=5, step=1)
    # predictive model
    model = SVC()
    # make pipeline
    pipe = make_pipeline(norm, fltr, model)
    param_grid = {
        'svc__kernel': ['rbf'],
        'svc__C': [1, 10, 10e1, 10e2, 10e3, 10e4],
        'svc__gamma': [0.1, 0.2, 0.3, 0.4, 0.5]
    } if paramgrid is None else paramgrid
    scores = ['accuracy']
    kf = KFold(n_splits=10, shuffle=True, random_state=4)
    # kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()
        clf = GridSearchCV(pipe, param_grid, cv=kf, n_jobs=2, scoring=score,
                           return_train_score=False, verbose=10)
        clf.fit(X, y)
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print(clf.best_score_)
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print()
def test_relieff_pipeline_multiclass():
    """Check: Data (Multiclass Endpoint): ReliefF works in a sklearn pipeline"""
    np.random.seed(49082)
    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
                        SimpleImputer(),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(
        cross_val_score(
            clf, features_multiclass, labels_multiclass, cv=3, n_jobs=-1)) > 0.7
import numpy as np

def feature_selection_relief(self, feature_train, label_train, feature_test,
                             n_features_to_select=None):
    """
    This function is used to select features using relief-based
    feature selection algorithms.
    """
    from skrebate import ReliefF
    [n_sub, n_features] = np.shape(feature_train)
    # NOTE: np.int / np.float were removed from recent NumPy; use the builtins.
    # Default to keeping roughly 10% of the features; a float argument is
    # interpreted as a fraction of the feature count.
    if n_features_to_select is None:
        n_features_to_select = int(np.round(n_features / 10))
    if isinstance(n_features_to_select, float):
        n_features_to_select = int(np.round(n_features * n_features_to_select))
    fs = ReliefF(n_features_to_select=n_features_to_select,
                 n_neighbors=100, discrete_threshold=10, verbose=True,
                 n_jobs=-1)
    fs.fit(feature_train, label_train)
    feature_train = fs.transform(feature_train)
    feature_test = fs.transform(feature_test)
    mask = fs.top_features_[:n_features_to_select]
    return feature_train, feature_test, mask, n_features
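# feature_selection_relief() above is an instance method; the equivalent
# standalone steps, on synthetic train/test splits, look roughly like this.
from skrebate import ReliefF
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, n_features=30, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
fs = ReliefF(n_features_to_select=3, n_neighbors=100,
             discrete_threshold=10, n_jobs=-1)
fs.fit(X_tr, y_tr)
X_tr_sel, X_te_sel = fs.transform(X_tr), fs.transform(X_te)
mask = fs.top_features_[:3]  # indices of the selected features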
def test_relieff_init():
    """Check: ReliefF constructor stores custom values correctly"""
    clf = ReliefF(n_features_to_select=7,
                  n_neighbors=500,
                  discrete_threshold=20,
                  verbose=True,
                  n_jobs=3)
    assert clf.n_features_to_select == 7
    assert clf.n_neighbors == 500
    assert clf.discrete_threshold == 20
    assert clf.verbose == True
    assert clf.n_jobs == 3