Example #1
import numpy as np
from skrebate import ReliefF


def relieff(X_std_train, X_std_test, y_train, n_features, colNames, features):
    '''
    Feature selection using ReliefF.

    :param numpy.ndarray X_std_train: Training data
    :param numpy.ndarray X_std_test: Validation data
    :param numpy.ndarray y_train: Labels for the training data
    :param int n_features: Number of features to be selected
    :param list colNames: List with the names of the columns/features
    :param list features: List that the selected features will be added to
    :return: The training data and validation data with only the selected features,
             and the list with the features
    '''
    relieff = ReliefF(n_features_to_select=n_features, n_neighbors=20)
    relieff.fit(X_std_train, y_train)
    importances = relieff.feature_importances_
    indices = np.argsort(importances)[::-1]
    feature_names = []

    for f in range(X_std_train.shape[1]):
        feature_names.append(colNames[indices[f]])
    print(feature_names[0:n_features])
    X_std_train = X_std_train[:, indices[0:n_features]]
    X_std_test = X_std_test[:, indices[0:n_features]]
    features.append(feature_names[0:n_features])
    return (X_std_train, X_std_test, features)
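A minimal usage sketch for the helper above, assuming the imports shown and purely synthetic data; all variable names and shapes here are illustrative, not part of the original snippet:

import numpy as np

rng = np.random.RandomState(0)
X_train, X_test = rng.rand(80, 8), rng.rand(20, 8)  # hypothetical train/test split
y_train = rng.randint(0, 2, 80)                     # hypothetical binary labels
cols = ['f%d' % i for i in range(8)]

X_train_sel, X_test_sel, feats = relieff(
    X_train, X_test, y_train, n_features=3, colNames=cols, features=[])
print(X_train_sel.shape)  # -> (80, 3)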
Example #2
    def predict_features(self, df_features, df_target, idx=0, **kwargs):
        X = df_features.values  # .as_matrix() was removed in pandas 1.0
        y = df_target.values[:, 0]
        rr = ReliefF()
        rr.fit(X, y)

        return rr.feature_importances_
Example #3
    def __init__(self, number_parent_features, output_dimensions):
        Transformation.__init__(self, 'skrebate',
                                number_parent_features, output_dimensions=output_dimensions,
                                parent_feature_order_matters=False, parent_feature_repetition_is_allowed=False)
        # self.model = MultiSURF(n_features_to_select=output_dimensions)
        # self.model = SURF(n_features_to_select=output_dimensions)
        self.model = ReliefF(n_features_to_select=output_dimensions, n_neighbors=100)
Example #4
def svm_ga(X, y, rfe=True, paramgrid=None):

    # feature selection
    fltr = RFE(ReliefF(), n_features_to_select=5,
               step=0.5) if rfe else ReliefF(n_features_to_select=5,
                                             n_neighbors=3)

    clf = SVC()

    param_grid = {
        "svc__kernel": ["rbf"],
        'svc__C': [10e-2, 10e-1, 10, 10e1, 10e2, 10e3, 10e4],
        'svc__gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 1, 1.1]
    } if paramgrid is None else paramgrid

    # make pipeline
    pipe = make_pipeline(preprocessing.StandardScaler(), fltr, clf)

    from evolutionary_search import EvolutionaryAlgorithmSearchCV
    cv = EvolutionaryAlgorithmSearchCV(estimator=pipe,
                                       params=param_grid,
                                       scoring="accuracy",
                                       cv=10,
                                       verbose=1,
                                       population_size=50,
                                       gene_mutation_prob=0.1,
                                       gene_crossover_prob=0.8,
                                       tournament_size=10,
                                       generations_number=25)
    cv.fit(X, y)

    print(cv.best_params_)
    print(cv.best_score_)
Example #5
def Relief(df, n, n_jobs, save_name):
    """Feature selection using Relief on the whole dataframe."""
    import pandas as pd
    from skrebate import ReliefF

    X_all = df.drop('Class', axis=1).values
    Y_all = df.loc[:, 'Class'].values
    Y_all = Y_all.astype('int')

    feature_names = list(df)
    feature_names.remove('Class')
    print("=====* Running relief/rebase based feature selection *=====")

    # Set selection to relief
    fs = ReliefF(n_jobs=int(n_jobs))
    fs.fit(X_all, Y_all)
    imp = pd.DataFrame(fs.feature_importances_,
                       index=feature_names,
                       columns=['relief_imp'])
    imp_top = imp.sort_values(by='relief_imp', ascending=False)

    for n_size in n:
        keep = imp_top.index.values[0:int(n_size)]
        print("Features selected using Relief from rebase: %s" % str(keep))
        save_name2 = save_name + "_" + str(n_size)
        SaveTopFeats(keep, save_name2)
Example #6
    def predict_features(self, df_features, df_target, idx=0, **kwargs):
        X = df_features.values
        y = df_target.values[:, 0]
        rr = ReliefF()
        rr.fit(X, y)

        return rr.feature_importances_
Example #7
def ReliefF_Method(X, y, n):
    X = np.array(X)
    y = np.asarray(y)
    y = y[:, 0]
    clf = ReliefF(n_features_to_select=n, n_neighbors=100)
    Reresult = clf.fit_transform(X, y)
    np.savetxt("ReliefF_out.csv", Reresult, delimiter=",")
    return None
Example #8
def relf(n_neb, n_feat, trainx, trainy, testx):
    # note: testx is accepted but not used in this snippet
    fs = ReliefF(n_features_to_select=n_feat,
                 n_neighbors=n_neb,
                 discrete_threshold=10,
                 n_jobs=1)
    fs.fit(trainx, trainy)
    # transform() returns the training matrix reduced to the selected features
    X_selected = fs.transform(trainx)
    return X_selected
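Because relf returns only the reduced training matrix, the fitted selector is lost and testx cannot be reduced afterwards. A hedged alternative sketch (synthetic data, illustrative names) that keeps the selector so both splits can be transformed:

import numpy as np
from skrebate import ReliefF

rng = np.random.RandomState(1)
trainx, testx = rng.rand(60, 10), rng.rand(15, 10)  # hypothetical data
trainy = rng.randint(0, 2, 60)

fs = ReliefF(n_features_to_select=4, n_neighbors=10, discrete_threshold=10, n_jobs=1)
fs.fit(trainx, trainy)
# transform() keeps only the top-ranked features, for train and test alike
trainx_sel, testx_sel = fs.transform(trainx), fs.transform(testx)
print(trainx_sel.shape, testx_sel.shape)  # -> (60, 4) (15, 4)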
Example #9
def ReliefF_Method(X, y, n):
    X = np.array(X)
    y = np.array(y)
    y = y[:, 0]
    clf = ReliefF(n_features_to_select=n, n_neighbors=50)
    Reresult = clf.fit_transform(X, y)
    Reresult = pd.DataFrame(Reresult)
    Reresult.to_csv("ReliefF_out.csv")
    return None
Example #10
    def test_relief(self):
        n = 10
        x = np.random.randint(n, size=(n, 6))
        y = np.random.randint(n, size=n)
        # print(y)
        print(_DefaultMeasures.reliefF_measure(x, y, 6))
        # skrebate
        R = ReliefF()
        R.fit(x, y)
        print(R.feature_importances_)
Example #11
def select_relieff(X, y, percentile=10):
    # number of features to keep, taken as a percentile of the sample count
    unique, counts = np.unique(y, return_counts=True)
    num = math.ceil(X.shape[0] * percentile / 100)
    # cap n_neighbors at the smallest class size, and at 100
    k = np.min(counts)
    if k > 100:
        k = 100
    selector = ReliefF(n_features_to_select=num,
                       n_neighbors=k,
                       discrete_threshold=3,
                       n_jobs=-1)
    selector.fit(X, y)
    return selector
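A possible call of select_relieff, again with synthetic data (illustrative only); the returned selector exposes top_features_ and transform():

import math
import numpy as np
from skrebate import ReliefF

rng = np.random.RandomState(2)
X = rng.rand(50, 20)
y = rng.randint(0, 2, 50)

selector = select_relieff(X, y, percentile=10)
X_reduced = selector.transform(X)   # keeps the `num` top-ranked features
print(selector.top_features_[:5])   # feature indices, best first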
Example #12
def importance_relieff(X,
                       y,
                       n_features_to_select,
                       n_neighbors,
                       sample_rows,
                       encoder=None,
                       plot=True):
    """Utilization of the algorithm ReliefF in our dataframe

    Args:
        X (DataFrame): Independent variables
        y (Series): Dependen variable or target
        n_features_to_select (int): Number of features to be in the resulting DataFrame
        n_neighbors (int): Number of neighbors to be condered for the model
        sample_rows (int): Number of sample rows
        encoder (obj, optional): Object from the type 'ReliefF'. Defaults to None.
        plot (bool, optional): Controls to show or not the 'plot_importance'. Defaults to True.

    Returns:
        DataFrame: Same as source
    """

    sample = random.sample(list(X.index), sample_rows)
    sample_features = X.iloc[sample, :].to_numpy()
    sample_labels = y.iloc[sample].to_numpy()

    if encoder is None:
        encoder = ReliefF(n_features_to_select=n_features_to_select,
                          n_neighbors=n_neighbors)
        encoder.fit(sample_features, sample_labels)
    my_important_features = encoder.transform(sample_features)

    print("No. of tuples, No. of Columns before ReliefF : " +
          str(sample_features.shape) +
          "\nNo. of tuples, No. of Columns after ReliefF : " +
          str(my_important_features.shape))

    # Plot the importances, taken from the `encoder` variable.
    if plot:
        plot_importance(X.columns, abs(encoder.feature_importances_))

    # Get the most important column names
    my_important_features_names = [
        X.columns[i] for i in abs(encoder.top_features_)
    ]

    # Create a DataFrame
    X = pd.DataFrame(
        X,
        columns=my_important_features_names[:my_important_features.shape[1]])

    return X, encoder
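A sketch of calling importance_relieff on a toy frame; plot=False sidesteps the external plot_importance helper, and all data and names below are illustrative assumptions:

import random
import numpy as np
import pandas as pd
from skrebate import ReliefF

rng = np.random.RandomState(3)
X = pd.DataFrame(rng.rand(40, 6), columns=list('abcdef'))
y = pd.Series(rng.randint(0, 2, 40))

X_top, enc = importance_relieff(X, y, n_features_to_select=3,
                                n_neighbors=10, sample_rows=30, plot=False)
print(X_top.columns.tolist())  # the three columns ReliefF ranked highest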
Example #13
class skrebateTransformer(BaseEstimator, TransformerMixin, Transformation):
    def __init__(self, number_parent_features,  output_dimensions):
        Transformation.__init__(self, 'skrebate',
                 number_parent_features, output_dimensions=output_dimensions,
                 parent_feature_order_matters=False, parent_feature_repetition_is_allowed=False)
        #self.model = MultiSURF(n_features_to_select=output_dimensions)
        #self.model = SURF(n_features_to_select=output_dimensions)
        self.model = ReliefF(n_features_to_select=output_dimensions, n_neighbors=100)

    def fit(self, X, y=None):
        # fit the underlying selector, then return self for sklearn compatibility
        self.model.fit(X, y)
        return self

    def transform(self, data):
        return self.model.transform(data)
Example #14
def relieff(X_std_train, X_std_test, y_train, n_features, NyNames):
    relieff = ReliefF(n_features_to_select=n_features, n_neighbors=20)
    relieff.fit(X_std_train, y_train)
    importances = relieff.feature_importances_

    indices = np.argsort(importances)[::-1]
    feature_names = []

    for f in range(X_std_train.shape[1]):
        feature_names.append(NyNames[indices[f]])
    print('Features', feature_names[0:n_features])
    X_std_train = X_std_train[:, indices[0:n_features]]
    X_std_test = X_std_test[:, indices[0:n_features]]
    return (X_std_train, X_std_test)
Example #15
def relieff_fs(X_df, X_train_all, X_test_all, y_train):
    '''ReliefF for feature selection'''
    fs = ReliefF(discrete_threshold=1000, n_jobs=1)
    fs.fit(X_train_all, y_train)

    feature_scores = fs.feature_importances_
    feature_ids = np.where(feature_scores >= 0)[0]
    selected_features = np.array(X_df.columns[feature_ids])

    # new X_train and X_test matrices with only the selected features
    X_train = X_train_all[:, feature_ids]
    X_test = X_test_all[:, feature_ids]

    return selected_features, feature_scores, X_train, X_test
Example #16
    def fit(self, X, y=None, **kwargs):

        X, y = self.check_X_y(X, y)
        self.check_params(X, y)

        selector = ReliefF(
            n_neighbors=self.num_neighbors,
            n_features_to_select=self.num_features,
        )
        selector.fit(X, y)

        _support = selector.top_features_[:self.num_features]
        self.support = self.check_support(_support)

        return self
Example #17
def test_relieff_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): ReliefF works with pandas DataFrame and Series inputs"""
    np.random.seed(49082)
    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features_df, labels_s, cv=3,
                                   n_jobs=-1)) > 0.7
Example #18
def test_relieffpercent_pipeline():
    """Check: Data (Binary Endpoint, Discrete Features): ReliefF with % neighbors works in a sklearn pipeline"""
    np.random.seed(49082)

    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=0.1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features, labels, cv=3, n_jobs=-1)) > 0.7
Example #19
def test_relieff_pipeline_mixed_attributes():
    """Check: Data (Mixed Attributes): ReliefF works in a sklearn pipeline"""
    np.random.seed(49082)

    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features_mixed_attributes,
                                   labels_mixed_attributes, cv=3, n_jobs=-1)) > 0.7
Example #20
def test_relieffpercent_pipeline_parallel():
    """Ensure that ReliefF with % neighbors works in a sklearn pipeline where cross_val_score is parallelized"""
    np.random.seed(49082)

    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=0.1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features, labels, cv=3,
                                   n_jobs=-1)) > 0.7
Example #21
def test_relieff_pipeline_cont_endpoint():
    """Check: Data (Continuous Endpoint): ReliefF works in a sklearn pipeline"""
    np.random.seed(49082)

    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
                        RandomForestRegressor(n_estimators=100, n_jobs=-1))

    assert abs(np.mean(cross_val_score(clf, features_cont_endpoint,
                                       labels_cont_endpoint, cv=3, n_jobs=-1))) < 0.5
Example #22
def test_relieff_pipeline_parallel():
    """Check: Data (Binary Endpoint, Discrete Features): ReliefF works in a sklearn pipeline when ReliefF is parallelized"""
    # Note that the rebate algorithm cannot be parallelized with both the random
    # forest and the cross-validation all at once. If the rebate algorithm is
    # parallelized, the cross-validation scoring cannot be.
    np.random.seed(49082)

    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10, n_jobs=-1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features, labels, cv=3)) > 0.7
Example #23
    def predict_features(self, df_features, df_target, idx=0, **kwargs):
        """For one variable, predict its neighbouring nodes.

        Args:
            df_features (pandas.DataFrame): feature matrix
            df_target (pandas.Series): target variable
            idx (int): (optional) for printing purposes
            kwargs (dict): additional options for algorithms

        Returns:
            list: scores of each feature relative to the target
        """
        X = df_features.values
        y = df_target.values[:, 0]
        rr = ReliefF()
        rr.fit(X, y)

        return rr.feature_importances_
Example #24
    def processing_relieff(df, n_components):
        from operator import itemgetter
        from skrebate import ReliefF

        features_selected = ReliefF()
        x, y = df.drop('DX', axis=1).values, df['DX'].values

        features_selected.fit(x, y)

        relief_dict = dict(
            zip(
                df.drop('DX', axis=1).columns,
                features_selected.feature_importances_))
        top_features = dict(
            sorted(relief_dict.items(), key=itemgetter(1),
                   reverse=True)[:n_components]).keys()

        top_features = list(top_features)
        if 'DX' not in top_features:
            top_features.append('DX')

        return df[top_features], top_features
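Treating the method above as a standalone function, a hedged toy call might look like this (the 'DX' target column and all data are illustrative assumptions):

import numpy as np
import pandas as pd

rng = np.random.RandomState(4)
df = pd.DataFrame(rng.rand(250, 5), columns=['a', 'b', 'c', 'd', 'e'])
df['DX'] = rng.randint(0, 2, 250)  # hypothetical diagnosis/target column

df_top, kept = processing_relieff(df, n_components=2)
print(kept)  # two top-ranked features plus 'DX'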
Example #25
def test_relieffpercent_pipeline_missing_values():
    """Ensure that ReliefF with % neighbors works in a sklearn pipeline with missing values"""
    np.random.seed(49082)

    clf = make_pipeline(
        ReliefF(n_features_to_select=2, n_neighbors=0.1, n_jobs=-1),
        SimpleImputer(),  # sklearn's Imputer was removed in favor of SimpleImputer
        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(
        cross_val_score(
            clf, features_missing_values, labels_missing_values, cv=3)) > 0.7
Example #26
def test_relieff_pipeline_multiclass():
    """Ensure that ReliefF works in a sklearn pipeline with a multiclass endpoint"""
    np.random.seed(49082)

    clf = make_pipeline(
        ReliefF(n_features_to_select=2, n_neighbors=10, n_jobs=-1),
        SimpleImputer(),  # sklearn's Imputer was removed in favor of SimpleImputer
        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(
        cross_val_score(clf, features_multiclass, labels_multiclass,
                        cv=3)) > 0.7
Example #27
def svm_cv(X, y, rfe=True, paramgrid=None):
    """

    :param X:
    :param y:
    :param rfe:
    :param paramgrid:
    :return:
    """
    norm = preprocessing.StandardScaler()

    # feature selection
    fltr = RFE(ReliefF(), n_features_to_select=5, step=1)

    # predictive model
    model = SVC()

    # make pipeline
    pipe = make_pipeline(norm, fltr, model)

    param_grid = {
        'svc__kernel': ['rbf'],
        'svc__C': [1, 10, 10e1, 10e2, 10e3, 10e4],
        'svc__gamma': [0.1, 0.2, 0.3, 0.4, 0.5]
    } if paramgrid is None else paramgrid

    scores = ['accuracy']

    kf = KFold(n_splits=10, shuffle=True, random_state=4)
    # kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(pipe,
                           param_grid,
                           cv=kf,
                           n_jobs=2,
                           scoring=score,
                           return_train_score=False,
                           verbose=10)
        clf.fit(X, y)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print(clf.best_score_)

        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print()
Example #28
def test_relieff_pipeline_multiclass():
    """Check: Data (Multiclass Endpoint): ReliefF works in a sklearn pipeline """
    np.random.seed(49082)

    clf = make_pipeline(ReliefF(n_features_to_select=2, n_neighbors=10),
                        SimpleImputer(),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(
        cross_val_score(
            clf, features_multiclass, labels_multiclass, cv=3,
            n_jobs=-1)) > 0.7
Example #29
    def feature_selection_relief(self,
                                 feature_train,
                                 label_train,
                                 feature_test,
                                 n_features_to_select=None):
        """
        This functio is used to select the features using relief-based feature selection algorithms
        """
        from skrebate import ReliefF

        [n_sub, n_features] = np.shape(feature_train)
        if n_features_to_select is None:
            # default: keep the top 10% of features
            n_features_to_select = int(np.round(n_features / 10))

        # a float is interpreted as a fraction of the total feature count
        if isinstance(n_features_to_select, float):
            n_features_to_select = int(
                np.round(n_features * n_features_to_select))

        fs = ReliefF(n_features_to_select=n_features_to_select,
                     n_neighbors=100,
                     discrete_threshold=10,
                     verbose=True,
                     n_jobs=-1)
        fs.fit(feature_train, label_train)
        feature_train = fs.transform(feature_train)
        feature_test = fs.transform(feature_test)
        mask = fs.top_features_[:n_features_to_select]
        return feature_train, feature_test, mask, n_features
Example #30
def test_relieff_init():
    """Check: ReliefF constructor stores custom values correctly"""
    clf = ReliefF(n_features_to_select=7,
                  n_neighbors=500,
                  discrete_threshold=20,
                  verbose=True,
                  n_jobs=3)

    assert clf.n_features_to_select == 7
    assert clf.n_neighbors == 500
    assert clf.discrete_threshold == 20
    assert clf.verbose == True
    assert clf.n_jobs == 3