Example #1
0
from sklearn.ensemble import RandomForestClassifier

#######

# Sequential forward selection (forward=True, floating=False) down to
# 10 features. Candidate subsets are scored by accuracy with cv=3, using a
# 10-tree random forest that runs on all available cores (n_jobs=-1).
sfs = SequentialFeatureSelector(RandomForestClassifier(n_estimators=10,
                                                       n_jobs=-1),
                                k_features=10,
                                forward=True,
                                floating=False,
                                verbose=2,
                                scoring='accuracy',
                                cv=3)

# Fit the selector and overwrite X with the reduced feature matrix.
X = sfs.fit_transform(X, y)
"""
sfs=SequentialFeatureSelector(DecisionTreeClassifier(),k_features=10,
                                                     forward=True,floating=False,verbose=2,scoring='accuracy',cv=3)

sfs=sfs.fit(X,y)

X=sfs.fit_transform(X,y)
"""

#####################################################################

### Split the data into x_train / y_train (and test sets) to train the machine learning model
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X,
Example #2
0
def sequential(X, y, *, estimator, direction='forward', n_features=10, cv=0):
    """Sequential feature selection.

    Sequential feature selection algorithms are a family of greedy search
    algorithms that are used to reduce an initial d-dimensional feature space
    to a k-dimensional feature subspace where k < d. These algorithms remove or
    add one feature at a time based on the classifier performance until a
    feature subset of the desired size k is reached.

    Parameters
    ----------

    X : ndarray of shape (n_samples, n_features_pre)
        Feature matrix.

    y : labels, ndarray of shape (n_samples,)
        Response variables.

    estimator : object
        Classifier whose accuracy is used to score candidate feature
        subsets. Keyword-only.

    direction : string, default='forward'
        Direction of sequential model, can be 'forward' or 'backward'.

    n_features : int, default=10
        Number of features to select.

    cv : int, default=0
        Number of cross-validation folds, forwarded unchanged to the
        selector.

    Returns
    -------

    arr :  ndarray of shape (n_samples, n_features)
        Array containing features selected by the sequential models.

    Raises
    ------

    ValueError
        If ``direction`` is neither 'forward' nor 'backward'.

    Examples
    --------

    >>> import numpy as np
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from protlearn.features import aac, aaindex1, ngram
    >>> from protlearn.dimreduction import sequential
    >>> seqs = ['ARKLY', 'EERKPGL', 'PGPGEERNLY']
    >>> labels = [1., 0., 0.]
    >>> comp, _ = aac(seqs)
    >>> aaind, _ = aaindex1(seqs)
    >>> ng, _ = ngram(seqs)
    >>> features = np.concatenate([comp, aaind, ng], axis=1)
    >>> features.shape
    (3, 575)
    >>> rf = RandomForestClassifier()
    >>> reduced = sequential(features, labels, estimator=rf, n_features=10)
    >>> reduced.shape
    (3, 10)

    """

    # Reject unknown directions up front; otherwise the forward/backward flag
    # would never be bound and the call would die with an opaque
    # UnboundLocalError further down.
    if direction not in ('forward', 'backward'):
        raise ValueError(
            "direction must be 'forward' or 'backward', got {!r}".format(
                direction))

    mdl = SequentialFeatureSelector(estimator,
                                    k_features=n_features,
                                    forward=(direction == 'forward'),
                                    floating=False,
                                    verbose=0,
                                    scoring='accuracy',
                                    cv=cv)

    return mdl.fit_transform(X, y)
Example #3
0
def applyFeatureSelection(X, y, algorithm, n_components, mode, merged=False):
    """Reduce the feature (last) axis of ``X`` with a wrapper selection method.

    ``X`` and ``y`` are flattened to a 2-D sample matrix and a 1-D label
    vector, a selector is fitted on them, and the reduced matrix is reshaped
    back to the leading dimensions of ``X``.

    Parameters
    ----------
    X : ndarray
        Feature array with the features on the last axis: 3-D when
        ``merged`` is False, 4-D when ``merged`` is True.
    y : ndarray
        Labels matching the leading axes of ``X``: 2-D when ``merged`` is
        False, 3-D when ``merged`` is True.
    algorithm : estimator
        Estimator handed to the selector.
    n_components : int
        Number of features to keep.
    mode : str
        One of:

        * ``"forward"`` - mlxtend ``SequentialFeatureSelector`` (forward).
        * ``"backward_threshold"`` - sklearn ``SelectFromModel`` capped at
          ``n_components`` features.
        * ``"backward_iterate"`` - sklearn ``RFE`` (legacy, slower).
        * ``"mixed"`` - ``SelectFromModel`` first; if it keeps fewer than
          ``n_components`` features, ``RFE`` on the remaining columns
          supplies the missing ones.
    merged : bool, default=False
        Selects between the 4-D and the 3-D input layout (see ``X``).

    Returns
    -------
    new_X : ndarray
        ``X`` with its last axis reduced to the selected features.
    feature : object
        The last fitted selector. In ``"mixed"`` mode this is the ``RFE``
        instance when a top-up was needed, otherwise the
        ``SelectFromModel`` instance.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values.
    """
    # Collapse all leading axes into one sample axis.
    if merged:
        newX = np.reshape(X, (-1, X.shape[3]))
        newY = np.reshape(y, y.shape[0] * y.shape[1] * y.shape[2])
    else:
        newX = np.reshape(X, (-1, X.shape[2]))
        newY = np.reshape(y, y.shape[0] * y.shape[1])

    # different ways to select features with wrapper methods: https://stackabuse.com/applying-wrapper-methods-in-python-for-feature-selection/
    # RFE vs. SFS: https://stackoverflow.com/questions/35640168/wrapper-methods-for-feature-selection-machine-learning-in-scikit-learn
    if mode == "forward":
        # TODO: takes forever, maybe change params like n_jobs
        from mlxtend.feature_selection import SequentialFeatureSelector
        feature = SequentialFeatureSelector(algorithm,
                                            k_features=n_components,
                                            forward=True)
    elif mode in ("backward_threshold", "mixed"):
        from sklearn.feature_selection import SelectFromModel
        feature = SelectFromModel(algorithm, max_features=n_components)
    elif mode == "backward_iterate":
        # legacy option, way slower than with a threshold
        from sklearn.feature_selection import RFE
        feature = RFE(algorithm, n_features_to_select=n_components)
    else:
        # format() instead of "+" so a non-string mode (e.g. None) still
        # produces the intended ValueError rather than a TypeError.
        raise ValueError("Unknown feature selection mode {}".format(mode))

    new_X = feature.fit_transform(newX, newY)

    if mode == 'mixed':
        # Columns the threshold step did not keep. Columns are matched by
        # value, so an unselected column identical to a selected one is
        # treated as already present.
        selected_cols = new_X.transpose()
        unused_cols = [
            col for col in newX.transpose()
            if not any((col == kept).all() for kept in selected_cols)
        ]

        # Only top up when features are actually missing and candidates
        # remain; RFE cannot be asked for zero features or be fitted on an
        # empty matrix.
        n_missing = n_components - len(selected_cols)
        if n_missing > 0 and unused_cols:
            from sklearn.feature_selection import RFE

            # n_features_to_select means that this many features will REMAIN
            # afterwards, not that they are selected for removal
            feature = RFE(algorithm, n_features_to_select=n_missing)
            add_me = feature.fit_transform(
                np.array(unused_cols).transpose(), newY)
            # merge
            new_X = np.concatenate((new_X, add_me), axis=1)

    # Restore the original leading axes; the last axis now holds only the
    # selected features.
    if merged:
        new_X = np.reshape(new_X, (X.shape[0], X.shape[1], X.shape[2], -1))
    else:
        new_X = np.reshape(new_X, (X.shape[0], X.shape[1], -1))

    return new_X, feature
# Build a DataFrame from the dataset's feature matrix, labelled with the
# dataset's own feature names.
# NOTE(review): `data` is defined outside this excerpt — presumably a
# scikit-learn Bunch such as load_iris(); confirm.
new_data = pd.DataFrame(data["data"], columns=data["feature_names"])
print(new_data)

target = data.target
print(target)

# Append the labels as a "target" column next to the features.
new_data = pd.concat(
    [new_data, pd.DataFrame(target, columns=["target"])], axis=1)
print(new_data)

# Replace the original column labels with short snake_case names.
# NOTE(review): " petal_width" carries a leading space — likely a typo, but
# renaming it would change the column label other code may rely on; confirm.
new_data.columns = [
    "sepal_length", "sepal_width", "petal_length", " petal_width", "target"
]
print(new_data)

# Split into the feature frame X and the label series y.
X = new_data.drop(columns=["target"])
y = new_data["target"]
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
# Backward selection: start from all features and eliminate down to 2.
# forward=False must be passed explicitly; the selector defaults to forward
# selection, which would make this identical to the forward run below.
sbs = SFS(LinearRegression(), k_features=2, forward=False)
bckwrd = sbs.fit(X, y)

print(bckwrd.k_feature_names_)

# Forward selection: start from no features and add until 2 are selected.
sfs = SFS(LinearRegression(), k_features=2, forward=True)
forward_slection = sfs.fit_transform(X, y)

print(sfs.k_feature_names_)